diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..a81c8ee1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/Dockerfile b/Dockerfile
index c4b67626..0e4b7500 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,5 +9,7 @@ FROM ${FROM_IMAGE_NAME}
 ADD requirements.txt .
 RUN pip install -r requirements.txt
 
+RUN pip install torch==1.3.1
+
 WORKDIR /code
 ADD . .
diff --git a/README.md b/README.md
index 8cc3c108..0dd02696 100644
--- a/README.md
+++ b/README.md
@@ -311,23 +311,24 @@ Benchmarking
       ./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"]
     ```
     - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here
-    [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt)   
+
+    [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt)
 
 <img src="./terabyte_0875_loss_accuracy_plots.png" width="900" height="320">
 
 *NOTE: Benchmarking scripts accept extra arguments which will be passed along to the model, such as --num-batches=100 to limit the number of data samples*
 
-4) The code supports interface with [MLPerf benchmark](https://mlperf.org). 
+4) The code supports interface with [MLPerf benchmark](https://mlperf.org).
    - Please refer to the following training parameters
    ```
      --mlperf-logging that keeps track of multiple metrics, including area under the curve (AUC)
-   
+
      --mlperf-acc-threshold that allows early stopping based on accuracy metric
-   
+
      --mlperf-auc-threshold that allows early stopping based on AUC metric
-   
+
      --mlperf-bin-loader that enables preprocessing of data into a single binary file
-   
+
      --mlperf-bin-shuffle that controls whether a random shuffle of mini-batches is performed
    ```
    - The MLPerf training model is completely specified and can be trained using the following script
@@ -367,6 +368,8 @@ pydot (*optional*)
 
 torchviz (*optional*)
 
+tqdm
+
 
 License
 -------
diff --git a/README.params.md b/README.params.md
new file mode 100644
index 00000000..86a43f71
--- /dev/null
+++ b/README.params.md
@@ -0,0 +1,51 @@
+
+# DLRM Distributed Branch
+
+Extends the PyTorch implementation to run DLRM across multiple nodes on distributed platforms.
+The distributed version is needed when the data or the model becomes large.
+
+It inherits all the parameters of the master DLRM implementation.
+The distributed version adds one more parameter:
+
+**--dist-backend**:
+   The backend support for the distributed version. As in torch.distributed package,
+   it can be "nccl", "mpi", and "gloo".
+
+In addition, it introduces the following new parameter:
+**--arch-project-size**:
+   Reduces the number of interaction features produced by the dot operation.
+   A projection is applied to the dotted features to reduce their dimension.
+   This is mainly motivated by memory: it reduces the memory needed by the top MLP.
+   A side effect is that it may also improve the model accuracy.
+
+## Usage
+
+Currently, it is launched with mpirun on multiple nodes. A hostfile needs to be created, or
+a host list should be given. The DLRM parameters should be given in the same way as for the
+single-node master branch.
+```bash
+mpirun -np 128 -hostfile hostfile python dlrm_s_pytorch.py ...
+```
+
+## Example
+```bash
+python dlrm_s_pytorch.py \
+   --arch-sparse-feature-size=128 \
+   --arch-mlp-bot="2000-1024-1024-128" \
+   --arch-mlp-top="4096-4096-4096-1" \
+   --arch-embedding-size=$large_arch_emb \
+   --data-generation=random \
+   --loss-function=bce \
+   --round-targets=True \
+   --learning-rate=0.1 \
+   --mini-batch-size=2048 \
+   --print-freq=10240 \
+   --print-time \
+   --test-mini-batch-size=16384 \
+   --test-num-workers=16 \
+   --num-indices-per-lookup-fixed=1 \
+   --num-indices-per-lookup=100 \
+   --arch-projection-size 30 \
+   --use-gpu
+```
+
diff --git a/data_loader_terabyte.py b/data_loader_terabyte.py
index b520fc96..d1a38efa 100644
--- a/data_loader_terabyte.py
+++ b/data_loader_terabyte.py
@@ -325,7 +325,7 @@ def _test_bin():
 
     original_dataset = CriteoDataset(
         dataset='terabyte',
-        max_ind_range=-1,
+        max_ind_range=10 * 1000 * 1000,
         sub_sample_rate=1,
         randomize=True,
         split=args.split,
diff --git a/dlrm_data.py b/dlrm_data.py
new file mode 100644
index 00000000..28afd8da
--- /dev/null
+++ b/dlrm_data.py
@@ -0,0 +1,277 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Description: delivering inputs and targets for the dlrm benchmark
+# The inputs and outputs are used according to the following two option(s)
+# 1) random distribution, generated and loaded based on uniform distribution
+# 2) synthetic data, the synthetic pre-generated data would be loaded.
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+import sys
+import numpy as np
+from numpy import random as ra
+import torch
+from torch.utils.data import Dataset  # , RandomSampler
+
+
+class RandomDataset(Dataset):
+    """Random-data dataset; each item is one whole mini-batch (X, lS_o, lS_i, T)."""
+    def __init__(
+            self,
+            m_den,
+            ln_emb,
+            data_size,
+            num_batches,
+            mini_batch_size,
+            num_indices_per_lookup,
+            num_indices_per_lookup_fixed,
+            num_targets=1,
+            round_targets=False,
+            data_generation="random",
+            trace_file="",
+            enable_padding=False,
+            reset_seed_on_access=False,
+            rand_seed=0
+    ):
+        # compute the number of batches
+        # (a non-zero num_batches overrides data_size)
+        nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
+        if num_batches != 0:
+            nbatches = num_batches
+            data_size = nbatches * mini_batch_size
+
+        # save args (data_size may have been recomputed above)
+        self.m_den = m_den
+        self.ln_emb = ln_emb
+        self.data_size = data_size
+        self.num_batches = nbatches
+        self.mini_batch_size = mini_batch_size
+        self.num_indices_per_lookup = num_indices_per_lookup
+        self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed
+        self.num_targets = num_targets
+        self.round_targets = round_targets
+        self.data_generation = data_generation
+        self.trace_file = trace_file
+        self.enable_padding = enable_padding
+        self.reset_seed_on_access = reset_seed_on_access
+        self.rand_seed = rand_seed
+
+    def reset_numpy_seed(self, numpy_rand_seed):
+        """Re-seed numpy's global random number generator."""
+        np.random.seed(numpy_rand_seed)
+
+    def __getitem__(self, index):
+        """Return mini-batch `index`, or a list of mini-batches for a slice."""
+        if isinstance(index, slice):
+            # slice.indices() handles stop=0, negative bounds and clamping
+            return [self[idx] for idx in range(*index.indices(len(self)))]
+
+        if self.data_generation != "random":
+            # fail early instead of hitting unbound X/lS_o/lS_i below
+            raise ValueError(f"unsupported data_generation: {self.data_generation!r}")
+
+        # WARNING: reset seed on access to first element
+        # (e.g. if same random samples needed across epochs)
+        if self.reset_seed_on_access and index == 0:
+            self.reset_numpy_seed(self.rand_seed)
+
+        # number of data points in a batch (the last batch may be shorter)
+        n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size))
+
+        # generate a batch of dense and sparse features
+        (X, lS_o, lS_i) = generate_uniform_input_batch(
+            self.m_den,
+            self.ln_emb,
+            n,
+            self.num_indices_per_lookup,
+            self.num_indices_per_lookup_fixed
+        )
+
+        # generate a batch of target (probability of a click)
+        T = generate_random_output_batch(n, self.num_targets, self.round_targets)
+
+        return (X, lS_o, lS_i, T)
+
+    def __len__(self):
+        # WARNING: note that we produce batches of outputs in __getitem__
+        # therefore we should use num_batches rather than data_size below
+        return self.num_batches
+
+
+def collate_wrapper_random(list_of_tuples):
+    """Unwrap the first pre-batched sample and stack its offset tensors."""
+    # only the first tuple is read (the loaders in this file use batch_size=1);
+    # each tuple is (X, lS_o, lS_i, T)
+    dense, offsets, indices, targets = list_of_tuples[0]
+    stacked_offsets = torch.stack(offsets)
+    return (dense, stacked_offsets, indices, targets)
+
+
+def make_random_data_and_loader(args, ln_emb, m_den):
+    """Build the random-data dataset and its DataLoader from the parsed args."""
+    # WARNING: the dataset generates a batch of lookups at once
+    dataset_kwargs = dict(
+        m_den=m_den,
+        ln_emb=ln_emb,
+        data_size=args.data_size,
+        num_batches=args.num_batches,
+        mini_batch_size=args.mini_batch_size,
+        num_indices_per_lookup=args.num_indices_per_lookup,
+        num_indices_per_lookup_fixed=args.num_indices_per_lookup_fixed,
+        num_targets=1,
+        round_targets=args.round_targets,
+        data_generation=args.data_generation,
+        trace_file=args.data_trace_file,
+        enable_padding=args.data_trace_enable_padding,
+        reset_seed_on_access=True,
+        rand_seed=args.numpy_rand_seed,
+    )
+    train_data = RandomDataset(**dataset_kwargs)
+
+    # batch_size=1 because every dataset item is already a full mini-batch
+    loader_kwargs = dict(
+        batch_size=1, shuffle=False, num_workers=args.num_workers,
+        collate_fn=collate_wrapper_random, pin_memory=False, drop_last=False,
+    )
+    train_loader = torch.utils.data.DataLoader(train_data, **loader_kwargs)
+
+    return train_data, train_loader
+
+
+def generate_random_output_batch(n, num_targets, round_targets=False):
+    """Return an (n, num_targets) float32 tensor of random click targets."""
+    # draws from numpy's rand(); optionally rounded to whole numbers
+    probs = ra.rand(n, num_targets).astype(np.float32)
+    if round_targets:
+        probs = np.round(probs).astype(np.float32)
+
+    return torch.tensor(probs)
+
+
+# uniform distribution (input data)
+def generate_uniform_input_batch(
+    m_den,
+    ln_emb,
+    n,
+    num_indices_per_lookup,
+    num_indices_per_lookup_fixed,
+):
+    """Generate n random samples: dense features plus per-table sparse lookups.
+
+    Returns (Xt, lS_emb_offsets, lS_emb_indices), where Xt is an (n, m_den)
+    float32 tensor and each list holds one tensor per entry of ln_emb.
+    """
+    # dense feature
+    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
+
+    # sparse feature (sparse indices)
+    lS_emb_offsets = []
+    lS_emb_indices = []
+    # for each embedding generate a list of n lookups,
+    # where each lookup is composed of multiple sparse indices
+    for size in ln_emb:
+        lS_batch_offsets = []
+        lS_batch_indices = []
+        offset = 0
+        for _ in range(n):
+            if num_indices_per_lookup_fixed:
+                sparse_group_size = np.int64(num_indices_per_lookup)
+            else:
+                # random size of at least 1, based on min(size, num_indices_per_lookup);
+                # a scalar draw avoids comparing a list with an array
+                r = ra.random()
+                max_size = min(size, num_indices_per_lookup)
+                sparse_group_size = np.int64(np.round(max(1.0, r * max_size)))
+            # sparse indices to be used per embedding
+            r = ra.random(sparse_group_size)
+            sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))
+            # duplicates were removed, so advance by the actual group size
+            lS_batch_offsets.append(offset)
+            lS_batch_indices.extend(sparse_group.tolist())
+            offset += int(sparse_group.size)
+        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
+        lS_emb_indices.append(torch.tensor(lS_batch_indices))
+
+    return (Xt, lS_emb_offsets, lS_emb_indices)
+
+
+class SyntheticDataset(Dataset):
+    """Serve mini-batches sliced from pre-generated tensors loaded from disk."""
+    def __init__(
+        self,
+        mini_batch_size,
+        ln_emb,
+        nbatches=1,
+        synthetic_data_folder="./synthetic_data/syn_data_bs65536/",
+    ):
+        self.synthetic_data_folder = synthetic_data_folder
+        self.num_batches = nbatches
+        self.mini_batch_size = mini_batch_size
+        self.ln_emb = ln_emb
+
+        # NOTE(review): torch.load unpickles its input; only load trusted folders
+        self.X = torch.load(f"{self.synthetic_data_folder}/X_0.pt")
+        self.lS_o = torch.load(f"{self.synthetic_data_folder}/lS_o_0.pt")
+        self.lS_i = torch.load(f"{self.synthetic_data_folder}/lS_i_0.pt")
+        self.T = torch.load(f"{self.synthetic_data_folder}/T_0.pt")
+
+    def __getitem__(self, index):
+        """Return batch `index` as (X, lS_o, lS_i, T), wrapping around the data."""
+        batches_in_data = len(self.X) // self.mini_batch_size
+        if batches_in_data == 0:
+            # the modulo below would otherwise raise ZeroDivisionError
+            sys.exit(f' mini_batch_size({self.mini_batch_size}) has to be'
+               f' no larger than size of data({len(self.X)})'
+            )
+        # modulo out index for reuse
+        index = index % batches_in_data
+        sInd = index * self.mini_batch_size
+        eInd = sInd + self.mini_batch_size
+        X = self.X[sInd:eInd]
+        # rebase the offsets so that every batch starts at 0
+        lS_o = [i[sInd:eInd] - i[sInd] for i in self.lS_o]
+
+        if eInd < len(self.lS_o[0]):
+            lS_i = [val[self.lS_o[ind][sInd]:self.lS_o[ind][eInd]] for ind, val in enumerate(self.lS_i)]
+        else:
+            # no offset exists at eInd: take all remaining indices
+            lS_i = [val[self.lS_o[ind][sInd]:] for ind, val in enumerate(self.lS_i)]
+
+        T = self.T[sInd:eInd]
+        return (X, lS_o, lS_i, T)
+
+    def __len__(self):
+        return self.num_batches
+
+
+def synthetic_data_loader(args, ln_emb, m_den):
+    """Build the synthetic dataset and its DataLoader; m_den is not used."""
+    dataset = SyntheticDataset(
+        mini_batch_size=args.mini_batch_size,
+        ln_emb=ln_emb,
+        nbatches=args.num_batches,
+        synthetic_data_folder=args.synthetic_data_folder,
+    )
+
+    # every dataset item is already a whole mini-batch, hence batch_size=1
+    loader_options = dict(
+        batch_size=1,
+        shuffle=False,
+        num_workers=args.num_workers,
+        collate_fn=collate_wrapper_random,
+        pin_memory=False,
+        drop_last=False,
+    )
+    loader = torch.utils.data.DataLoader(dataset, **loader_options)
+    return dataset, loader
+
+
+def data_loader(args, ln_emb, m_den):
+    """Build (dataset, loader) with the builder chosen by args.data_generation."""
+    # supported values are the keys below; any other value raises KeyError
+    builders = {"random": make_random_data_and_loader,
+                "synthetic": synthetic_data_loader}
+    train_data, train_ld = builders[args.data_generation](args, ln_emb, m_den)
+    return train_data, train_ld
diff --git a/dlrm_data_pytorch.py b/dlrm_data_pytorch.py
index 6cbe382a..1d9845ba 100644
--- a/dlrm_data_pytorch.py
+++ b/dlrm_data_pytorch.py
@@ -34,7 +34,7 @@
 import torch
 from torch.utils.data import Dataset, RandomSampler
 
-import data_loader_terabyte
+## import data_loader_terabyte
 
 
 # Kaggle Display Advertising Challenge Dataset
@@ -537,6 +537,13 @@ def __init__(
             trace_file="",
             enable_padding=False,
             reset_seed_on_access=False,
+            
+            rand_data_dist="uniform",
+            rand_data_min=1,
+            rand_data_max=1,
+            rand_data_mu=-1,
+            rand_data_sigma=1,
+
             rand_seed=0
     ):
         # compute batch size
@@ -561,6 +568,12 @@ def __init__(
         self.enable_padding = enable_padding
         self.reset_seed_on_access = reset_seed_on_access
         self.rand_seed = rand_seed
+ 
+        self.rand_data_dist = rand_data_dist
+        self.rand_data_min = rand_data_min
+        self.rand_data_max = rand_data_max
+        self.rand_data_mu = rand_data_mu
+        self.rand_data_sigma = rand_data_sigma
 
     def reset_numpy_seed(self, numpy_rand_seed):
         np.random.seed(numpy_rand_seed)
@@ -585,12 +598,18 @@ def __getitem__(self, index):
 
         # generate a batch of dense and sparse features
         if self.data_generation == "random":
-            (X, lS_o, lS_i) = generate_uniform_input_batch(
+            (X, lS_o, lS_i) = generate_dist_input_batch(
                 self.m_den,
                 self.ln_emb,
                 n,
                 self.num_indices_per_lookup,
-                self.num_indices_per_lookup_fixed
+                self.num_indices_per_lookup_fixed,
+	
+                rand_data_dist=self.rand_data_dist,
+                rand_data_min=self.rand_data_min,
+                rand_data_max=self.rand_data_max,
+                rand_data_mu=self.rand_data_mu,
+                rand_data_sigma=self.rand_data_sigma,
             )
         elif self.data_generation == "synthetic":
             (X, lS_o, lS_i) = generate_synthetic_input_batch(
@@ -778,6 +797,67 @@ def generate_uniform_input_batch(
     return (Xt, lS_emb_offsets, lS_emb_indices)
 
 
+# random data from uniform or gaussian distribution (input data)
+def generate_dist_input_batch(
+    m_den,
+    ln_emb,
+    n,
+    num_indices_per_lookup,
+    num_indices_per_lookup_fixed,
+    rand_data_dist,
+    rand_data_min,
+    rand_data_max,
+    rand_data_mu,
+    rand_data_sigma,
+):
+    """Generate n samples: dense features plus sparse indices per table.
+
+    rand_data_dist selects "uniform" (indices in [0, size - 1]) or "gaussian"
+    (normal draws clipped to [rand_data_min, rand_data_max], not to the table
+    size); any other value raises ValueError. Returns (Xt, offsets, indices).
+    """
+    if rand_data_dist not in ("uniform", "gaussian"):
+        raise ValueError("%s distribution is not supported. "
+                         "please select uniform or gaussian" % (rand_data_dist,))
+    if rand_data_dist == "gaussian" and rand_data_mu == -1:
+        # mu == -1 means: centre the gaussian on the [min, max] range
+        rand_data_mu = (rand_data_max + rand_data_min) / 2.0
+
+    # dense feature
+    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))
+
+    # sparse feature (sparse indices)
+    lS_emb_offsets = []
+    lS_emb_indices = []
+    for size in ln_emb:
+        lS_batch_offsets = []
+        lS_batch_indices = []
+        offset = 0
+        for _ in range(n):
+            if num_indices_per_lookup_fixed:
+                sparse_group_size = np.int64(num_indices_per_lookup)
+            else:
+                # random size of at least 1, based on min(size, num_indices_per_lookup)
+                r = ra.random()
+                max_size = min(size, num_indices_per_lookup)
+                sparse_group_size = np.int64(np.round(max(1.0, r * max_size)))
+            if rand_data_dist == "gaussian":
+                r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size)
+                r = np.clip(r, rand_data_min, rand_data_max)
+                # cast before unique so equal integer indices are merged
+                sparse_group = np.unique(r.astype(np.int64))
+            else:
+                r = ra.random(sparse_group_size)
+                sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))
+            # duplicates were removed, so advance by the actual group size
+            lS_batch_offsets.append(offset)
+            lS_batch_indices.extend(sparse_group.tolist())
+            offset += int(sparse_group.size)
+        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
+        lS_emb_indices.append(torch.tensor(lS_batch_indices))
+
+    return (Xt, lS_emb_offsets, lS_emb_indices)
+
 # synthetic distribution (input data)
 def generate_synthetic_input_batch(
     m_den,
@@ -813,7 +893,7 @@ def generate_synthetic_input_batch(
             # sparse indices to be used per embedding
             file_path = trace_file
             line_accesses, list_sd, cumm_sd = read_dist_from_file(
-                file_path.replace("j", str(i))
+                file_path.replace("j", str(0))
             )
             # debug prints
             # print("input")
@@ -1007,7 +1087,7 @@ def read_dist_from_file(file_path):
         with open(file_path, "r") as f:
             lines = f.read().splitlines()
     except Exception:
-        print("Wrong file or file path")
+        print("Wrong file or file path in read: ", file_path)
     # read unique accesses
     unique_accesses = [int(el) for el in lines[0].split(", ")]
     # read cumulative distribution (elements are passed as two separate lists)
@@ -1030,7 +1110,7 @@ def write_dist_to_file(file_path, unique_accesses, list_sd, cumm_sd):
             s = str(cumm_sd)
             f.write(s[1 : len(s) - 1] + "\n")
     except Exception:
-        print("Wrong file or file path")
+        print("Wrong file or file path in write: ", file_path)
 
 
 if __name__ == "__main__":
diff --git a/dlrm_profile.py b/dlrm_profile.py
new file mode 100644
index 00000000..152dce83
--- /dev/null
+++ b/dlrm_profile.py
@@ -0,0 +1,1595 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Description: an implementation of a deep learning recommendation model (DLRM)
+# The model input consists of dense and sparse features. The former is a vector
+# of floating point values. The latter is a list of sparse indices into
+# embedding tables, which consist of vectors of floating point values.
+# The selected vectors are passed to mlp networks denoted by triangles,
+# in some cases the vectors are interacted through operators (Ops).
+#
+# output:
+#                         vector of values
+# model:                        |
+#                              /\
+#                             /__\
+#                               |
+#       _____________________> Op  <___________________
+#     /                         |                      \
+#    /\                        /\                      /\
+#   /__\                      /__\           ...      /__\
+#    |                          |                       |
+#    |                         Op                      Op
+#    |                    ____/__\_____           ____/__\____
+#    |                   |_Emb_|____|__|    ...  |_Emb_|__|___|
+# input:
+# [ dense features ]     [sparse indices] , ..., [sparse indices]
+#
+# More precise definition of model layers:
+# 1) fully connected layers of an mlp
+# z = f(y)
+# y = Wx + b
+#
+# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
+# z = Op(e1,...,ek)
+# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
+#
+# 3) Operator Op can be one of the following
+# Sum(e1,...,ek) = e1 + ... + ek
+# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
+# Cat(e1,...,ek) = [e1', ..., ek']'
+# where ' denotes transpose operation
+#
+# References:
+# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
+# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
+# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
+# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
+# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
+# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
+# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# miscellaneous
+import builtins
+import functools
+# import bisect
+# import shutil
+import time
+import json
+# data generation
+import dlrm_data_pytorch as dp
+
+# numpy
+import numpy as np
+import socket
+
+# onnx
+# The onnx import causes deprecation warnings every time workers
+# are spawned during testing. So, we filter out those warnings.
+import warnings
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=DeprecationWarning)
+## import onnx
+
+# pytorch
+import torch
+from torch import onnx
+import torch.nn as nn
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.scatter_gather import gather, scatter
+
+# For distributed run
+import extend_distributed as ext_dist
+
+# quotient-remainder trick
+from tricks.qr_embedding_bag import QREmbeddingBag
+# mixed-dimension trick
+from tricks.md_embedding_bag import PrEmbeddingBag, md_solver
+
+import sklearn.metrics
+
+import uuid
+import project
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+import dlrm_data as dd
+
+# Add dlrm self profiling timers
+import profile as tm
+# import pyprof
+# pyprof.init()  # causing errors, some symbols not found
+
+# import synthetic_data_loader as fb_syn_data
+
+# from torchviz import make_dot
+# import torch.nn.functional as Functional
+# from torch.nn.parameter import Parameter
+
+from torch.optim.lr_scheduler import _LRScheduler
+
+exc = getattr(builtins, "IOError", "FileNotFoundError")
+
+class LRPolicyScheduler(_LRScheduler):
+    """Linear warmup, optional hold, then quadratic decay of the learning rate."""
+    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
+        self.num_warmup_steps = num_warmup_steps
+        self.decay_start_step = decay_start_step
+        self.decay_end_step = decay_start_step + num_decay_steps
+        self.num_decay_steps = num_decay_steps
+        self.last_lr = None  # set once warmup or decay has computed a rate
+        if self.decay_start_step < self.num_warmup_steps:
+            import sys  # local import: keeps this class self-contained
+            sys.exit("Learning rate warmup must finish before the decay starts")
+        super(LRPolicyScheduler, self).__init__(optimizer)
+
+    def get_lr(self):
+        """Return the learning rate of every param group for the current step."""
+        step_count = self._step_count
+        if step_count < self.num_warmup_steps:
+            # warmup
+            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
+            lr = [base_lr * scale for base_lr in self.base_lrs]
+            self.last_lr = lr
+        elif self.decay_start_step <= step_count < self.decay_end_step:
+            # decay
+            decayed_steps = step_count - self.decay_start_step
+            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
+            min_lr = 0.0000001
+            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
+            self.last_lr = lr
+        elif self.num_decay_steps > 0 and self.last_lr is not None:
+            # freeze at last: after decay, or between warmup and decay
+            lr = self.last_lr
+        else:
+            # do not adjust (also when neither warmup nor decay has run yet)
+            lr = self.base_lrs
+        return lr
+
+### define dlrm in PyTorch ###
+class DLRM_Net(nn.Module):
+    def create_mlp(self, ln, sigmoid_layer):
+        """Build an MLP whose layer widths are consecutive entries of ln.
+
+        Args:
+            ln: numpy array of layer widths, input width first.
+            sigmoid_layer: index of the Linear layer that is followed by
+                Sigmoid; every other Linear layer is followed by ReLU.
+
+        Returns:
+            torch.nn.Sequential alternating Linear and activation modules.
+        """
+        # build MLP layer by layer
+        modules = nn.ModuleList()
+        for idx in range(ln.size - 1):
+            fan_in, fan_out = ln[idx], ln[idx + 1]
+            linear = nn.Linear(int(fan_in), int(fan_out), bias=True)
+
+            # custom Xavier-style fill drawn from numpy: weights use the
+            # two-sided variance 2 / (fan_in + fan_out), biases use 1 / fan_out
+            w_std = np.sqrt(2 / (fan_out + fan_in))
+            weights = np.random.normal(0.0, w_std, size=(fan_out, fan_in))
+            b_std = np.sqrt(1 / fan_out)
+            biases = np.random.normal(0.0, b_std, size=fan_out)
+
+            # overwrite the default initialisation with the numpy draws
+            linear.weight.data = torch.tensor(
+                weights.astype(np.float32), requires_grad=True
+            )
+            linear.bias.data = torch.tensor(
+                biases.astype(np.float32), requires_grad=True
+            )
+            modules.append(linear)
+
+            # activation: sigmoid only after the designated layer
+            activation = nn.Sigmoid() if idx == sigmoid_layer else nn.ReLU()
+            modules.append(activation)
+
+        # wrap all layers in a Sequential container
+        return torch.nn.Sequential(*modules)
+
+    def create_emb(self, m, ln):
+        """Create one embedding table per entry of ln and return them as a list.
+
+        numpy is re-seeded from self.l_emb_seeds[i] before each table is
+        built; the global numpy random state is saved first and restored
+        afterwards. When ext_dist.my_size > 1, only the tables whose index
+        is in self.local_emb_indices are created.
+        """
+        tables = nn.ModuleList()
+        # save the numpy random state
+        saved_state = np.random.get_state()
+        for idx in range(ln.size):
+            # distributed run: skip tables not listed in local_emb_indices
+            if ext_dist.my_size > 1 and idx not in self.local_emb_indices:
+                continue
+            # Use per table random seed for Embedding initialization
+            np.random.seed(self.l_emb_seeds[idx])
+            rows = ln[idx]
+            if self.qr_flag and rows > self.qr_threshold:
+                # quotient-remainder embedding above the row threshold
+                table = QREmbeddingBag(
+                    rows, m, self.qr_collisions,
+                    operation=self.qr_operation, mode="sum", sparse=True,
+                )
+            elif self.md_flag:
+                # mixed-dimension embedding: m is indexed per table here
+                base = max(m)
+                dim = m[idx] if rows > self.md_threshold else base
+                table = PrEmbeddingBag(rows, dim, base)
+                # use np initialization as below for consistency...
+                bound = np.sqrt(1 / rows)
+                init = np.random.uniform(low=-bound, high=bound, size=(rows, dim))
+                table.embs.weight.data = torch.tensor(
+                    init.astype(np.float32), requires_grad=True
+                )
+            else:
+                # plain EmbeddingBag, uniform in [-sqrt(1/rows), sqrt(1/rows))
+                bound = np.sqrt(1 / rows)
+                init = np.random.uniform(low=-bound, high=bound, size=(rows, m))
+                table = nn.EmbeddingBag(
+                    rows, m, mode="sum", sparse=True,
+                    _weight=torch.tensor(init.astype(np.float32), requires_grad=True),
+                )
+
+            # tables outside local_emb_indices were skipped above, so every
+            # table that reaches this point is kept
+            tables.append(table)
+
+        # Restore the numpy random state
+        np.random.set_state(saved_state)
+        return tables
+
+    def __init__(
+        self,
+        m_spa=None,
+        ln_emb=None,
+        ln_bot=None,
+        ln_top=None,
+        proj_size = 0,
+        arch_interaction_op=None,
+        arch_interaction_itself=False,
+        sigmoid_bot=-1,
+        sigmoid_top=-1,
+        sync_dense_params=True,
+        loss_threshold=0.0,
+        ndevices=-1,
+        qr_flag=False,
+        qr_operation="mult",
+        qr_collisions=0,
+        qr_threshold=200,
+        md_flag=False,
+        md_threshold=200,
+    ):
+        super(DLRM_Net, self).__init__()
+
+        if (
+            (m_spa is not None)
+            and (ln_emb is not None)
+            and (ln_bot is not None)
+            and (ln_top is not None)
+            and (arch_interaction_op is not None)
+        ):
+
+            # save arguments
+            self.proj_size = proj_size
+            self.ndevices = ndevices
+            self.output_d = 0
+            self.parallel_model_batch_size = -1
+            self.parallel_model_is_not_prepared = True
+            self.arch_interaction_op = arch_interaction_op
+            self.arch_interaction_itself = arch_interaction_itself
+            self.sync_dense_params = sync_dense_params
+            self.loss_threshold = loss_threshold
+            # create variables for QR embedding if applicable
+            self.qr_flag = qr_flag
+            if self.qr_flag:
+                self.qr_collisions = qr_collisions
+                self.qr_operation = qr_operation
+                self.qr_threshold = qr_threshold
+            # create variables for MD embedding if applicable
+            self.md_flag = md_flag
+            if self.md_flag:
+                self.md_threshold = md_threshold
+
+            # generate np seeds for Emb table initialization
+            self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))
+
+            #If running distributed, get local slice of embedding tables
+            if ext_dist.my_size > 1:
+                n_emb = len(ln_emb)
+                self.n_global_emb = n_emb
+                self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb)
+                self.local_emb_slice = ext_dist.get_my_slice(n_emb)
+                self.local_emb_indices = list(range(n_emb))[self.local_emb_slice]
+                #ln_emb = ln_emb[self.local_emb_slice]
+
+            # create operators
+            if ndevices <= 1:
+                self.emb_l = self.create_emb(m_spa, ln_emb)
+            self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
+            self.top_l = self.create_mlp(ln_top, sigmoid_top)
+            if (proj_size > 0):
+                self.proj_l = project.create_proj(len(ln_emb)+1, proj_size)
+
+    def apply_mlp(self, x, layers):
+        # approach 1: use ModuleList
+        # for layer in layers:
+        #     x = layer(x)
+        # return x
+        # approach 2: use Sequential container to wrap all layers
+        return layers(x)
+
+    def apply_proj(self, x, layers):
+        # approach 1: use ModuleList
+        # for layer in layers:
+        #     x = layer(x)
+        # return x
+        # approach 2: use Sequential container to wrap all layers
+        return layers(x)
+
+    def apply_emb(self, lS_o, lS_i, emb_l):
+        # WARNING: notice that we are processing the batch at once. We implicitly
+        # assume that the data is laid out such that:
+        # 1. each embedding is indexed with a group of sparse indices,
+        #   corresponding to a single lookup
+        # 2. for each embedding the lookups are further organized into a batch
+        # 3. for a list of embedding tables there is a list of batched lookups
+
+        ly = []
+        for k, sparse_index_group_batch in enumerate(lS_i):
+            sparse_offset_group_batch = lS_o[k]
+
+            # embedding lookup
+            # We are using EmbeddingBag, which implicitly uses sum operator.
+            # The embeddings are represented as tall matrices, with sum
+            # happening vertically across 0 axis, resulting in a row vector
+            E = emb_l[k]
+            V = E(sparse_index_group_batch, sparse_offset_group_batch)
+
+            ly.append(V)
+
+        # print(ly)
+        return ly
+
+    def interact_features(self, x, ly):
+        if self.arch_interaction_op == "dot":
+            # concatenate dense and sparse features
+            (batch_size, d) = x.shape
+            T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
+            # perform a dot product
+            if (self.proj_size > 0):
+                R = project.project(T, x, self.proj_l)
+                #TT = torch.transpose(T, 1, 2)
+                #TS = torch.reshape(TT, (-1, TT.size(2)))
+                #TC = self.apply_mlp(TS, self.proj_l)
+                #TR = torch.reshape(TC, (-1, d ,self.proj_size))
+                #Z  = torch.bmm(T, TR)
+                #Zflat = Z.view((batch_size, -1))
+                #R = torch.cat([x] + [Zflat], dim=1)
+            else:
+                Z = torch.bmm(T, torch.transpose(T, 1, 2))
+                # append dense feature with the interactions (into a row vector)
+                # approach 1: all
+                # Zflat = Z.view((batch_size, -1))
+                # approach 2: unique
+                _, ni, nj = Z.shape
+                # approach 1: tril_indices
+                # offset = 0 if self.arch_interaction_itself else -1
+                # li, lj = torch.tril_indices(ni, nj, offset=offset)
+                # approach 2: custom
+                offset = 1 if self.arch_interaction_itself else 0
+                li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
+                lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])
+                Zflat = Z[:, li, lj]
+                # concatenate dense features and interactions
+                R = torch.cat([x] + [Zflat], dim=1)
+        elif self.arch_interaction_op == "cat":
+            # concatenation features (into a row vector)
+            R = torch.cat([x] + ly, dim=1)
+        else:
+            sys.exit(
+                "ERROR: --arch-interaction-op="
+                + self.arch_interaction_op
+                + " is not supported"
+            )
+
+        return R
+
+    def forward(self, dense_x, lS_o, lS_i):
+        if ext_dist.my_size > 1:
+            return self.distributed_forward(dense_x, lS_o, lS_i)
+        elif self.ndevices <= 1:
+            return self.sequential_forward(dense_x, lS_o, lS_i)
+        else:
+            return self.parallel_forward(dense_x, lS_o, lS_i)
+
+    def sequential_forward(self, dense_x, lS_o, lS_i):
+        # process dense features (using bottom mlp), resulting in a row vector
+        x = self.apply_mlp(dense_x, self.bot_l)
+        # debug prints
+        # print("intermediate")
+        # print(x.detach().cpu().numpy())
+
+        # process sparse features(using embeddings), resulting in a list of row vectors
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
+        # for y in ly:
+        #     print(y.detach().cpu().numpy())
+
+        # interact features (dense and sparse)
+        z = self.interact_features(x, ly)
+        # print(z.detach().cpu().numpy())
+
+        # obtain probability of a click (using top mlp)
+        p = self.apply_mlp(z, self.top_l)
+
+        # clamp output if needed
+        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
+            z = torch.clamp(p, min=self.loss_threshold, max=(1.0 - self.loss_threshold))
+        else:
+            z = p
+
+        return z
+
+    def distributed_forward(self, dense_x, lS_o, lS_i):
+        """Forward pass used when ``ext_dist.my_size > 1``.
+
+        Each rank looks up its own tables (``self.local_emb_slice``) for the
+        whole batch and runs the bottom MLP on its slice of ``dense_x``; the
+        lookups go through ``ext_dist.alltoall`` and the final outputs
+        through ``ext_dist.all_gather``. Calls ``sys.exit`` if the batch is
+        smaller than or not divisible by the rank count, or lengths mismatch.
+        """
+        batch_size = dense_x.size()[0]
+        # WARNING: # of ranks must be <= batch size in distributed_forward call
+        # NOTE(review): the message says "larger than", yet equality is accepted
+        if batch_size < ext_dist.my_size:
+            sys.exit("ERROR: batch_size (%d) must be larger than number of ranks (%d)" % (batch_size, ext_dist.my_size))
+        if batch_size % ext_dist.my_size != 0:
+            sys.exit("ERROR: batch_size %d can not split across %d ranks evenly" % (batch_size, ext_dist.my_size))
+
+        dense_x = dense_x[ext_dist.get_my_slice(batch_size)]
+        lS_o = lS_o[self.local_emb_slice]
+        lS_i = lS_i[self.local_emb_slice]
+
+        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
+            sys.exit("ERROR: corrupted model input detected in distributed_forward call")
+
+        # embeddings
+        tm.tmEmb.start()
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
+        tm.tmEmb.stop()
+
+        # WARNING: Note that at this point we have the result of the embedding lookup
+        # for the entire batch on each rank. We would like to obtain partial results
+        # corresponding to all embedding lookups, but part of the batch on each rank.
+        # Therefore, matching the distribution of output of bottom mlp, so that both
+        # could be used for subsequent interactions on each device.
+        if len(self.emb_l) != len(ly):
+            sys.exit("ERROR: corrupted intermediate result in distributed_forward call")
+
+        # issue the all-to-all first; its result is only waited on further below
+        tm.tmA2A.start()
+        a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank)
+        tm.tmA2A.stop()
+
+        tm.tmBot.start()
+        x = self.apply_mlp(dense_x, self.bot_l)
+        tm.tmBot.stop()
+
+        # the bottom mlp has been issued; now collect the exchanged lookups
+        tm.tmA2A1.start()
+        ly = a2a_req.wait()
+        tm.tmA2A1.stop()
+        ly = list(ly)
+
+        # interactions
+        tm.tmInt.start()
+        z = self.interact_features(x, ly)
+        tm.tmInt.stop()
+
+        # top mlp
+        tm.tmTop.start()
+        p = self.apply_mlp(z, self.top_l)
+        tm.tmTop.stop()
+
+        # clamp output if needed
+        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
+            z = torch.clamp(
+                p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
+            )
+        else:
+            z = p
+
+        ### gather the distributed results on each rank ###
+        # For some reason it requires explicit sync before all_gather call if
+        # tensor is on GPU memory
+        tm.tmAllGa.start()
+        if z.is_cuda: torch.cuda.synchronize()
+        (_, batch_split_lengths) = ext_dist.get_split_lengths(batch_size)
+        z = ext_dist.all_gather(z, batch_split_lengths)
+        tm.tmAllGa.stop()
+
+        return z
+
+    def parallel_forward(self, dense_x, lS_o, lS_i):
+        ### prepare model (overwrite) ###
+        # WARNING: # of devices must be >= batch size in parallel_forward call
+        batch_size = dense_x.size()[0]
+        ndevices = min(self.ndevices, batch_size, len(self.emb_l))
+        device_ids = range(ndevices)
+        # WARNING: must redistribute the model if mini-batch size changes(this is common
+        # for last mini-batch, when # of elements in the dataset/batch size is not even
+        if self.parallel_model_batch_size != batch_size:
+            self.parallel_model_is_not_prepared = True
+
+        if self.parallel_model_is_not_prepared or self.sync_dense_params:
+            # replicate mlp (data parallelism)
+            self.bot_l_replicas = replicate(self.bot_l, device_ids)
+            self.top_l_replicas = replicate(self.top_l, device_ids)
+            self.parallel_model_batch_size = batch_size
+
+        if self.parallel_model_is_not_prepared:
+            # distribute embeddings (model parallelism)
+            t_list = []
+            for k, emb in enumerate(self.emb_l):
+                d = torch.device("cuda:" + str(k % ndevices))
+                emb.to(d)
+                t_list.append(emb.to(d))
+            self.emb_l = nn.ModuleList(t_list)
+            self.parallel_model_is_not_prepared = False
+
+        ### prepare input (overwrite) ###
+        # scatter dense features (data parallelism)
+        # print(dense_x.device)
+        dense_x = scatter(dense_x, device_ids, dim=0)
+        # distribute sparse features (model parallelism)
+        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
+            sys.exit("ERROR: corrupted model input detected in parallel_forward call")
+
+        t_list = []
+        i_list = []
+        for k, _ in enumerate(self.emb_l):
+            d = torch.device("cuda:" + str(k % ndevices))
+            t_list.append(lS_o[k].to(d))
+            i_list.append(lS_i[k].to(d))
+        lS_o = t_list
+        lS_i = i_list
+
+        ### compute results in parallel ###
+        # bottom mlp
+        # WARNING: Note that the self.bot_l is a list of bottom mlp modules
+        # that have been replicated across devices, while dense_x is a tuple of dense
+        # inputs that has been scattered across devices on the first (batch) dimension.
+        # The output is a list of tensors scattered across devices according to the
+        # distribution of dense_x.
+        x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids)
+        # debug prints
+        # print(x)
+
+        # embeddings
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
+        # debug prints
+        # print(ly)
+
+        # butterfly shuffle (implemented inefficiently for now)
+        # WARNING: Note that at this point we have the result of the embedding lookup
+        # for the entire batch on each device. We would like to obtain partial results
+        # corresponding to all embedding lookups, but part of the batch on each device.
+        # Therefore, matching the distribution of output of bottom mlp, so that both
+        # could be used for subsequent interactions on each device.
+        if len(self.emb_l) != len(ly):
+            sys.exit("ERROR: corrupted intermediate result in parallel_forward call")
+
+        t_list = []
+        for k, _ in enumerate(self.emb_l):
+            d = torch.device("cuda:" + str(k % ndevices))
+            y = scatter(ly[k], device_ids, dim=0)
+            t_list.append(y)
+        # adjust the list to be ordered per device
+        ly = list(map(lambda y: list(y), zip(*t_list)))
+        # debug prints
+        # print(ly)
+
+        # interactions
+        z = []
+        for k in range(ndevices):
+            zk = self.interact_features(x[k], ly[k])
+            z.append(zk)
+        # debug prints
+        # print(z)
+
+        # top mlp
+        # WARNING: Note that the self.top_l is a list of top mlp modules that
+        # have been replicated across devices, while z is a list of interaction results
+        # that by construction are scattered across devices on the first (batch) dim.
+        # The output is a list of tensors scattered across devices according to the
+        # distribution of z.
+        p = parallel_apply(self.top_l_replicas, z, None, device_ids)
+
+        ### gather the distributed results ###
+        p0 = gather(p, self.output_d, dim=0)
+
+        # clamp output if needed
+        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
+            z0 = torch.clamp(
+                p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
+            )
+        else:
+            z0 = p0
+
+        return z0
+
+
+def dash_separated_ints(value):
+    vals = value.split('-')
+    for val in vals:
+        try:
+            int(val)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                "%s is not a valid dash separated list of ints" % value)
+
+    return value
+
+
+def dash_separated_floats(value):
+    vals = value.split('-')
+    for val in vals:
+        try:
+            float(val)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                "%s is not a valid dash separated list of floats" % value)
+
+    return value
+
+
+if __name__ == "__main__":
+    ### import packages ###
+    import sys
+    import os
+    import argparse
+
+    ### parse arguments ###
+    parser = argparse.ArgumentParser(
+        description="Train Deep Learning Recommendation Model (DLRM)"
+    )
+    # model related parameters
+    parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
+
+    parser.add_argument(
+        "--arch-embedding-size", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument("--arch-project-size", type=int, default=0)
+
+    # j will be replaced with the table number
+    parser.add_argument(
+        "--arch-mlp-bot", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument(
+        "--arch-mlp-top", type=dash_separated_ints, default="4-2-1")
+    parser.add_argument(
+        "--arch-interaction-op", type=str, choices=['dot', 'cat'], default="dot")
+    parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
+    # embedding table options
+    parser.add_argument("--md-flag", action="store_true", default=False)
+    parser.add_argument("--md-threshold", type=int, default=200)
+    parser.add_argument("--md-temperature", type=float, default=0.3)
+    parser.add_argument("--md-round-dims", action="store_true", default=False)
+    parser.add_argument("--qr-flag", action="store_true", default=False)
+    parser.add_argument("--qr-threshold", type=int, default=200)
+    parser.add_argument("--qr-operation", type=str, default="mult")
+    parser.add_argument("--qr-collisions", type=int, default=4)
+    # activations and loss
+    parser.add_argument("--activation-function", type=str, default="relu")
+    parser.add_argument("--loss-function", type=str, default="mse")  # or bce or wbce
+    parser.add_argument(
+        "--loss-weights", type=dash_separated_floats, default="1.0-1.0")  # for wbce
+    parser.add_argument("--loss-threshold", type=float, default=0.0)  # 1.0e-7
+    parser.add_argument("--round-targets", type=bool, default=False)
+    # data
+    parser.add_argument("--data-size", type=int, default=1)
+    parser.add_argument("--num-batches", type=int, default=0)
+    parser.add_argument(
+        "--data-generation", type=str, default="random"
+    )  # synthetic or dataset
+    parser.add_argument("--synthetic-data-folder", type=str,
+        default="./synthetic_data/syn_data_bs65536")
+    # add Gaussian distribution
+    parser.add_argument("--rand-data-dist", type=str, default="uniform")  # uniform or gaussian
+    parser.add_argument("--rand-data-min", type=float, default=0)
+    parser.add_argument("--rand-data-max", type=float, default=1)
+    parser.add_argument("--rand-data-mu", type=float, default=-1)
+    parser.add_argument("--rand-data-sigma", type=float, default=1)
+
+    parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
+    parser.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
+    parser.add_argument("--raw-data-file", type=str, default="")
+    parser.add_argument("--processed-data-file", type=str, default="")
+    parser.add_argument("--data-randomize", type=str, default="total")  # or day or none
+    parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
+    parser.add_argument("--max-ind-range", type=int, default=-1)
+    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
+    parser.add_argument("--num-indices-per-lookup", type=int, default=10)
+    parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
+    parser.add_argument("--num-workers", type=int, default=0)
+    parser.add_argument("--memory-map", action="store_true", default=False)
+    # training
+    parser.add_argument("--mini-batch-size", type=int, default=1)
+    parser.add_argument("--nepochs", type=int, default=1)
+    parser.add_argument("--learning-rate", type=float, default=0.01)
+    parser.add_argument("--print-precision", type=int, default=5)
+    parser.add_argument("--numpy-rand-seed", type=int, default=123)
+    parser.add_argument("--sync-dense-params", type=bool, default=True)
+    # inference
+    parser.add_argument("--inference-only", action="store_true", default=False)
+    # onnx
+    parser.add_argument("--save-onnx", action="store_true", default=False)
+    # gpu
+    parser.add_argument("--use-gpu", action="store_true", default=False)
+    # distributed run
+    parser.add_argument("--dist-backend", type=str, default="")
+    # debugging and profiling
+    parser.add_argument("--print-freq", type=int, default=1)
+    parser.add_argument("--test-freq", type=int, default=-1)
+    parser.add_argument("--test-mini-batch-size", type=int, default=-1)
+    parser.add_argument("--test-num-workers", type=int, default=-1)
+    parser.add_argument("--print-time", action="store_true", default=False)
+    parser.add_argument("--debug-mode", action="store_true", default=False)
+    parser.add_argument("--enable-profiling", action="store_true", default=False)
+    parser.add_argument("--plot-compute-graph", action="store_true", default=False)
+    # store/load model
+    parser.add_argument("--out-dir", type=str, default=".")
+    parser.add_argument("--save-model", type=str, default="")
+    parser.add_argument("--load-model", type=str, default="")
+    # mlperf logging (disables other output and stops early)
+    parser.add_argument("--mlperf-logging", action="store_true", default=False)
+    # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
+    parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
+    # stop at target AUC Terabyte (no subsampling) 0.8025
+    parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
+    parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
+    parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False)
+
+    # LR policy
+    parser.add_argument("--lr-num-warmup-steps", type=int, default=0)
+    parser.add_argument("--lr-decay-start-step", type=int, default=0)
+    parser.add_argument("--lr-num-decay-steps", type=int, default=0)
+
+    args = parser.parse_args()
+
+    print(socket.gethostname())
+
+    ext_dist.init_distributed(backend=args.dist_backend)
+
+    # print("success size= ", ext_dist.my_size, ext_dist.my_rank)
+
+    ext_dist.barrier()
+
+    if args.mlperf_logging:
+        print('command line args: ', json.dumps(vars(args)))
+
+    ### some basic setup ###
+    np.random.seed(args.numpy_rand_seed)
+    np.set_printoptions(precision=args.print_precision)
+    torch.set_printoptions(precision=args.print_precision)
+    torch.manual_seed(args.numpy_rand_seed)
+
+    if (args.test_mini_batch_size < 0):
+        # if the parameter is not set, use the training batch size
+        args.test_mini_batch_size = args.mini_batch_size
+    if (args.test_num_workers < 0):
+        # if the parameter is not set, use the same parameter for training
+        args.test_num_workers = args.num_workers
+    if args.mini_batch_size % ext_dist.my_size !=0 or args.test_mini_batch_size % ext_dist.my_size != 0:
+        print("Either test minibatch (%d) or train minibatch (%d) does not split across %d ranks" % (args.test_mini_batch_size, args.mini_batch_size, ext_dist.my_size))
+        sys.exit(1)
+
+    use_gpu = args.use_gpu and torch.cuda.is_available()
+    if use_gpu:
+        torch.cuda.manual_seed_all(args.numpy_rand_seed)
+        torch.backends.cudnn.deterministic = True
+        if ext_dist.my_size > 1:
+            ngpus = torch.cuda.device_count()  # 1
+            if ext_dist.my_local_size > torch.cuda.device_count():
+                print("Not sufficient GPUs available... local_size = %d, ngpus = %d" % (ext_dist.my_local_size, ngpus))
+                sys.exit(1)
+            ngpus = 1
+            device = torch.device("cuda", ext_dist.my_local_rank)
+        else:
+            device = torch.device("cuda", 0)
+            ngpus = torch.cuda.device_count()  # 1
+            ngpus=1
+        print("Using {} GPU(s)...".format(ngpus))
+    else:
+        device = torch.device("cpu")
+        print("Using CPU...")
+
+    ### prepare training data ###
+    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
+    # input data
+    if (args.data_generation == "dataset"):
+
+        train_data, train_ld, test_data, test_ld = \
+            dp.make_criteo_data_and_loaders(args)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+        nbatches_test = len(test_ld)
+
+        ln_emb = train_data.counts
+        # enforce maximum limit on number of vectors per embedding
+        if args.max_ind_range > 0:
+            ln_emb = np.array(list(map(
+                lambda x: x if x < args.max_ind_range else args.max_ind_range,
+                ln_emb
+            )))
+        m_den = train_data.m_den
+        ln_bot[0] = m_den
+
+    elif args.data_generation == "synthetic":
+        # input and target at random
+        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
+        m_den = ln_bot[0]
+        train_data, train_ld = dd.data_loader(args, ln_emb, m_den)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+        table_feature_map = None #  {idx : idx for idx in range(len(ln_emb))}
+
+    else:
+        # input and target at random
+        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
+        m_den = ln_bot[0]
+        train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+
+    ### parse command line arguments ###
+    m_spa = args.arch_sparse_feature_size
+    num_fea = ln_emb.size + 1  # num sparse + num dense features
+    m_den_out = ln_bot[ln_bot.size - 1]
+    if args.arch_interaction_op == "dot":
+        # approach 1: all
+        # num_int = num_fea * num_fea + m_den_out
+        # approach 2: unique
+        if (args.arch_project_size > 0):
+            num_int = num_fea * args.arch_project_size + m_den_out
+        else:
+            if args.arch_interaction_itself:
+                num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
+            else:
+                num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
+    elif args.arch_interaction_op == "cat":
+        num_int = num_fea * m_den_out
+    else:
+        sys.exit(
+            "ERROR: --arch-interaction-op="
+            + args.arch_interaction_op
+            + " is not supported"
+        )
+    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
+    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
+
+    # sanity check: feature sizes and mlp dimensions must match
+    if m_den != ln_bot[0]:
+        sys.exit(
+            "ERROR: arch-dense-feature-size "
+            + str(m_den)
+            + " does not match first dim of bottom mlp "
+            + str(ln_bot[0])
+        )
+    if args.qr_flag:
+        if args.qr_operation == "concat" and 2 * m_spa != m_den_out:
+            sys.exit(
+                "ERROR: 2 arch-sparse-feature-size "
+                + str(2 * m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+                + " (note that the last dim of bottom mlp must be 2x the embedding dim)"
+            )
+        if args.qr_operation != "concat" and m_spa != m_den_out:
+            sys.exit(
+                "ERROR: arch-sparse-feature-size "
+                + str(m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+            )
+    else:
+        if m_spa != m_den_out:
+            sys.exit(
+                "ERROR: arch-sparse-feature-size "
+                + str(m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+            )
+    if num_int != ln_top[0]:
+        sys.exit(
+            "ERROR: # of feature interactions "
+            + str(num_int)
+            + " does not match first dimension of top mlp "
+            + str(ln_top[0])
+        )
+
+    # assign mixed dimensions if applicable
+    if args.md_flag:
+        m_spa = md_solver(
+            torch.tensor(ln_emb),
+            args.md_temperature,  # alpha
+            d0=m_spa,
+            round_dim=args.md_round_dims
+        ).tolist()
+
+    # test prints (model arch)
+    if args.debug_mode:
+        print("model arch:")
+        print(
+            "mlp top arch "
+            + str(ln_top.size - 1)
+            + " layers, with input to output dimensions:"
+        )
+        print(ln_top)
+        print("# of interactions")
+        print(num_int)
+        print(
+            "mlp bot arch "
+            + str(ln_bot.size - 1)
+            + " layers, with input to output dimensions:"
+        )
+        print(ln_bot)
+        print("# of features (sparse and dense)")
+        print(num_fea)
+        print("dense feature size")
+        print(m_den)
+        print("sparse feature size")
+        print(m_spa)
+        print(
+            "# of embeddings (= # of sparse features) "
+            + str(ln_emb.size)
+            + ", with dimensions "
+            + str(m_spa)
+            + "x:"
+        )
+        print(ln_emb)
+
+        print("data (inputs and targets):")
+        for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
+            # early exit if nbatches was set by the user and has been exceeded
+            if nbatches > 0 and j >= nbatches:
+                break
+
+            print("mini-batch: %d" % j)
+            print(X.detach().cpu().numpy())
+            # transform offsets to lengths when printing
+            print(
+                [
+                    np.diff(
+                        S_o.detach().cpu().tolist() + list(lS_i[i].shape)
+                    ).tolist()
+                    for i, S_o in enumerate(lS_o)
+                ]
+            )
+            print([S_i.detach().cpu().tolist() for S_i in lS_i])
+            print(T.detach().cpu().numpy())
+
+    ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
+
+    ### construct the neural network specified above ###
+    # WARNING: to obtain exactly the same initialization for
+    # the weights we need to start from the same random seed.
+    # np.random.seed(args.numpy_rand_seed)
+    dlrm = DLRM_Net(
+        m_spa,
+        ln_emb,
+        ln_bot,
+        ln_top,
+        args.arch_project_size,
+        arch_interaction_op=args.arch_interaction_op,
+        arch_interaction_itself=args.arch_interaction_itself,
+        sigmoid_bot=-1,
+        sigmoid_top=ln_top.size - 2,
+        sync_dense_params=args.sync_dense_params,
+        loss_threshold=args.loss_threshold,
+        ndevices=ndevices,
+        qr_flag=args.qr_flag,
+        qr_operation=args.qr_operation,
+        qr_collisions=args.qr_collisions,
+        qr_threshold=args.qr_threshold,
+        md_flag=args.md_flag,
+        md_threshold=args.md_threshold,
+    )
+    # test prints
+    if args.debug_mode:
+        print("initial parameters (weights and bias):")
+        for param in dlrm.parameters():
+            print(param.detach().cpu().numpy())
+        # print(dlrm)
+
+    if use_gpu:
+        # Custom Model-Data Parallel
+        # the mlps are replicated and use data parallelism, while
+        # the embeddings are distributed and use model parallelism
+        dlrm = dlrm.to(device)  # .cuda()
+        if dlrm.ndevices > 1:
+            dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb)
+
+    if ext_dist.my_size > 1:
+        if use_gpu:
+            device_ids = [ext_dist.my_local_rank]
+            dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids)
+            dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids)
+        else:
+            dlrm.bot_l = DDP(dlrm.bot_l)
+            dlrm.top_l = DDP(dlrm.top_l)
+
+    # specify the loss function
+    if args.loss_function == "mse":
+        loss_fn = torch.nn.MSELoss(reduction="mean")
+    elif args.loss_function == "bce":
+        loss_fn = torch.nn.BCELoss(reduction="mean")
+    elif args.loss_function == "wbce":
+        loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-"))
+        loss_fn = torch.nn.BCELoss(reduction="none")
+    else:
+        sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported")
+
+    if not args.inference_only:
+        # specify the optimizer algorithm
+
+        if ext_dist.my_size == 1:
+            optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate)
+            #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step,
+            #                                 args.lr_num_decay_steps)
+        else:
+            optimizer = torch.optim.SGD([
+                {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr" : args.learning_rate},
+                {"params": dlrm.bot_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size},
+                {"params": dlrm.top_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size}
+            ], lr=args.learning_rate)
+
+    ### main loop ###
+    def time_wrap(use_gpu):  # returns time.time(), after a CUDA sync when use_gpu is truthy
+        if use_gpu:
+            torch.cuda.synchronize()  # wait for queued GPU work before reading the clock
+        return time.time()
+
+    def dlrm_wrap(X, lS_o, lS_i, use_gpu, device):
+        """Run dlrm on one batch, first moving the inputs to device when use_gpu.
+
+        lS_o / lS_i may each be a list of tensors or a single stacked tensor;
+        lists are moved element by element. tm.tmH2D is started/stopped around the moves.
+        """
+        if not use_gpu:
+            return dlrm(X, lS_o, lS_i)
+
+        def to_dev(t):
+            return [e.to(device) for e in t] if isinstance(t, list) else t.to(device)
+
+        tm.tmH2D.start()
+        lS_i = to_dev(lS_i)
+        lS_o = to_dev(lS_o)
+        X = X.to(device)
+        tm.tmH2D.stop()
+
+        return dlrm(X, lS_o, lS_i)
+
+    def loss_fn_wrap(Z, T, use_gpu, device):
+        """Evaluate the configured loss for predictions Z against targets T.
+
+        "mse"/"bce" return loss_fn directly; "wbce" scales the per-element
+        loss by loss_ws indexed with the target values, then averages.
+        Any other args.loss_function falls through and returns None.
+        """
+        kind = args.loss_function
+        if kind in ("mse", "bce"):
+            return loss_fn(Z, T.to(device) if use_gpu else T)
+        if kind != "wbce":
+            return None
+        weights = loss_ws[T.data.view(-1).long()].view_as(T)
+        if use_gpu:
+            weights = weights.to(device)
+        # the target is passed through .to(device) whether or not use_gpu is set
+        per_sample = loss_fn(Z, T.to(device))
+        return (weights * per_sample).mean()
+
+    # training or inference
+    best_gA_test = 0
+    best_auc_test = 0
+    skip_upto_epoch = 0
+    skip_upto_batch = 0
+    total_time = 0
+    total_loss = 0
+    total_accu = 0
+    total_iter = 0
+    total_samp = 0
+    k = 0
+
+    # Load model if specified
+    if not (args.load_model == ""):
+        print("Loading saved model {}".format(args.load_model))
+        if use_gpu:
+            if dlrm.ndevices > 1:
+                # NOTE: when targeting inference on multiple GPUs,
+                # load the model as is on CPU or GPU, with the move
+                # to multiple GPUs to be done in parallel_forward
+                ld_model = torch.load(args.load_model)
+            else:
+                # NOTE: when targeting inference on single GPU,
+                # note that the call to .to(device) has already happened
+                ld_model = torch.load(
+                    args.load_model,
+                    map_location=torch.device('cuda')
+                    # map_location=lambda storage, loc: storage.cuda(0)
+                )
+        else:
+            # when targeting inference on CPU
+            ld_model = torch.load(args.load_model, map_location=torch.device('cpu'))
+        dlrm.load_state_dict(ld_model["state_dict"])
+        ld_j = ld_model["iter"]
+        ld_k = ld_model["epoch"]
+        ld_nepochs = ld_model["nepochs"]
+        ld_nbatches = ld_model["nbatches"]
+        ld_nbatches_test = ld_model["nbatches_test"]
+        ld_gA = ld_model["train_acc"]
+        ld_gL = ld_model["train_loss"]
+        ld_total_loss = ld_model["total_loss"]
+        ld_total_accu = ld_model["total_accu"]
+        ld_gA_test = ld_model["test_acc"]
+        ld_gL_test = ld_model["test_loss"]
+        if not args.inference_only:
+            optimizer.load_state_dict(ld_model["opt_state_dict"])
+            best_gA_test = ld_gA_test
+            total_loss = ld_total_loss
+            total_accu = ld_total_accu
+            skip_upto_epoch = ld_k  # epochs
+            skip_upto_batch = ld_j  # batches
+        else:
+            args.print_freq = ld_nbatches
+            args.test_freq = 0
+
+        print(
+            "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format(
+                ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test
+            )
+        )
+        print(
+            "Training state: loss = {:.6f}, accuracy = {:3.3f} %".format(
+                ld_gL, ld_gA * 100
+            )
+        )
+        print(
+            "Testing state: loss = {:.6f}, accuracy = {:3.3f} %".format(
+                ld_gL_test, ld_gA_test * 100
+            )
+        )
+
+    ext_dist.barrier()
+    startTime = time.time()
+    startTime0 = startTime
+    skipped = 0
+
+    #print("Processing data")
+    #t1 = time.time()
+    syndatasetlen = min(65536 // args.mini_batch_size, nbatches)
+    #myobj = list(enumerate(train_ld))
+    #t2 = time.time()
+    #print("Processing data takes {} seconds with len={} {} {} {}".format(t2-t1, len(myobj), nbatches, args.mini_batch_size, syndatasetlen))
+    print("time/loss/accuracy (if enabled):")
+    with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof:
+    # with torch.autograd.profiler.emit_nvtx():
+
+        while k < args.nepochs:
+            if k < skip_upto_epoch:
+                k += 1  # advance first: a bare `continue` would re-test the same k forever
+                continue
+            if use_gpu:
+                tm.tmSync1.start()
+                torch.cuda.synchronize()
+                tm.tmSync1.stop()
+            accum_time_begin = time.time()
+
+            if args.mlperf_logging:
+                previous_iteration_time = None
+
+            # for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
+            for j in range(nbatches):
+                tm.tmGetData.start() 
+                # X, lS_o, lS_i, T = myobj[j%syndatasetlen][1]
+                X, lS_o, lS_i, T = train_data.__getitem__(j%syndatasetlen)
+                tm.tmGetData.stop()
+
+                if j == 0 and args.save_onnx:
+                    (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i)
+
+                if j < skip_upto_batch:
+                    continue
+
+                if (skipped == 2):
+                    ext_dist.barrier()
+                    startTime = time.time()
+                    ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank)
+                    # torch.cuda.profiler.cudart().cudaProfilerStart()
+                    torch.cuda.profiler.start()
+                    tm.tmClear()
+                skipped = skipped + 1
+
+                if args.mlperf_logging:
+                    current_time = time_wrap(use_gpu)
+                    if previous_iteration_time:
+                        iteration_time = current_time - previous_iteration_time
+                    else:
+                        iteration_time = 0
+                    previous_iteration_time = current_time
+                else:
+                    if use_gpu:
+                        tm.tmSync2.start()
+                        torch.cuda.synchronize()
+                        tm.tmSync2.stop()
+                    t1 = time.time()
+
+                # early exit if nbatches was set by the user and has been exceeded
+                if nbatches > 0 and j >= nbatches:
+                    break
+                '''
+                # debug prints
+                print("input and targets")
+                print(X.detach().cpu().numpy())
+                print([np.diff(S_o.detach().cpu().tolist()
+                       + list(lS_i[i].shape)).tolist() for i, S_o in enumerate(lS_o)])
+                print([S_i.detach().cpu().numpy().tolist() for S_i in lS_i])
+                print(T.detach().cpu().numpy())
+                '''
+                # Skip the batch if batch size not multiple of total ranks
+                if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0:
+                    print("Warning: Skiping the batch %d with size %d" % (j, X.size(0)))
+                    continue
+
+
+                # forward pass
+                tm.tmFwd.start()
+                Z = dlrm_wrap(X, lS_o, lS_i, use_gpu, device)
+                tm.tmFwd.stop()
+
+                # loss
+                tm.tmLoss.start()
+                E = loss_fn_wrap(Z, T, use_gpu, device)
+                '''
+                # debug prints
+                print("output and loss")
+                print(Z.detach().cpu().numpy())
+                print(E.detach().cpu().numpy())
+                '''
+                # compute loss and accuracy
+                L = E.detach().cpu().numpy()  # numpy array
+                S = Z.detach().cpu().numpy()  # numpy array
+                T = T.detach().cpu().numpy()  # numpy array
+                mbs = T.shape[0]  # = args.mini_batch_size except maybe for last
+                A = np.sum((np.round(S, 0) == T).astype(np.uint8))
+                tm.tmLoss.stop()
+
+                if not args.inference_only:
+                    # scaled error gradient propagation
+                    # (where we do not accumulate gradients across mini-batches)
+                    tm.tmZero.start()
+                    optimizer.zero_grad()
+                    tm.tmZero.stop()
+
+                    # backward pass
+                    tm.tmBwd.start()
+                    E.backward()
+                    tm.tmBwd.stop()
+
+                    # debug prints (check gradient norm)
+                    # for l in mlp.layers:
+                    #     if hasattr(l, 'weight'):
+                    #          print(l.weight.grad.norm().item())
+
+                    # optimizer
+                    tm.tmOpt.start()
+                    optimizer.step()
+                    tm.tmOpt.stop()
+
+                    ### lr_scheduler.step()
+
+                if args.mlperf_logging:
+                    total_time += iteration_time
+                else:
+                    if use_gpu:
+                        tm.tmSync3.start()
+                        torch.cuda.synchronize()
+                        tm.tmSync3.stop()
+                    t2 = time.time()
+                    total_time += t2 - t1
+
+                total_accu += A
+                total_loss += L * mbs
+                total_iter += 1
+                total_samp += mbs
+
+                should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches)
+                should_test = (
+                    (args.test_freq > 0)
+                    and (args.data_generation == "dataset")
+                    and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
+                )
+
+                # print time, loss and accuracy
+                if should_print or should_test:
+                    gT = 1000.0 * total_time / total_iter if args.print_time else -1
+                    total_time = 0
+
+                    gA = total_accu / total_samp
+                    total_accu = 0
+
+                    gL = total_loss / total_samp
+                    total_loss = 0
+
+                    str_run_type = "inference" if args.inference_only else "training"
+                    print(
+                        "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format(
+                            str_run_type, j + 1, nbatches, k, gT
+                        )
+                        + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL,
+                            gA * 100, total_iter, ext_dist.my_rank)
+                    )
+                    # Uncomment the line below to print out the total time with overhead
+                    if ext_dist.my_rank < 2:
+                      tt1 = time.time()
+                      ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \
+                       .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1))
+                    total_iter = 0
+                    total_samp = 0
+
+                # testing
+                if should_test and not args.inference_only:
+                    # don't measure training iter time in a test iteration
+                    if args.mlperf_logging:
+                        previous_iteration_time = None
+
+                    test_accu = 0
+                    test_loss = 0
+                    test_samp = 0
+
+                    accum_test_time_begin = time_wrap(use_gpu)
+                    if args.mlperf_logging:
+                        scores = []
+                        targets = []
+
+                    for i, (X_test, lS_o_test, lS_i_test, T_test) in enumerate(test_ld):
+                        # early exit if nbatches was set by the user and was exceeded
+                        if nbatches > 0 and i >= nbatches:
+                            break
+
+                        # Skip the batch if batch size not multiple of total ranks
+                        if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0:
+                            print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0)))
+                            continue
+
+                        t1_test = time_wrap(use_gpu)
+
+                        # forward pass
+                        Z_test = dlrm_wrap(
+                            X_test, lS_o_test, lS_i_test, use_gpu, device
+                        )
+                        if args.mlperf_logging:
+                            S_test = Z_test.detach().cpu().numpy()  # numpy array
+                            T_test = T_test.detach().cpu().numpy()  # numpy array
+                            scores.append(S_test)
+                            targets.append(T_test)
+                        else:
+                            # loss
+                            E_test = loss_fn_wrap(Z_test, T_test, use_gpu, device)
+
+                            # compute loss and accuracy
+                            L_test = E_test.detach().cpu().numpy()  # numpy array
+                            S_test = Z_test.detach().cpu().numpy()  # numpy array
+                            T_test = T_test.detach().cpu().numpy()  # numpy array
+                            mbs_test = T_test.shape[0]  # = mini_batch_size except last
+                            A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8))
+                            test_accu += A_test
+                            test_loss += L_test * mbs_test
+                            test_samp += mbs_test
+
+                        t2_test = time_wrap(use_gpu)
+
+                    if args.mlperf_logging:
+                        scores = np.concatenate(scores, axis=0)
+                        targets = np.concatenate(targets, axis=0)
+
+                        metrics = {
+                            'loss' : sklearn.metrics.log_loss,
+                            'recall' : lambda y_true, y_score:
+                            sklearn.metrics.recall_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'precision' : lambda y_true, y_score:
+                            sklearn.metrics.precision_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'f1' : lambda y_true, y_score:
+                            sklearn.metrics.f1_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'ap' : sklearn.metrics.average_precision_score,
+                            'roc_auc' : sklearn.metrics.roc_auc_score,
+                            'accuracy' : lambda y_true, y_score:
+                            sklearn.metrics.accuracy_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            # 'pre_curve' : sklearn.metrics.precision_recall_curve,
+                            # 'roc_curve' :  sklearn.metrics.roc_curve,
+                        }
+
+                        # print("Compute time for validation metric : ", end="")
+                        # first_it = True
+                        validation_results = {}
+                        for metric_name, metric_function in metrics.items():
+                            # if first_it:
+                            #     first_it = False
+                            # else:
+                            #     print(", ", end="")
+                            # metric_compute_start = time_wrap(False)
+                            validation_results[metric_name] = metric_function(
+                                targets,
+                                scores
+                            )
+                            # metric_compute_end = time_wrap(False)
+                            # met_time = metric_compute_end - metric_compute_start
+                            # print("{} {:.4f}".format(metric_name, 1000 * (met_time)),
+                            #      end="")
+                        # print(" ms")
+                        gA_test = validation_results['accuracy']
+                        gL_test = validation_results['loss']
+                    else:
+                        gA_test = test_accu / test_samp
+                        gL_test = test_loss / test_samp
+
+                    is_best = gA_test > best_gA_test
+                    if is_best:
+                        best_gA_test = gA_test
+                        if not (args.save_model == ""):
+                            print("Saving model to {}".format(args.save_model))
+                            torch.save(
+                                {
+                                    "epoch": k,
+                                    "nepochs": args.nepochs,
+                                    "nbatches": nbatches,
+                                    "nbatches_test": nbatches_test,
+                                    "iter": j + 1,
+                                    "state_dict": dlrm.state_dict(),
+                                    "train_acc": gA,
+                                    "train_loss": gL,
+                                    "test_acc": gA_test,
+                                    "test_loss": gL_test,
+                                    "total_loss": total_loss,
+                                    "total_accu": total_accu,
+                                    "opt_state_dict": optimizer.state_dict(),
+                                },
+                                args.save_model,
+                            )
+
+                    if args.mlperf_logging:
+                        is_best = validation_results['roc_auc'] > best_auc_test
+                        if is_best:
+                            best_auc_test = validation_results['roc_auc']
+
+                        print(
+                            "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
+                            + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format(
+                                validation_results['loss'],
+                                validation_results['recall'],
+                                validation_results['precision']
+                            )
+                            + " f1 {:.4f}, ap {:.4f},".format(
+                                validation_results['f1'],
+                                validation_results['ap'],
+                            )
+                            + " auc {:.4f}, best auc {:.4f},".format(
+                                validation_results['roc_auc'],
+                                best_auc_test
+                            )
+                            + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
+                                validation_results['accuracy'] * 100,
+                                best_gA_test * 100
+                            )
+                        )
+                    else:
+                        print(
+                            "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0)
+                            + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format(
+                                gL_test, gA_test * 100, best_gA_test * 100
+                            )
+                        )
+                    # Uncomment the line below to print out the total time with overhead
+                    # print("Total test time for this group: {}" \
+                    # .format(time_wrap(use_gpu) - accum_test_time_begin))
+
+                    if (args.mlperf_logging
+                        and (args.mlperf_acc_threshold > 0)
+                        and (best_gA_test > args.mlperf_acc_threshold)):
+                        print("MLPerf testing accuracy threshold "
+                              + str(args.mlperf_acc_threshold)
+                              + " reached, stop training")
+                        break
+
+                    if (args.mlperf_logging
+                        and (args.mlperf_auc_threshold > 0)
+                        and (best_auc_test > args.mlperf_auc_threshold)):
+                        print("MLPerf testing auc threshold "
+                              + str(args.mlperf_auc_threshold)
+                              + " reached, stop training")
+                        break
+
+                #if (ext_dist.my_rank == 0 and should_print):
+                #    print("ITER : ", j, " from nvidia-smi")
+                #    os.system("nvidia-smi")
+
+            k += 1  # nepochs
+
+    #if (ext_dist.my_rank == 0):
+    #    # print(torch.cuda.memory_allocated(0))
+    #    print(torch.cuda.memory_summary(0))
+    #    # print("from nvidia-smi")
+    #    os.system("nvidia-smi")
+
+    tt2 = time.time()
+    endTime = tt2 - startTime
+    ext_dist.barrier()
+    tt3 = time.time()
+    finalTime = tt3 - startTime
+    # torch.cuda.profiler.cudart().cudaProfilerStop()
+    torch.cuda.profiler.stop()
+    if (skipped > 2):
+        skipped -= 2
+    ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \
+        iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0,
+        finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True)
+    if (ext_dist.my_rank < 2):
+        tm.tmSummary(ext_dist.my_rank)
+
+    file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank)
+    # profiling
+    if args.enable_profiling:
+        os.makedirs(args.out_dir, exist_ok=True)
+        with open("TT"+str(uuid.uuid4().hex), "w") as prof_f:
+            prof_f.write(prof.key_averages(group_by_input_shape=True).table(
+                sort_by="self_cpu_time_total",
+            ))
+
+#        with open("%s.prof" % file_prefix, "w") as prof_f:
+#            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
+#            prof.export_chrome_trace("./%s.json" % file_prefix)
+#            print(prof.key_averages().table(sort_by="cpu_time_total"))
+
+    # plot compute graph
+    if args.plot_compute_graph:
+        sys.exit(
+            "ERROR: Please install pytorchviz package in order to use the"
+            + " visualization. Then, uncomment its import above as well as"
+            + " three lines below and run the code again."
+        )
+        # os.makedirs(args.out_dir, exist_ok=True)
+        # V = Z.mean() if args.inference_only else E
+        # dot = make_dot(V, params=dict(dlrm.named_parameters()))
+        # dot.render('%s_graph' % file_prefix) # write .pdf file
+
+    # test prints
+    if not args.inference_only and args.debug_mode:
+        print("updated parameters (weights and bias):")
+        for param in dlrm.parameters():
+            print(param.detach().cpu().numpy())
+
+    # export the model in onnx
+    if args.save_onnx:
+        # write the exported graph to a fixed file name in the working directory
+        dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx"
+        torch.onnx.export(
+            dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True
+        )
+
+        # recover the model back: reload from the very path passed to export
+        # (file_prefix names a different file that this block never writes)
+        dlrm_pytorch_onnx = onnx.load(dlrm_pytorch_onnx_file)
+        onnx.checker.check_model(dlrm_pytorch_onnx)  # check the onnx model
diff --git a/dlrm_s_caffe2.py b/dlrm_s_caffe2.py
index 47b27d61..eb3e3638 100644
--- a/dlrm_s_caffe2.py
+++ b/dlrm_s_caffe2.py
@@ -79,6 +79,7 @@
 # caffe2
 from caffe2.proto import caffe2_pb2
 from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace
+# from caffe2.python.predictor import mobile_exporter
 
 """
 # auxiliary routine used to split input on the mini-bacth dimension
@@ -607,6 +608,9 @@ def create_model(self, X, S_lengths, S_indices, T):
         tril_indices = np.array([j + i * num_fea
                                  for i in range(num_fea) for j in range(i + offset)])
         self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices)
+        if self.save_onnx:
+            tish = tril_indices.shape
+            self.onnx_tsd[self.tint + "_tril_indices"] = (onnx.TensorProto.INT32, tish)
 
         # create compute graph
         if T is not None:
diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py
index 3aeeec0c..5129a39b 100644
--- a/dlrm_s_pytorch.py
+++ b/dlrm_s_pytorch.py
@@ -65,6 +65,7 @@
 
 # numpy
 import numpy as np
+import socket
 
 # onnx
 # The onnx import causes deprecation warnings every time workers
@@ -72,10 +73,11 @@
 import warnings
 with warnings.catch_warnings():
     warnings.filterwarnings("ignore", category=DeprecationWarning)
-import onnx
+## import onnx
 
 # pytorch
 import torch
+from torch import onnx
 import torch.nn as nn
 from torch.nn.parallel.parallel_apply import parallel_apply
 from torch.nn.parallel.replicate import replicate
@@ -91,12 +93,57 @@
 
 import sklearn.metrics
 
+import uuid
+import project
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+import dlrm_data as dd
+
+# import synthetic_data_loader as fb_syn_data
+
 # from torchviz import make_dot
 # import torch.nn.functional as Functional
 # from torch.nn.parameter import Parameter
 
+from torch.optim.lr_scheduler import _LRScheduler
+
 exc = getattr(builtins, "IOError", "FileNotFoundError")
 
+class LRPolicyScheduler(_LRScheduler):
+    """Learning-rate policy: linear warmup, hold, then squared decay to a floor.
+
+    _step_count < num_warmup_steps: base rates scaled by step / num_warmup_steps.
+    decay_start_step <= _step_count < decay_end_step: base rates scaled by
+    (remaining / num_decay_steps) ** 2, floored at 1e-7.  Otherwise the last
+    computed rates are held (base rates when num_decay_steps is 0).
+    """
+    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
+        self.num_warmup_steps = num_warmup_steps
+        self.decay_start_step = decay_start_step
+        self.decay_end_step = decay_start_step + num_decay_steps
+        self.num_decay_steps = num_decay_steps
+
+        if self.decay_start_step < self.num_warmup_steps:  # invalid schedule: abort
+            sys.exit("Learning rate warmup must finish before the decay starts")
+
+        super(LRPolicyScheduler, self).__init__(optimizer)
+
+    def get_lr(self):
+        step_count = self._step_count
+        if step_count < self.num_warmup_steps:  # warmup
+            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
+            lr = self.last_lr = [base_lr * scale for base_lr in self.base_lrs]
+        elif self.decay_start_step <= step_count < self.decay_end_step:  # decay
+            decayed_steps = step_count - self.decay_start_step
+            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
+            lr = self.last_lr = [max(1e-7, base_lr * scale) for base_lr in self.base_lrs]
+        elif self.num_decay_steps > 0:
+            # freeze at the last computed rates (after decay, or between warmup and decay);
+            # if neither branch above has run yet there is no last_lr, so use the base rates
+            lr = getattr(self, "last_lr", self.base_lrs)
+        else:  # no decay configured: do not adjust
+            lr = self.base_lrs
+        return lr
 
 ### define dlrm in PyTorch ###
 class DLRM_Net(nn.Module):
@@ -154,9 +201,9 @@ def create_emb(self, m, ln):
             if self.qr_flag and n > self.qr_threshold:
                 EE = QREmbeddingBag(n, m, self.qr_collisions,
                     operation=self.qr_operation, mode="sum", sparse=True)
-            elif self.md_flag and n > self.md_threshold:
-                _m = m[i]
+            elif self.md_flag:
                 base = max(m)
+                _m = m[i] if n > self.md_threshold else base
                 EE = PrEmbeddingBag(n, _m, base)
                 # use np initialization as below for consistency...
                 W = np.random.uniform(
@@ -198,6 +245,7 @@ def __init__(
         ln_emb=None,
         ln_bot=None,
         ln_top=None,
+        proj_size = 0,
         arch_interaction_op=None,
         arch_interaction_itself=False,
         sigmoid_bot=-1,
@@ -223,6 +271,7 @@ def __init__(
         ):
 
             # save arguments
+            self.proj_size = proj_size
             self.ndevices = ndevices
             self.output_d = 0
             self.parallel_model_batch_size = -1
@@ -259,6 +308,8 @@ def __init__(
                 self.emb_l = self.create_emb(m_spa, ln_emb)
             self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
             self.top_l = self.create_mlp(ln_top, sigmoid_top)
+            if (proj_size > 0):
+                self.proj_l = project.create_proj(len(ln_emb)+1, proj_size)
 
     def apply_mlp(self, x, layers):
         # approach 1: use ModuleList
@@ -268,6 +319,14 @@ def apply_mlp(self, x, layers):
         # approach 2: use Sequential container to wrap all layers
         return layers(x)
 
+    def apply_proj(self, x, layers):
+        """Feed x through the projection layers and return the output.
+
+        layers is called once as a whole (Sequential-style container) instead
+        of being walked layer by layer the way a ModuleList would be."""
+        projected = layers(x)
+        return projected
+
     def apply_emb(self, lS_o, lS_i, emb_l):
         # WARNING: notice that we are processing the batch at once. We implicitly
         # assume that the data is laid out such that:
@@ -298,22 +357,32 @@ def interact_features(self, x, ly):
             (batch_size, d) = x.shape
             T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
             # perform a dot product
-            Z = torch.bmm(T, torch.transpose(T, 1, 2))
-            # append dense feature with the interactions (into a row vector)
-            # approach 1: all
-            # Zflat = Z.view((batch_size, -1))
-            # approach 2: unique
-            _, ni, nj = Z.shape
-            # approach 1: tril_indices
-            # offset = 0 if self.arch_interaction_itself else -1
-            # li, lj = torch.tril_indices(ni, nj, offset=offset)
-            # approach 2: custom
-            offset = 1 if self.arch_interaction_itself else 0
-            li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
-            lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])
-            Zflat = Z[:, li, lj]
-            # concatenate dense features and interactions
-            R = torch.cat([x] + [Zflat], dim=1)
+            if (self.proj_size > 0):
+                R = project.project(T, x, self.proj_l)
+                #TT = torch.transpose(T, 1, 2)
+                #TS = torch.reshape(TT, (-1, TT.size(2)))
+                #TC = self.apply_mlp(TS, self.proj_l)
+                #TR = torch.reshape(TC, (-1, d ,self.proj_size))
+                #Z  = torch.bmm(T, TR)
+                #Zflat = Z.view((batch_size, -1))
+                #R = torch.cat([x] + [Zflat], dim=1)
+            else:
+                Z = torch.bmm(T, torch.transpose(T, 1, 2))
+                # append dense feature with the interactions (into a row vector)
+                # approach 1: all
+                # Zflat = Z.view((batch_size, -1))
+                # approach 2: unique
+                _, ni, nj = Z.shape
+                # approach 1: tril_indices
+                # offset = 0 if self.arch_interaction_itself else -1
+                # li, lj = torch.tril_indices(ni, nj, offset=offset)
+                # approach 2: custom
+                offset = 1 if self.arch_interaction_itself else 0
+                li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
+                lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])
+                Zflat = Z[:, li, lj]
+                # concatenate dense features and interactions
+                R = torch.cat([x] + [Zflat], dim=1)
         elif self.arch_interaction_op == "cat":
             # concatenation features (into a row vector)
             R = torch.cat([x] + ly, dim=1)
@@ -417,14 +486,15 @@ def distributed_forward(self, dense_x, lS_o, lS_i):
             z = p
 
         ### gather the distributed results on each rank ###
-        # For some reason it requires explicit sync before all_gather call if 
+        # For some reason it requires explicit sync before all_gather call if
         # tensor is on GPU memory
         if z.is_cuda: torch.cuda.synchronize()
         (_, batch_split_lengths) = ext_dist.get_split_lengths(batch_size)
         z = ext_dist.all_gather(z, batch_split_lengths)
         #print("Z: %s" % z)
+
         return z
- 
+
     def parallel_forward(self, dense_x, lS_o, lS_i):
         ### prepare model (overwrite) ###
         # WARNING: # of devices must be >= batch size in parallel_forward call
@@ -534,6 +604,30 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         return z0
 
 
+def dash_separated_ints(value):
+    """argparse ``type`` hook: return ``value`` unchanged when every dash
+    separated token parses as an int, else raise ArgumentTypeError."""
+    try:
+        for token in value.split('-'):
+            int(token)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            "%s is not a valid dash separated list of ints" % value)
+    return value
+
+
+def dash_separated_floats(value):
+    """argparse ``type`` hook: return ``value`` unchanged when every dash
+    separated token parses as a float, else raise ArgumentTypeError."""
+    try:
+        for token in value.split('-'):
+            float(token)
+    except ValueError:
+        raise argparse.ArgumentTypeError(
+            "%s is not a valid dash separated list of floats" % value)
+    return value
+
+
 if __name__ == "__main__":
     ### import packages ###
     import sys
@@ -546,11 +640,18 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
     )
     # model related parameters
     parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
-    parser.add_argument("--arch-embedding-size", type=str, default="4-3-2")
+
+    parser.add_argument(
+        "--arch-embedding-size", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument("--arch-project-size", type=int, default=0)
+
     # j will be replaced with the table number
-    parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2")
-    parser.add_argument("--arch-mlp-top", type=str, default="4-2-1")
-    parser.add_argument("--arch-interaction-op", type=str, default="dot")
+    parser.add_argument(
+        "--arch-mlp-bot", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument(
+        "--arch-mlp-top", type=dash_separated_ints, default="4-2-1")
+    parser.add_argument(
+        "--arch-interaction-op", type=str, choices=['dot', 'cat'], default="dot")
     parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
     # embedding table options
     parser.add_argument("--md-flag", action="store_true", default=False)
@@ -564,7 +665,8 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
     # activations and loss
     parser.add_argument("--activation-function", type=str, default="relu")
     parser.add_argument("--loss-function", type=str, default="mse")  # or bce or wbce
-    parser.add_argument("--loss-weights", type=str, default="1.0-1.0")  # for wbce
+    parser.add_argument(
+        "--loss-weights", type=dash_separated_floats, default="1.0-1.0")  # for wbce
     parser.add_argument("--loss-threshold", type=float, default=0.0)  # 1.0e-7
     parser.add_argument("--round-targets", type=bool, default=False)
     # data
@@ -573,6 +675,15 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
     parser.add_argument(
         "--data-generation", type=str, default="random"
     )  # synthetic or dataset
+    parser.add_argument("--synthetic-data-folder", type=str,
+        default="./synthetic_data/syn_data_bs65536")
+    # add Gaussian distribution
+    parser.add_argument("--rand-data-dist", type=str, default="uniform")  # uniform or gaussian
+    parser.add_argument("--rand-data-min", type=float, default=0)
+    parser.add_argument("--rand-data-max", type=float, default=1)
+    parser.add_argument("--rand-data-mu", type=float, default=-1)
+    parser.add_argument("--rand-data-sigma", type=float, default=1)
+
     parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
     parser.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
     parser.add_argument("--raw-data-file", type=str, default="")
@@ -621,10 +732,22 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
     parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
     parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
     parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False)
+
+    # LR policy
+    parser.add_argument("--lr-num-warmup-steps", type=int, default=0)
+    parser.add_argument("--lr-decay-start-step", type=int, default=0)
+    parser.add_argument("--lr-num-decay-steps", type=int, default=0)
+
     args = parser.parse_args()
 
+    print(socket.gethostname())
+
     ext_dist.init_distributed(backend=args.dist_backend)
 
+    # print("success size= ", ext_dist.my_size, ext_dist.my_rank)
+
+    ext_dist.barrier()
+
     if args.mlperf_logging:
         print('command line args: ', json.dumps(vars(args)))
 
@@ -658,6 +781,7 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         else:
             device = torch.device("cuda", 0)
             ngpus = torch.cuda.device_count()  # 1
+            ngpus=1
         print("Using {} GPU(s)...".format(ngpus))
     else:
         device = torch.device("cpu")
@@ -682,11 +806,20 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
             )))
         m_den = train_data.m_den
         ln_bot[0] = m_den
+
+    elif args.data_generation == "synthetic":
+        # input and target at random
+        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
+        m_den = ln_bot[0]
+        train_data, train_ld = dd.data_loader(args, ln_emb, m_den)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+        table_feature_map = None #  {idx : idx for idx in range(len(ln_emb))}
+
     else:
         # input and target at random
         ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
         m_den = ln_bot[0]
-        train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den)
+        train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den)
         nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
 
     ### parse command line arguments ###
@@ -697,10 +830,13 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         # approach 1: all
         # num_int = num_fea * num_fea + m_den_out
         # approach 2: unique
-        if args.arch_interaction_itself:
-            num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
+        if (args.arch_project_size > 0):
+            num_int = num_fea * args.arch_project_size + m_den_out
         else:
-            num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
+            if args.arch_interaction_itself:
+                num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
+            else:
+                num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
     elif args.arch_interaction_op == "cat":
         num_int = num_fea * m_den_out
     else:
@@ -824,6 +960,7 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         ln_emb,
         ln_bot,
         ln_top,
+        args.arch_project_size,
         arch_interaction_op=args.arch_interaction_op,
         arch_interaction_itself=args.arch_interaction_itself,
         sigmoid_bot=-1,
@@ -852,15 +989,15 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
         dlrm = dlrm.to(device)  # .cuda()
         if dlrm.ndevices > 1:
             dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb)
-    
+
     if ext_dist.my_size > 1:
         if use_gpu:
             device_ids = [ext_dist.my_local_rank]
-            dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids)
-            dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids)
+            dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids)
+            dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids)
         else:
-            dlrm.bot_l = ext_dist.DDP(dlrm.bot_l)
-            dlrm.top_l = ext_dist.DDP(dlrm.top_l)
+            dlrm.bot_l = DDP(dlrm.bot_l)
+            dlrm.top_l = DDP(dlrm.top_l)
 
     # specify the loss function
     if args.loss_function == "mse":
@@ -875,8 +1012,11 @@ def parallel_forward(self, dense_x, lS_o, lS_i):
 
     if not args.inference_only:
         # specify the optimizer algorithm
+
         if ext_dist.my_size == 1:
             optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate)
+            #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step,
+            #                                 args.lr_num_decay_steps)
         else:
             optimizer = torch.optim.SGD([
                 {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr" : args.learning_rate},
@@ -997,8 +1137,12 @@ def loss_fn_wrap(Z, T, use_gpu, device):
         )
 
     ext_dist.barrier()
+    startTime = time.time()
+    startTime0 = startTime
+    skipped = 0
+
     print("time/loss/accuracy (if enabled):")
-    with torch.autograd.profiler.profile(args.enable_profiling, use_gpu) as prof:
+    with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof:
         while k < args.nepochs:
             if k < skip_upto_epoch:
                 continue
@@ -1009,9 +1153,18 @@ def loss_fn_wrap(Z, T, use_gpu, device):
                 previous_iteration_time = None
 
             for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
+                if j == 0 and args.save_onnx:
+                    (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i)
+
                 if j < skip_upto_batch:
                     continue
 
+                if (skipped == 2):
+                    ext_dist.barrier()
+                    startTime = time.time()
+                    ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank)
+                skipped = skipped + 1
+
                 if args.mlperf_logging:
                     current_time = time_wrap(use_gpu)
                     if previous_iteration_time:
@@ -1071,6 +1224,7 @@ def loss_fn_wrap(Z, T, use_gpu, device):
 
                     # optimizer
                     optimizer.step()
+                    ### lr_scheduler.step()
 
                 if args.mlperf_logging:
                     total_time += iteration_time
@@ -1105,11 +1259,14 @@ def loss_fn_wrap(Z, T, use_gpu, device):
                         "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format(
                             str_run_type, j + 1, nbatches, k, gT
                         )
-                        + "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100)
+                        + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL,
+                            gA * 100, total_iter, ext_dist.my_rank)
                     )
                     # Uncomment the line below to print out the total time with overhead
-                    # print("Accumulated time so far: {}" \
-                    # .format(time_wrap(use_gpu) - accum_time_begin))
+                    if ext_dist.my_rank < 2:
+                      tt1 = time_wrap(use_gpu)
+                      ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \
+                       .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1))
                     total_iter = 0
                     total_samp = 0
 
@@ -1297,16 +1454,42 @@ def loss_fn_wrap(Z, T, use_gpu, device):
                               + " reached, stop training")
                         break
 
+                #if (ext_dist.my_rank == 0 and should_print):
+                #    print("ITER : ", j, " from nvidia-smi")
+                #    os.system("nvidia-smi")
+
             k += 1  # nepochs
 
+    #if (ext_dist.my_rank == 0):
+    #    # print(torch.cuda.memory_allocated(0))
+    #    print(torch.cuda.memory_summary(0))
+    #    # print("from nvidia-smi")
+    #    os.system("nvidia-smi")
+
+    tt2 = time.time()
+    endTime = tt2 - startTime
+    ext_dist.barrier()
+    tt3 = time.time()
+    finalTime = tt3 - startTime
+    if (skipped > 2):
+        skipped -= 2
+    ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \
+        iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0,
+        finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True)
+
     file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank)
     # profiling
     if args.enable_profiling:
         os.makedirs(args.out_dir, exist_ok=True)
-        with open("%s.prof" % file_prefix, "w") as prof_f:
-            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
-            prof.export_chrome_trace("./%s.json" % file_prefix)
-        # print(prof.key_averages().table(sort_by="cpu_time_total"))
+        with open("TT"+str(uuid.uuid4().hex), "w") as prof_f:
+            prof_f.write(prof.key_averages(group_by_input_shape=True).table(
+                sort_by="self_cpu_time_total",
+            ))
+
+#        with open("%s.prof" % file_prefix, "w") as prof_f:
+#            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
+#            prof.export_chrome_trace("./%s.json" % file_prefix)
+#            print(prof.key_averages().table(sort_by="cpu_time_total"))
 
     # plot compute graph
     if args.plot_compute_graph:
@@ -1328,12 +1511,12 @@ def loss_fn_wrap(Z, T, use_gpu, device):
 
     # export the model in onnx
     if args.save_onnx:
-        os.makedirs(args.out_dir, exist_ok=True)
-        with open("%s.onnx" % file_prefix, "w+b") as dlrm_pytorch_onnx_file:
-            (X, lS_o, lS_i, _) = train_data[0]  # get first batch of elements
-            torch.onnx._export(
-                dlrm, (X, lS_o, lS_i), dlrm_pytorch_onnx_file, verbose=True
-            )
+        os.makedirs(args.out_dir, exist_ok=True)  # file_prefix lives under out_dir
+        dlrm_pytorch_onnx_file = "%s.onnx" % file_prefix  # same path onnx.load() reads below
+        torch.onnx.export(
+            dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True
+        )
+
         # recover the model back
         dlrm_pytorch_onnx = onnx.load("%s.onnx" % file_prefix)
         # check the onnx model
diff --git a/extend_distributed.py b/extend_distributed.py
index d7fd9dd1..e816654d 100644
--- a/extend_distributed.py
+++ b/extend_distributed.py
@@ -4,11 +4,15 @@
 from torch.autograd import Function
 from torch.nn.parallel import DistributedDataParallel as DDP
 import torch.distributed as dist
+
+import profile as tm
+
 try:
     import torch_ccl
 except ImportError as e:
     #print(e)
     torch_ccl = False
+import time
 
 my_rank = -1
 my_size = -1
@@ -39,6 +43,47 @@ def get_split_lengths(n):
         my_len = splits[my_rank]
     return (my_len, splits)
 
+def get_world_rank_from_env():
+    """Ask env2int for the world rank, passing -1 as the fallback value."""
+    rank_vars = [
+        "RANK", "PMI_RANK",
+        "OMPI_COMM_WORLD_RANK",
+        "MV2_COMM_WORLD_RANK",
+        "SLURM_PROCID",
+    ]
+    return env2int(rank_vars, -1)
+
+def get_world_size_from_env():
+    """Ask env2int for the world size, passing -1 as the fallback value."""
+    size_vars = [
+        "WORLD_SIZE", "PMI_SIZE",
+        "OMPI_COMM_WORLD_SIZE",
+        "MV2_COMM_WORLD_SIZE",
+        "SLURM_NPROCS",
+    ]
+    return env2int(size_vars, -1)
+
+def get_local_rank_from_env():
+    """Ask env2int for the local rank, passing -1 as the fallback value."""
+    local_rank_vars = [
+        "LOCAL_RANK",
+        "MPI_LOCALRANKID",
+        "OMPI_COMM_WORLD_LOCAL_RANK",
+        "MV2_COMM_WORLD_LOCAL_RANK",
+        "SLURM_LOCALID",
+    ]
+    return env2int(local_rank_vars, -1)
+
+def get_local_size_from_env():
+    """Ask env2int for the local size, passing -1 as the fallback value."""
+    local_size_vars = [
+        "LOCAL_SIZE",
+        "MPI_LOCALNRANKS",
+        "OMPI_COMM_WORLD_LOCAL_SIZE",
+        "MV2_COMM_WORLD_LOCAL_SIZE",
+    ]
+    return env2int(local_size_vars, -1)
+
 def init_distributed(rank = -1, size = -1, backend=''):
     global myreq
     global my_rank
@@ -62,30 +107,74 @@ def init_distributed(rank = -1, size = -1, backend=''):
     if backend != '':
         #guess Rank and size
         if rank == -1:
-            rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0)
+            rank = get_world_rank_from_env()
         if size == -1:
-            size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1)
+            size = get_world_size_from_env()
+        assert rank >= 0
+        assert size > 0
+
         if not os.environ.get('RANK', None) and rank != -1: os.environ['RANK'] = str(rank)
         if not os.environ.get('WORLD_SIZE', None) and size != -1: os.environ['WORLD_SIZE'] = str(size)
         if not os.environ.get('MASTER_PORT', None): os.environ['MASTER_PORT'] = '29500'
         if not os.environ.get('MASTER_ADDR', None):
-            local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
-            if local_size != size and backend != 'mpi':
-                print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default")
-                print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
-            os.environ['MASTER_ADDR'] = '127.0.0.1'
+            if "SLURM_NODELIST" in os.environ:
+                master_addr = os.environ["SLURM_NODELIST"].replace('-', ',').split(',')[0].replace("[", "")
+            elif "HOSTNAME" in os.environ:
+                # handle other cases ?
+                master_addr = os.environ["HOSTNAME"]
+            else:
+                master_addr = "127.0.0.1"
+            os.environ["MASTER_ADDR"] = master_addr
+
+#    myenv = os.environ
+#    for e in myenv:
+#      print(e, "=", myenv[e])
+#    print("=== Done ===")
 
     if size > 1:
+        my_local_rank = get_local_rank_from_env()
+        my_local_size = get_local_size_from_env()
+        if my_local_size == -1: 
+          if "SLURM_TASKS_PER_NODE" in os.environ:
+            locsize = os.environ["SLURM_TASKS_PER_NODE"].split("(")[0]
+            my_local_size = int(locsize)
+
+        assert(my_local_rank >= 0)
+        assert(my_local_size >= 0)
+        print("Check local rank ", my_local_rank, " size ", my_local_size, " global rank ", rank, " global size ", size,  os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"])
+
         dist.init_process_group(backend, rank=rank, world_size=size)
         my_rank = dist.get_rank()
         my_size = dist.get_world_size()
-        my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0)
-        my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1)
-        if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend))
+
+        if my_rank >= 0: print("Running on %d ranks using %s backend" % (my_size, backend))
         if hasattr(dist, 'all_to_all_single'):
             try:
-                dist.all_to_all_single(torch.empty([0]), torch.empty([0]))
+              a = torch.arange(my_size) + my_rank * my_size
+              b = torch.zeros(my_size).to(torch.int64)
+              c = torch.zeros(my_size).to(torch.int64)
+              for i in range(my_size):
+                c[i] = my_rank + i * my_size
+
+              t1 = time.time()
+              if torch.cuda.is_available():  # must be called: the bare function object is always truthy
+                dev = torch.device('cuda', my_local_rank)
+                a = a.to(dev)
+                b = b.to(dev)
+                c = c.to(dev)
+                dist.all_to_all_single(b, a)
+                if my_rank == 0:
+                    print("alltoall on rank :", my_rank, "a = ", a, " b = ", b)
+              else:
+                dist.all_to_all_single(b, a)
+              t2 = time.time()
+
+              if torch.equal(b, c):
                 alltoall_supported = True
+                if my_rank == 0:
+                    print("All to all single test passed for rank ", my_rank, " time ", t2 - t1)
+              else:
+                print("Failed alltoall single test! for rank= ", my_rank, " time ", t2 - t1)
             except RuntimeError:
                 pass
         if a2a_impl == 'alltoall' and alltoall_supported == False:
@@ -250,7 +339,8 @@ class All2All_Req(Function):
     @staticmethod
     def forward(ctx, a2ai, *inputs):
         global myreq
-        #print("All2All_Req:forward")
+        # print("All2All_Req:forward ", my_rank)
+        tm.tmA2A10.start()
         mb_split_lengths = a2ai.gNS
         if mb_split_lengths: mb_split_lengths = [m * a2ai.E for m in mb_split_lengths]
         emb_split_lengths = a2ai.gSS
@@ -267,12 +357,14 @@ def forward(ctx, a2ai, *inputs):
         a2ai.emb_split_lengths = emb_split_lengths
         myreq.a2ai = a2ai
         ctx.a2ai = a2ai
+        tm.tmA2A10.stop()
         return myreq.tensor
 
     @staticmethod
     def backward(ctx, *grad_output):
         global myreq
-        #print("All2All_Req:backward")
+        # print("All2All_Req:backward ", my_rank)
+        tm.tmA2A12.start()
         a2ai = ctx.a2ai
         myreq.req.wait()
         myreq.req = None
@@ -280,6 +372,7 @@ def backward(ctx, *grad_output):
         grad_inputs = grad_input.view([a2ai.N, -1]).split(a2ai.E, dim=1)
         grad_inputs = [gin.contiguous() for gin in grad_inputs]
         myreq.tensor = None
+        tm.tmA2A12.stop()
         return (None, *grad_inputs)
 
 
@@ -287,7 +380,8 @@ class All2All_Wait(Function):
     @staticmethod
     def forward(ctx, *output):
         global myreq
-        #print("All2All_Wait:forward")
+        # print("All2All_Wait:forward ", my_rank)
+        tm.tmA2A11.start()
         a2ai = myreq.a2ai
         ctx.a2ai = a2ai
         myreq.req.wait()
@@ -296,12 +390,15 @@ def forward(ctx, *output):
         emb_split_lengths = a2ai.emb_split_lengths if a2ai.emb_split_lengths else a2ai.lS * a2ai.lN * a2ai.E
         outputs = output[0].split(emb_split_lengths)
         outputs = tuple([out.view([a2ai.lN, -1]) for out in outputs])
+        tm.tmA2A11.stop()
+        # print("All2All_Wait:forward done")
         return outputs
 
     @staticmethod
     def backward(ctx, *grad_outputs):
         global myreq
-        #print("All2All_Wait:backward")
+        # print("All2All_Wait:backward ", my_rank)
+        tm.tmA2A13.start()
         a2ai = ctx.a2ai
         grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs]
         grad_output = torch.cat(grad_outputs)
@@ -309,6 +406,8 @@ def backward(ctx, *grad_outputs):
         req = dist.all_to_all_single(grad_input, grad_output, a2ai.mb_split_lengths, a2ai.emb_split_lengths, async_op=True)
         myreq.req = req
         myreq.tensor = grad_input
+        tm.tmA2A13.stop()
+        # print("All2All_Wait:backward done")
         return (grad_output,)
 
 class AllGather(Function):
@@ -374,7 +473,7 @@ def alltoall(inputs, per_rank_split_lengths):
     a2ai.S = sum(per_rank_split_lengths) if per_rank_split_lengths else a2ai.lS * my_size
 
     if a2a_impl == '' and alltoall_supported or a2a_impl == 'alltoall':
-        #print("Using All2All_Req")
+        print("Using All2All_Req")
         output = All2All_Req.apply(a2ai, *inputs)
         myreq.WaitFunction = All2All_Wait
     elif a2a_impl == '' or a2a_impl == 'scatter':
diff --git a/job.all.sh b/job.all.sh
new file mode 100644
index 00000000..cf56aaca
--- /dev/null
+++ b/job.all.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+#SBATCH --job-name=testdlrm   #The name you want the job to have
+#SBATCH --output=/private/home/hongzhang/tmp/dlrm/output-%j
+#SBATCH --error=/private/home/hongzhang/tmp/dlrm/error-%j
+#SBATCH --nodes=1 # -C volta32gb    #The number of compute nodes to use
+#SBATCH --ntasks=8     #The total number of cpu tasks to run
+#SBATCH --time=00:40:00  # max time
+#SBATCH --exclusive       # exclusive nodes
+#SBATCH --gres=gpu:volta:8 -C volta32gb
+#SBATCH --mem-per-cpu=60GB
+
+# for mpirun host file (quoted: an unquoted bracketed node list is a glob pattern)
+echo "$SLURM_NODELIST"
+echo "$SLURM_NODELIST" > hostfile1
+
+source /private/home/hongzhang/.zshrc
+#module purge
+#module load anaconda3/2019.07
+#module load cuda/10.1
+#module load cudnn/v7.6.5.32-cuda.10.1
+#module load openmpi/4.0.2/gcc.7.4.0-cuda.10.1
+
+#export NCCL_ROOT_DIR=/private/home/hongzhang/codes/nccl/build
+#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NCCL_ROOT_DIR/lib
+#export CUDA_PATH=$CUDA_HOME
+#export CUDNN_PATH=$CUDNN_ROOT_DIR
+#export MPI_PATH=$MPI_HOME
+#export NCCL_PATH=$NCCL_ROOT_DIR
+
+conda activate mytorch
+
+which python3
+
+# large_arch_emb="2600-2600-2600-2600-2600-2600-2600-2600"
+# large_arch_emb="26000000-26000000-26000000-26000000-26000000-26000000-26000000-26000000"
+large_arch_emb_usr=$(printf '260%.0s' {1..815})  # "260" repeated 815 times, no separators yet
+large_arch_emb_usr=${large_arch_emb_usr//"02"/"0-2"}  # put "-" at every "02" seam: 260-260-...-260
+large_arch_emb_ads=$(printf '140%.0s' {1..544})  # "140" repeated 544 times, no separators yet
+large_arch_emb_ads=${large_arch_emb_ads//"01"/"0-1"}  # put "-" at every "01" seam: 140-140-...-140
+large_arch_emb="$large_arch_emb_usr-$large_arch_emb_ads"  # dash separated list given to --arch-embedding-size below
+
+# --hostfile hostfile1
+# random
+# /public/apps/openmpi/4.0.2/gcc.7.4.0/bin/mpirun -prefix /public/apps/openmpi/4.0.2/gcc.7.4.0/ -v -np 8 python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 --arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=random --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=30 --num-batches=4 --arch-project-size=30
+
+# fb_synthetic
+/public/apps/openmpi/4.0.2/gcc.7.4.0/bin/mpirun -prefix /public/apps/openmpi/4.0.2/gcc.7.4.0/ -v -np 8 python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 --arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=synthetic --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=28 --num-batches=4 --arch-project-size=30
+
+# srun --label /private/home/hongzhang/.conda/envs/mytorch/bin/python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 --arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=random --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=30 --num-batches=4
diff --git a/profile.py b/profile.py
new file mode 100644
index 00000000..bb29033a
--- /dev/null
+++ b/profile.py
@@ -0,0 +1,139 @@
+# Adds some self-profiling information.
+# Allows nested timers to exist.
+
+import time
+
+class TimerError(Exception):
+    """Raised by ProfTimer.stop() when the timer is not running."""
+
+class ProfTimer:
+    """Accumulating wall-clock timer: sums the start()..stop() intervals."""
+    def __init__(self, timername="Timer for DLRM Activity"):
+        self._name = timername
+        self._start = None  # perf_counter() reading at start(); None when idle
+        self._count = 0  # completed start/stop intervals
+        self._elapsed = 0.0  # seconds summed over those intervals
+
+    def start(self):
+        """Begin a new interval (overwrites any start time already pending)."""
+        self._start = time.perf_counter()
+
+    def stop(self):
+        """End the running interval; raise TimerError if none is running."""
+        if self._start is None:
+            raise TimerError("Timer is not running.")
+        self._elapsed += time.perf_counter() - self._start
+        self._count += 1
+        self._start = None
+
+    def count(self):  # completed intervals since construction or reset()
+        return self._count
+
+    def reset(self):  # zero the totals; a pending start() is left untouched
+        self._elapsed = 0.0
+        self._count = 0
+
+    def elapsed(self):  # accumulated seconds
+        return self._elapsed
+
+    def output(self, level=0):  # level 0 prints flush left, any other level indents 4 spaces
+        indent = "" if level == 0 else "    "
+        print(f"{indent}{self._name}: {self._elapsed:0.6f} seconds with counts {self._count}")
+   
+alltimers = []  # NOTE(review): never populated or read anywhere in this module
+tmGetData = ProfTimer("GetData")
+tmFwd     = ProfTimer("Forward")
+tmLoss    = ProfTimer("Loss   ")
+tmZero    = ProfTimer("Zero   ")
+tmBwd     = ProfTimer("Backwrd")
+tmOpt     = ProfTimer("Opt    ")
+tmSync    = ProfTimer("CudaSyn")
+tmSync1   = ProfTimer("CudaSy1")
+tmSync2   = ProfTimer("CudaSy2")
+tmSync3   = ProfTimer("CudaSy3")
+
+tmH2D     = ProfTimer("CopyH2D")
+tmEmb     = ProfTimer("EMB    ")
+tmA2A     = ProfTimer("All2All")
+tmA2A1    = ProfTimer("All2All1")
+tmBot     = ProfTimer("Bottom ")
+tmInt     = ProfTimer("Inter  ")
+tmTop     = ProfTimer("Top MLP")
+tmAllGa   = ProfTimer("Allgath")
+
+tmA2A10    = ProfTimer("All2All10")
+tmA2A11    = ProfTimer("All2All11")
+tmA2A12    = ProfTimer("All2All12")
+tmA2A13    = ProfTimer("All2All13")
+
+def tmClear():
+    """Reset every module-level timer (elapsed time and count) to zero.
+
+    Each timer's reset() is called once, group by group, in the order
+    the groups are listed below.
+    """
+    # top-level phases and the CudaSy* timers
+    phase_timers = (
+        tmGetData, tmFwd, tmLoss, tmZero, tmBwd, tmOpt,
+        tmSync, tmSync1, tmSync2, tmSync3,
+    )
+
+    # timers that tmSummary prints at level 1 right after tmFwd
+    stage_timers = (
+        tmH2D, tmEmb, tmA2A, tmA2A1,
+        tmBot, tmInt, tmTop, tmAllGa,
+    )
+
+    # timers started/stopped inside All2All_Req and All2All_Wait
+    a2a_timers = (
+        tmA2A10, tmA2A11, tmA2A12, tmA2A13,
+    )
+
+    for group in (phase_timers, stage_timers, a2a_timers):
+        for timer in group:
+            timer.reset()
+
+def tmSummary(pid):
+    """Print the accumulated totals of the module-level timers.
+
+    The report is framed by a header and a footer line that both embed
+    ``pid``; every row is printed through ProfTimer.output(level).
+    """
+    print("Summary of the tm timers:")
+    print("---------{:6d}----------------".format(pid))
+
+    # (level, timers) rows, in print order
+    report = (
+        (0, (tmGetData, tmFwd)),
+        # level-1 rows listed right after the forward timer
+        (1, (tmH2D, tmEmb, tmA2A, tmA2A1)),
+        (1, (tmBot, tmInt, tmTop, tmAllGa)),
+        # remaining level-0 rows
+        (0, (tmLoss, tmZero, tmBwd, tmOpt)),
+        # tmSync is not part of the report
+        (0, (tmSync1, tmSync2, tmSync3)),
+        # timers driven from All2All_Req / All2All_Wait
+        (1, (tmA2A10, tmA2A11, tmA2A12, tmA2A13)),
+    )
+
+    for level, timers in report:
+        for timer in timers:
+            timer.output(level)
+
+    print("========={:6d}================".format(pid))
+
+if __name__ == "__main__":
+    # Smoke test: time two sleeps with one timer; the totals accumulate.
+    t1 = ProfTimer("Test1")
+    t1.start()
+    time.sleep(3)
+    t1.stop()
+    t1.elapsed()
+    t1.output(0)  # pass the indent level explicitly
+
+    t1.start()
+    time.sleep(5)
+    t1.stop()
+    t1.output(0)
+
+
diff --git a/project.py b/project.py
new file mode 100644
index 00000000..b83fa85e
--- /dev/null
+++ b/project.py
@@ -0,0 +1,67 @@
+
+# This feature can be used to reduce the memory consumed by the feature layer of the top MLP.
+# Suppose we have n sparse features and each sparse feature is represented by an embedding of size d;
+# then we can represent the sparse embeddings by a matrix X of shape (n, d). The dot products between
+# sparse features are X(X^T), a symmetric (n, n) matrix that will be fed into the top MLP.
+# Actually, we only need the upper or lower triangle to eliminate duplication. If n is large,
+# e.g. n = 1000, then the number of dot features fed into the MLP will be n^2/2 = 500,000.
+# With a layer size of 4096, the weight parameters form a matrix of shape (n^2/2, 4096), which
+# may consume a large amount of precious memory resources.
+
+# To reduce the number of dot features, we introduce a parameter called arch-project-size (k) to compress
+# the embeddings. We introduce a parameter matrix Y of shape (n, k) to compute weighted sums of the
+# dot features. The compressed embeddings are represented by (X^T)Y, of shape (d, k). Then we compute the
+# compressed dot features as X(X^T)Y, of shape (n, k). Therefore, we reduce the dot features fed into
+# the MLP from n*n/2 to n*k.
+
+import sys
+import torch
+import torch.nn as nn
+import numpy as np
+
+"""
+Compute the projected dot features
+T: (batch_size, n, d), batched raw embeddings
+x: dense features
+proj_layer: the projection layer created by create_proj
+"""
+def project(T, x, proj_layer):
+  """Concatenate x with the flattened products T @ proj_layer(T^T).
+
+  T is (batch_size, n, d); x must be 2-D with batch_size rows because it
+  is joined to the flattened products along dim 1.
+  """
+  # swap the last two axes, then let proj_layer act on the size-n axis
+  compressed = proj_layer(T.transpose(1, 2))
+  # batched product, reshaped to one row per batch entry
+  dots = torch.bmm(T, compressed)
+  flat_dots = dots.view((T.shape[0], -1))
+  return torch.cat([x, flat_dots], dim=1)
+
+"""
+Create the project layer
+n: number of sparse features
+m: projection size
+"""
+def create_proj(n, m):
+  """Build the projection layer consumed by project().
+
+  n: number of sparse features (input width of the linear layer).
+  m: projection size (output width of the linear layer).
+  Returns an nn.Sequential holding a single nn.Linear(n, m) with bias.
+  """
+  n, m = int(n), int(m)
+  linear = nn.Linear(n, m, bias=True)
+  # Xavier-style two-sided fill for the weights, 1/m variance for the bias;
+  # drawn from NumPy's global RNG so np.random.seed() controls the result.
+  weight_std = np.sqrt(2 / (m + n))
+  bias_std = np.sqrt(1 / m)
+  W = np.random.normal(0.0, weight_std, size=(m, n)).astype(np.float32)
+  bt = np.random.normal(0.0, bias_std, size=m).astype(np.float32)
+  # Copy in place under no_grad instead of rebinding .data, so the layer
+  # keeps its own Parameter storage and autograd state.
+  with torch.no_grad():
+    linear.weight.copy_(torch.from_numpy(W))
+    linear.bias.copy_(torch.from_numpy(bt))
+  return nn.Sequential(linear)
+
diff --git a/requirements.txt b/requirements.txt
index c5cad56a..b198a127 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,3 +5,4 @@ pydot
 torch
 torchviz
 scikit-learn
+tqdm
diff --git a/tools/visualize.py b/tools/visualize.py
new file mode 100755
index 00000000..f16504cb
--- /dev/null
+++ b/tools/visualize.py
@@ -0,0 +1,1030 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+#
+# This script performs the visualization of the embedding tables created in
+# DLRM during the training procedure. We use two popular techniques for
+# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and
+# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html).
+# These links also provide instructions on how to install these packages
+# in different environments.
+#
+# Warning: the size of the data to be visualized depends on the RAM on your machine.
+#
+#
+# Command line examples:
+#
+# Full analysis of embeddings and data representations for Criteo Kaggle data:
+# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \
+#         --raw-data-file=../../criteo/input/train.txt --skip-categorical-analysis \
+#         --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz
+#
+#
+# To run just the analysis of categorical data for the Criteo Kaggle data set:
+# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \
+#         --raw-data-file=../../criteo/input/train.txt --data-randomize=none --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz \
+#         --skip-embedding --skip-data-plots
+#
+#
+# The following command line arguments are available to the user:
+#
+#    --load-model                   - DLRM model file
+#    --data-set                     - one of ["kaggle", "terabyte"]
+#    --max-ind-range                - max index range used during the training
+#    --output-dir                   - output directory, if not specified, it will be created from the model and dataset names
+#    --max-umap-size                - max number of points to visualize using UMAP, default=50000
+#    --use-tsne                     - use T-SNE
+#    --max-tsne-size                - max number of points to visualize using T-SNE, default=1000
+#    --skip-embedding               - skips analysis of embedding tables
+#    --umap-metric                  - metric for UMAP 
+#    --skip-data-plots              - skips data plots
+#    --skip-categorical-analysis    - skips categorical analysis
+# 
+#    # data file related
+#    --raw-data-file
+#    --processed-data-file
+#    --data-sub-sample-rate
+#    --data-randomize
+#    --memory-map
+#    --mini-batch-size
+#    --num-workers
+#    --test-mini-batch-size
+#    --test-num-workers
+#    --num-batches    
+#    --mlperf-logging
+
+import os
+import sys
+import argparse
+import numpy as np
+import umap
+import hdbscan
+import json
+import torch
+import math
+import matplotlib
+import matplotlib.pyplot as plt
+import collections
+
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import f1_score
+from sklearn.metrics import precision_score
+from sklearn.metrics import recall_score
+
+from sklearn import manifold
+
+import dlrm_data_pytorch as dp
+from dlrm_s_pytorch import DLRM_Net
+
+
+def visualize_embeddings_umap(emb_l,
+                              output_dir    = "",
+                              max_size      = 500000,
+                              umap_metric   = "euclidean",
+                              cat_counts    = None,
+                              use_max_count = True):
+    """Save a norm histogram and a 2-D UMAP scatter plot per embedding table.
+
+    emb_l:         sequence of embedding modules; each ``.weight`` is read.
+    output_dir:    directory prefix for the PNG files that are written.
+    max_size:      upper bound on the number of rows passed to UMAP.
+    umap_metric:   metric forwarded to ``umap.UMAP``.
+    cat_counts:    optional per-table occurrence counts, indexed like the
+                   table rows; used to pick rows and to colour the points.
+    use_max_count: when a table has more than ``max_size`` rows, keep the
+                   rows whose count exceeds a threshold instead of the
+                   first ``max_size`` rows (only possible with cat_counts).
+    """
+    for k in range(0, len(emb_l)):
+        E = emb_l[k].weight.detach().cpu().numpy()
+        print("umap", E.shape)
+        counts = None if cat_counts is None else np.asarray(cat_counts[k], dtype=float)
+
+        # histogram of row norms on logarithmically spaced bins
+        norms = np.linalg.norm(E, ord=2, axis=1)
+        _, edges = np.histogram(norms, bins=50)
+        # log10(0) is -inf, so the first bin starts at the smallest positive norm
+        positive = norms[norms > 0]
+        low_edge = positive.min() if positive.size else edges[-1]
+        logbins = np.logspace(np.log10(low_edge), np.log10(edges[-1]), len(edges))
+        cardinality = E.shape[0] if counts is None else len(counts)
+
+        plt.figure(figsize=(8,8))
+        plt.title("Categorical norms: " + str(k) + " cardinality " + str(cardinality))
+        plt.hist(norms, bins=logbins)
+        plt.xscale("log")
+        plt.savefig(output_dir+"/cat-norm-histogram-"+str(k)+".png")
+        plt.close()
+
+        if E.shape[0] < 20:
+            print("Skipping small embedding")
+            continue
+
+        n_vis = min(max_size, E.shape[0])
+        min_cnt = 0
+        reducer = umap.UMAP(random_state=42, metric=umap_metric)
+
+        if counts is None or not use_max_count or n_vis == E.shape[0]:
+            # no count-based selection possible or needed: take the leading rows
+            rows = np.arange(n_vis)
+        else:
+            # raise the count threshold until at most max_size rows remain
+            min_cnt = 1
+            while (counts > min_cnt).sum() > max_size:
+                min_cnt = min_cnt+1
+            rows = np.flatnonzero(counts[:E.shape[0]] > min_cnt)
+            print("max_count_len", len(rows), "mincount", min_cnt)
+            if len(rows) == 0:
+                print("Skipping embedding without rows above mincount")
+                continue
+
+        Y = reducer.fit_transform(E[rows])
+        n_vis = len(rows)
+
+        plt.figure(figsize=(8,8))
+        linewidth = 0
+        size      = 1
+        if Y.shape[0] < 2500:
+            linewidth = 1
+            size      = 5
+
+        if counts is None:
+            plt.scatter(-Y[:,0], -Y[:,1], s=size, marker=".", linewidth=linewidth)
+        else:
+            # colour every point by the log-count of the row it was built from
+            has_count = rows < len(counts)
+            top = np.log1p(counts.max()) if counts.size else 0.0
+            colour = np.log1p(counts[rows[has_count]]) / (top if top > 0 else 1.0)
+            plt.scatter(-Y[has_count,0], -Y[has_count,1], s=size, marker=".", linewidth=linewidth, c=colour, cmap="viridis")
+            plt.colorbar()
+
+        plt.title("UMAP: categorical var. " + str(k) + "  (" + str(n_vis) + " of " + str(E.shape[0]) + ", min count " + str(min_cnt) + ")")
+        plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-umap.png")
+        plt.close()
+
+
+def visualize_embeddings_tsne(emb_l,
+                              output_dir = "",
+                              max_size   = 10000):
+    """Write one exact-method t-SNE scatter plot (PNG) per embedding table.
+
+    emb_l:      sequence of embedding modules; each ``.weight`` is read.
+    output_dir: directory prefix for the saved figures.
+    max_size:   only the first ``max_size`` rows of a table are embedded.
+
+    Tables with fewer than 20 rows are skipped.
+    """
+    for k in range(len(emb_l)):
+        E = emb_l[k].weight.detach().cpu()
+        n_rows = E.shape[0]
+        print("tsne", E.shape)
+        if n_rows < 20:
+            print("Skipping small embedding")
+            continue
+
+        n_vis = min(max_size, n_rows)
+        Y = manifold.TSNE(init="pca", random_state=0, method="exact").fit_transform(E[:n_vis,:])
+
+        # non-zero marker line width only for small point sets
+        edge_width = 1 if Y.shape[0] < 5000 else 0
+        shown = str(n_vis) + " of " + str(n_rows)
+        plt.figure(figsize=(8, 8))
+        plt.scatter(-Y[:,0], -Y[:,1], s=1, marker=".", linewidth=edge_width)
+        plt.title("TSNE: categorical var. " + str(k) + "  (" + shown + ")")
+        plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(n_rows) + "-tsne.png")
+        plt.close()
+
+
+def analyse_categorical_data(X_cat, n_days=10, output_dir=""):
+    """Plot, per categorical column, how its value sets split along the rows.
+
+    The rows of X_cat are cut at each of the n_days-1 interior boundaries
+    of n_days equal-sized chunks.  For every cut the function records the
+    number of distinct values before the cut, after it, in both parts, and
+    only before it, prints those numbers and saves one line plot per column
+    to output_dir as cat-NNN.png.
+
+    X_cat:      2-D array-like, one row per sample, one column per variable.
+    n_days:     number of equal chunks the rows are divided into.
+    output_dir: directory prefix for the saved figures.
+    """
+    n_vec = len(X_cat)
+    n_cat = len(X_cat[0])
+    print("n_vec", n_vec, "n_cat", n_cat)
+
+    all_cat = np.array(X_cat)
+    print("all_cat.shape", all_cat.shape)
+    day_size = all_cat.shape[0]/n_days
+    split_days = list(range(1, n_days))
+
+    for col in range(n_cat):
+        cat = all_cat[:,col]
+        print("cat", col, cat.shape)
+
+        n_before = []
+        n_after  = []
+        n_common = []
+        n_gone   = []
+        for d in split_days:
+            cut = int(d*day_size)
+            before = set(cat[:cut])
+            after  = set(cat[cut:])
+            common = len(before & after)
+            only_before = len(before) - common
+            n_before.append(len(before))
+            n_after.append(len(after))
+            n_common.append(common)
+            n_gone.append(only_before)
+            print(d, ",", len(before), ",", len(after), ",", common, ",", only_before)
+
+        print("spit",    split_days)
+        print("before",  n_before)
+        print("after",   n_after)
+        print("inters.", n_common)
+        print("removed", n_gone)
+
+        plt.figure(figsize=(8,8))
+        for series, fmt, name in ((n_before, "g", "before"),
+                                  (n_after,  "r", "after"),
+                                  (n_common, "b", "intersect"),
+                                  (n_gone,   "y", "removed")):
+            plt.plot(split_days, series, fmt, label=name)
+        plt.title("categorical var. "+str(col))
+        plt.legend()
+        plt.savefig(output_dir+"/cat-"+str(col).zfill(3)+".png")
+        plt.close()
+
+
+def analyse_categorical_counts(X_cat, emb_l=None, output_dir=""):
+    """Count how often every category index occurs in each column of X_cat.
+
+    X_cat:      2-D array-like of non-negative integer category indices,
+                one row per sample and one column per categorical variable.
+    emb_l:      optional sequence of embedding modules; when given, column i
+                gets at least as many count slots as emb_l[i].weight has
+                rows and the row norms are plotted under the counts.
+    output_dir: directory prefix for the cat_counts-NNN.png figures.
+
+    Returns a list with one float count array per column; entry j of an
+    array is the number of rows whose value in that column equals j.
+    """
+    n_vec = len(X_cat)
+    n_cat = len(X_cat[0])
+    print("n_vec", n_vec, "n_cat", n_cat)
+
+    all_cat = np.array(X_cat)
+    print("all_cat.shape", all_cat.shape)
+    all_counts = []
+
+    for i in range(0,n_cat):
+        cat = all_cat[:,i]
+        if emb_l is None:
+            s = len(set(cat))
+        else:
+            s = emb_l[i].weight.detach().cpu().shape[0]
+        print("cat", i, cat.shape, s)
+
+        # One vectorised pass instead of a per-row Python loop.  minlength
+        # keeps at least s slots; an index >= s extends the array rather than
+        # raising, which matters when ids are not the dense range 0..s-1.
+        counts = np.bincount(cat.astype(np.int64), minlength=s).astype(np.float64)
+        all_counts.append(counts)
+
+        if emb_l is None:
+            plt.figure(figsize=(8,8))
+            plt.plot(counts)
+            plt.title("Categorical var "+str(i) + " cardinality " + str(len(counts)))
+        else:
+            E = emb_l[i].weight.detach().cpu().numpy()
+            norms = np.linalg.norm(E, ord=2, axis=1)
+
+            fig, (ax0, ax1) = plt.subplots(2, 1)
+            fig.suptitle("Categorical variable: " + str(i)+" cardinality "+str(len(counts)))
+
+            ax0.plot(counts)
+            ax0.set_yscale("log")
+            ax0.set_title("Counts", fontsize=10)
+            ax1.plot(norms)
+            ax1.set_title("Norms", fontsize=10)
+
+        plt.savefig(output_dir+"/cat_counts-"+str(i).zfill(3)+".png")
+        plt.close()
+
+    return all_counts
+    
+
+def dlrm_output_wrap(dlrm, X, lS_o, lS_i, T):
+    """Run one batch through dlrm stage by stage and collect the intermediates.
+
+    Only the first sample of the batch (index 0) contributes to the feature
+    vectors, the target and the predicted class.
+
+    dlrm: model exposing apply_mlp, apply_emb, interact_features, bot_l,
+          emb_l, top_l and loss_threshold.
+    X:    dense input handed to the bottom MLP.
+    lS_o: first sparse argument of apply_emb (offsets, judging by the name).
+    lS_i: second sparse argument of apply_emb (indices, judging by the name).
+    T:    target tensor; element [0, 0] is converted to int.
+
+    Returns a 7-tuple:
+      all_feat_vec - bottom-MLP output followed by every embedding output,
+                     concatenated along axis 0
+      x_vec        - bottom-MLP output alone
+      all_cat_vec  - the embedding outputs alone, concatenated along axis 0
+      t_out        - integer target
+      c_out        - 0 when the predicted class equals the target, else 1
+      z_out        - list of flattened activations: the feature interaction
+                     first, then the output of every top-MLP layer in order
+      p_out        - predicted class, 0 or 1
+
+    When dlrm.loss_threshold lies strictly between 0 and 1 the final output
+    is clamped to [threshold, 1 - threshold] before the class is derived.
+    """
+
+    def _row0(tensor):
+        # first batch entry as a NumPy array
+        return tensor[0].detach().cpu().numpy()
+
+    def _flat(tensor):
+        # whole tensor as a flattened NumPy array
+        return tensor.detach().cpu().numpy().flatten()
+
+    # dense branch: bottom MLP
+    x = dlrm.apply_mlp(X, dlrm.bot_l)
+    x_vec = _row0(x)
+
+    # process sparse features (using embeddings), one output per table
+    ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l)
+
+    feat_parts = [x_vec]
+    cat_parts  = []
+    for emb_out in ly:
+        feat_parts.append(_row0(emb_out))
+        cat_parts.append(_row0(emb_out))
+
+    all_feat_vec = np.concatenate(feat_parts, axis=0)
+    all_cat_vec  = np.concatenate(cat_parts, axis=0)
+
+    t_out = int(T.detach().cpu().numpy()[0,0])
+
+    # feature interaction, then the top MLP one layer at a time so that each
+    # intermediate activation can be recorded
+    z = dlrm.interact_features(x, ly)
+    z_out = [_flat(z)]
+    for layer_idx in range(0, len(dlrm.top_l)):
+        z = dlrm.top_l[layer_idx](z)
+        z_out.append(_flat(z))
+
+    # clamp output if needed
+    p = z
+    if 0.0 < dlrm.loss_threshold < 1.0:
+        lo = dlrm.loss_threshold
+        hi = 1.0 - dlrm.loss_threshold
+        p = torch.clamp(p, min=lo, max=hi)
+
+    # turn the first output into a class: int(value + 0.5), limited to 0..1
+    class_thresh = 0.0 #-0.25
+    zp = p.detach().cpu().numpy()[0,0]+ class_thresh
+    p_out = int(zp+0.5)
+    p_out = min(max(p_out, 0), 1)
+
+    # c_out flags a wrong prediction
+    if p_out == t_out:
+        c_out = 0
+    else:
+        c_out = 1
+
+    return (all_feat_vec, x_vec, all_cat_vec,
+            t_out, c_out, z_out, p_out)
+
+
+def create_umap_data(dlrm, data_ld, max_size=50000, offset=0,  info=""):
+    """Collect per-batch model outputs from data_ld for later UMAP fitting.
+
+    Batches with index below offset are skipped and at most max_size batches
+    after that are passed to dlrm_output_wrap.  Accuracy, F1, precision and
+    recall of the collected predictions are printed, prefixed with info.
+
+    Returns (all_features, all_X, all_cat, all_T, all_z, all_c, all_pred):
+    one entry per processed batch in every list except all_z, which holds
+    len(dlrm.top_l) lists (the feature interaction and the outputs of all
+    top layers but the last), each with one entry per processed batch.
+    """
+    z_size = len(dlrm.top_l)
+    print("z_size", z_size)
+
+    all_features, all_X, all_cat = [], [], []
+    all_T, all_c, all_pred       = [], [], []
+    all_z = [[] for _ in range(z_size)]
+
+    # walk the loader, keeping batches offset .. offset+max_size-1
+    for j, (X, lS_o, lS_i, T) in enumerate(data_ld):
+        if j < offset:
+            continue
+        if j >= max_size+offset:
+            break
+
+        af, x, cat, t, c, z, p = dlrm_output_wrap(dlrm, X, lS_o, lS_i, T)
+        for store, value in ((all_features, af), (all_X, x), (all_cat, cat),
+                             (all_T, t), (all_c, c), (all_pred, p)):
+            store.append(value)
+        # z carries one more entry than all_z has slots; zip drops the last
+        for layer_store, activation in zip(all_z, z):
+            layer_store.append(activation)
+
+    # calculate classifier metrics
+    metrics = (
+        "accuracy",  accuracy_score(all_T, all_pred),
+        "f1",        f1_score(all_T, all_pred),
+        "precision", precision_score(all_T, all_pred),
+        "recall",    recall_score(all_T, all_pred),
+    )
+    print(info, *metrics)
+
+    return all_features, all_X, all_cat, all_T, all_z, all_c, all_pred
+
+
+def plot_all_data_3(umap_Y,
+                    umap_T,
+                    train_Y          = None,
+                    train_T          = None,
+                    test_Y           = None,
+                    test_T           = None,
+                    total_train_size = "",
+                    total_test_size  = "",
+                    info             = "",
+                    output_dir       = "",
+                    orig_space_dim   = 0):
+    """Save a 1x3 figure of 2-D points coloured red/green by a binary label.
+
+    Panels, left to right: UMAP-fit, train and test subsets; the train and
+    test panels stay empty unless both their points and labels are given.
+    """
+    cmap = matplotlib.colors.ListedColormap(["red","green"])
+    fig, (ax0, ax1, ax2) = plt.subplots(1, 3)
+    fig.suptitle("UMAP: " + info + " space dim "+str(orig_space_dim))
+
+    def draw(ax, name, pts, labels, total):
+        ax.scatter(pts[:,0], pts[:,1], s=1, c=labels, cmap=cmap, marker=".", linewidth=0)
+        ax.set_title(name+" ("+str(len(labels))+" of "+ total+")", fontsize=7)
+    draw(ax0, "UMAP", umap_Y, umap_T, total_train_size)
+    if not (train_Y is None or train_T is None):
+        draw(ax1, "Train", train_Y, train_T, total_train_size)
+    if not (test_Y is None or test_T is None):
+        draw(ax2, "Test", test_Y, test_T, total_test_size)
+
+    plt.savefig(output_dir+"/"+info+"-umap.png")
+    plt.close()
+
+
+def plot_one_class_3(umap_Y,
+                     umap_T,
+                     train_Y,
+                     train_T,
+                     test_Y,
+                     test_T,
+                     target           = 0,
+                     col              = "red",
+                     total_train_size = "",
+                     total_test_size  = "",
+                     info             = "",
+                     output_dir       = "",
+                     orig_space_dim   = 0):
+    """Save a 1x3 figure showing only the points whose label equals target.
+
+    Panels, left to right: UMAP-fit, train and test subsets, all drawn in
+    the single colour col.  The train and test panels stay empty unless
+    both their points and labels are given.  A subset that contains no
+    point of the requested class produces an empty panel.
+
+    total_train_size / total_test_size: strings shown in the panel titles.
+    The figure is written to output_dir/<info>-umap.png.
+    """
+    fig, (ax0, ax1, ax2) = plt.subplots(1, 3)
+    fig.suptitle("UMAP: "+ info + " space dim "+str(orig_space_dim))
+
+    def draw(ax, name, pts, labels, total):
+        # A boolean mask keeps the array 2-D even when nothing is selected,
+        # whereas an array built from an empty list of rows would be 1-D.
+        chosen = np.asarray(pts)[np.asarray(labels) == target]
+        ax.scatter(chosen[:,0], chosen[:,1], s=1, c=col, marker=".", linewidth=0)
+        ax.set_title(name+", ("+str(len(labels))+" of "+ total+")", fontsize=7)
+
+    draw(ax0, "UMAP", umap_Y, umap_T, total_train_size)
+    if train_Y is not None and train_T is not None:
+        draw(ax1, "Train", train_Y, train_T, total_train_size)
+    if test_Y is not None and test_T is not None:
+        draw(ax2, "Test", test_Y, test_T, total_test_size)
+
+    plt.savefig(output_dir+"/"+info+"-umap.png")
+    plt.close()
+
+
+def visualize_umap_data(umap_Y,
+                        umap_T,
+                        umap_C,
+                        umap_P,
+                        train_Y, 
+                        train_T, 
+                        train_C,
+                        train_P,
+                        test_Y           = None,
+                        test_T           = None, 
+                        test_C           = None,
+                        test_P           = None,
+                        total_train_size = "", 
+                        total_test_size  = "",  
+                        info             = "",
+                        output_dir       = "",
+                        orig_space_dim   = 0):
+    """Save eight UMAP figures: by target, by prediction, per target class, correct/errors, per predicted class."""
+    # all points, coloured by target label (T)
+    plot_all_data_3(umap_Y           = umap_Y,
+                    umap_T           = umap_T,
+                    train_Y          = train_Y,
+                    train_T          = train_T, 
+                    test_Y           = test_Y, 
+                    test_T           = test_T, 
+                    total_train_size = total_train_size,
+                    total_test_size  = total_test_size,
+                    info             = info,
+                    output_dir       = output_dir,
+                    orig_space_dim   = orig_space_dim)
+
+    # all points, coloured by predicted label (P passed in the T slots)
+    plot_all_data_3(umap_Y           = umap_Y,
+                    umap_T           = umap_P,
+                    train_Y          = train_Y,
+                    train_T          = train_P, 
+                    test_Y           = test_Y, 
+                    test_T           = test_P, 
+                    total_train_size = total_train_size,
+                    total_test_size  = total_test_size,
+                    info             = info+", all-predictions",
+                    output_dir       = output_dir,
+                    orig_space_dim   = orig_space_dim)
+
+    
+    # only the points whose target label is 0
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_T,
+                     train_Y          = train_Y,
+                     train_T          = train_T,
+                     test_Y           = test_Y, 
+                     test_T           = test_T, 
+                     target           = 0, 
+                     col              = "red", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info+" class " + str(0),
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+    # only the points whose target label is 1
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_T,
+                     train_Y          = train_Y,
+                     train_T          = train_T,
+                     test_Y           = test_Y, 
+                     test_T           = test_T, 
+                     target           = 1, 
+                     col              = "green", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info + " class " + str(1),
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+    # correct classification: points whose C value is 0
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_C,
+                     train_Y          = train_Y,
+                     train_T          = train_C,
+                     test_Y           = test_Y, 
+                     test_T           = test_C, 
+                     target           = 0, 
+                     col              = "green", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info + " correct ",
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+    # errors: points whose C value is 1
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_C,
+                     train_Y          = train_Y,
+                     train_T          = train_C,
+                     test_Y           = test_Y, 
+                     test_T           = test_C, 
+                     target           = 1, 
+                     col              = "red", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info + " errors ",
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+    # only the points predicted as 0
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_P,
+                     train_Y          = train_Y,
+                     train_T          = train_P,
+                     test_Y           = test_Y, 
+                     test_T           = test_P, 
+                     target           = 0, 
+                     col              = "red", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info + " predict-0 ",
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+    # only the points predicted as 1
+    plot_one_class_3(umap_Y           = umap_Y,
+                     umap_T           = umap_P,
+                     train_Y          = train_Y,
+                     train_T          = train_P,
+                     test_Y           = test_Y, 
+                     test_T           = test_P, 
+                     target           = 1, 
+                     col              = "green", 
+                     total_train_size = total_train_size, 
+                     total_test_size  = total_test_size, 
+                     info             = info + " predict-1 ",
+                     output_dir       = output_dir,
+                     orig_space_dim   = orig_space_dim)
+
+def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""):
+    """Cluster 2-D points with HDBSCAN and save a 2x3 outlier/inlier figure.
+
+    A clusterer (min_samples=10, min_cluster_size=500) is fitted on
+    umap_data; labels for train_data and test_data come from
+    hdbscan.approximate_predict on that fitted clusterer.  Points with a
+    negative label are treated as outliers.
+
+    umap_data:  array indexed as [:, 0] and [:, 1]; used to fit the clusterer.
+    train_data: array of the same layout, labelled by approximate_predict.
+    test_data:  array of the same layout, labelled by approximate_predict.
+    info:       text used in the figure title and in the file name.
+    output_dir: directory prefix; the file is <info>-hdbscan.png.
+
+    Top row: outliers in grey; bottom row: inliers coloured by cluster
+    label.  Columns are the UMAP-fit, train and test subsets.  The
+    inlier/outlier tally of the UMAP-fit subset is also printed.
+    """
+    # prediction_data=True is what lets approximate_predict be used below
+    clusterer       = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True)
+    umap_labels     = clusterer.fit_predict(umap_data)
+    # the second value returned by approximate_predict is discarded
+    train_labels, _ = hdbscan.approximate_predict(clusterer, train_data)
+    test_labels,  _ = hdbscan.approximate_predict(clusterer, test_data)
+
+    fig, (outlier_axes, inlier_axes) = plt.subplots(2, 3)
+    fig.suptitle("HDBSCAN clastering: "+ info )
+
+    # One entry per column: points, labels, outlier title, inlier title.
+    # The title strings are kept exactly as they have always been emitted.
+    columns = (
+        (umap_data,  umap_labels,  "UMAP Outliers ",  "UMAP Inliers "),
+        (train_data, train_labels, "Train Outliers ", "Train Inliers "),
+        (test_data,  test_labels,  "Tets Outliers ",  "Test Inliers "),
+    )
+    for col, (points, labels, outlier_title, inlier_title) in enumerate(columns):
+
+        # True marks points that were assigned to a cluster
+        clustered = (labels >= 0)
+        tally = collections.Counter(clustered)
+        if col == 0:
+            print("umap_clustered", tally)
+
+        # outliers (top row)
+        ax_out = outlier_axes[col]
+        ax_out.scatter(points[~clustered, 0],
+                       points[~clustered, 1],
+                       c=(0.5, 0.5, 0.5),
+                       s=0.1,
+                       alpha=0.5)
+        ax_out.set_title(outlier_title + str(tally[False]), fontsize=7)
+
+        # inliers (bottom row)
+        ax_in = inlier_axes[col]
+        ax_in.scatter(points[clustered, 0],
+                      points[clustered, 1],
+                      c=labels[clustered],
+                      s=0.1,
+                      cmap="Spectral")
+        ax_in.set_title(inlier_title + str(tally[True]), fontsize=7)
+
+    plt.savefig(output_dir+"/"+info+"-hdbscan.png")
+    plt.close()
+
+
+def visualize_all_data_umap(dlrm, 
+                            train_ld, 
+                            test_ld       = None, 
+                            max_umap_size = 50000,
+                            output_dir    = "",
+                            umap_metric   = "euclidean"):
+
+    data_ratio = 1
+    
+    print("creating umap data")
+    umap_train_feat, umap_train_X, umap_train_cat, umap_train_T, umap_train_z, umap_train_c, umap_train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size, offset=0, info="umap")
+    
+    # transform train and test data
+    train_feat, train_X, train_cat, train_T, train_z, train_c, train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size*data_ratio, offset=max_umap_size, info="train")
+    test_feat,  test_X,  test_cat,  test_T,  test_z,  test_c,  test_p  = create_umap_data(dlrm=dlrm, data_ld=test_ld,  max_size=max_umap_size*data_ratio, offset=0,             info="test")
+
+    print("umap_train_feat", np.array(umap_train_feat).shape)
+    reducer_all_feat = umap.UMAP(random_state=42, metric=umap_metric)
+    umap_feat_Y = reducer_all_feat.fit_transform(umap_train_feat)
+
+    train_feat_Y = reducer_all_feat.transform(train_feat)
+    test_feat_Y  = reducer_all_feat.transform(test_feat)
+    
+    visualize_umap_data(umap_Y           = umap_feat_Y,
+                        umap_T           = umap_train_T,
+                        umap_C           = umap_train_c,
+                        umap_P           = umap_train_p,
+                        train_Y          = train_feat_Y, 
+                        train_T          = train_T, 
+                        train_C          = train_c,
+                        train_P          = train_p,
+                        test_Y           = test_feat_Y,
+                        test_T           = test_T, 
+                        test_C           = test_c,
+                        test_P           = test_p,
+                        total_train_size = str(len(train_ld)), 
+                        total_test_size  = str(len(test_ld)), 
+                        info             = "all-features",
+                        output_dir       = output_dir,
+                        orig_space_dim   = np.array(umap_train_feat).shape[1])
+
+    hdbscan_clustering(umap_data  = umap_feat_Y, 
+                       train_data = train_feat_Y, 
+                       test_data  = test_feat_Y, 
+                       info       = "umap-all-features", 
+                       output_dir = output_dir)
+
+#    hdbscan_clustering(umap_data  = np.array(umap_train_feat), 
+#                       train_data = np.array(train_feat), 
+#                       test_data  = np.array(test_feat), 
+#                       info       = "all-features", 
+#                       output_dir = output_dir)
+
+    print("umap_train_X", np.array(umap_train_X).shape)
+    reducer_X = umap.UMAP(random_state=42, metric=umap_metric)
+    umap_X_Y = reducer_X.fit_transform(umap_train_X)
+
+    train_X_Y = reducer_X.transform(train_X)
+    test_X_Y  = reducer_X.transform(test_X)
+
+    visualize_umap_data(umap_Y           = umap_X_Y,
+                        umap_T           = umap_train_T,
+                        umap_C           = umap_train_c,
+                        umap_P           = umap_train_p,
+                        train_Y          = train_X_Y, 
+                        train_T          = train_T, 
+                        train_C          = train_c,
+                        train_P          = train_p,
+                        test_Y           = test_X_Y,
+                        test_T           = test_T, 
+                        test_C           = test_c,
+                        test_P           = test_p,
+                        total_train_size = str(len(train_ld)), 
+                        total_test_size  = str(len(test_ld)), 
+                        info             = "cont-features",
+                        output_dir       = output_dir,
+                        orig_space_dim   = np.array(umap_train_X).shape[1])
+
+    print("umap_train_cat", np.array(umap_train_cat).shape)
+    reducer_cat = umap.UMAP(random_state=42, metric=umap_metric)
+    umap_cat_Y = reducer_cat.fit_transform(umap_train_cat)
+
+    train_cat_Y = reducer_cat.transform(train_cat)
+    test_cat_Y  = reducer_cat.transform(test_cat)
+
+    visualize_umap_data(umap_Y           = umap_cat_Y,
+                        umap_T           = umap_train_T,
+                        umap_C           = umap_train_c,
+                        umap_P           = umap_train_p,
+                        train_Y          = train_cat_Y, 
+                        train_T          = train_T, 
+                        train_C          = train_c,
+                        train_P          = train_p,
+                        test_Y           = test_cat_Y,
+                        test_T           = test_T, 
+                        test_C           = test_c,
+                        test_P           = test_p,
+                        total_train_size = str(len(train_ld)), 
+                        total_test_size  = str(len(test_ld)), 
+                        info             = "cat-features",
+                        output_dir       = output_dir,
+                        orig_space_dim   = np.array(umap_train_cat).shape[1])
+
+    # UMAP for z data
+    for i in range(0,len(umap_train_z)):
+        print("z", i, np.array(umap_train_z[i]).shape)
+        reducer_z = umap.UMAP(random_state=42, metric=umap_metric)
+        umap_z_Y = reducer_z.fit_transform(umap_train_z[i])
+
+        train_z_Y = reducer_z.transform(train_z[i])
+        test_z_Y  = reducer_z.transform(test_z[i])
+
+        visualize_umap_data(umap_Y           = umap_z_Y,
+                            umap_T           = umap_train_T,
+                            umap_C           = umap_train_c,
+                            umap_P           = umap_train_p,
+                            train_Y          = train_z_Y, 
+                            train_T          = train_T, 
+                            train_C          = train_c,
+                            train_P          = train_p,
+                            test_Y           = test_z_Y,
+                            test_T           = test_T, 
+                            test_C           = test_c,
+                            test_P           = test_p,
+                            total_train_size = str(len(train_ld)), 
+                            total_test_size  = str(len(test_ld)), 
+                            info             = "z-features-"+str(i),
+                            output_dir       = output_dir,
+                            orig_space_dim   = np.array(umap_train_z[i]).shape[1])
+
+
+def analyze_model_data(output_dir,
+                       dlrm,
+                       train_ld,
+                       test_ld,
+                       train_data,
+                       skip_embedding            = False,
+                       use_tsne                  = False,
+                       max_umap_size             = 50000,
+                       max_tsne_size             = 10000,
+                       skip_categorical_analysis = False,
+                       skip_data_plots           = False,
+                       umap_metric               = "euclidean",
+                       data_randomize            = "none"):
+    """Run the enabled analysis stages for a DLRM model and its data.
+
+    Args:
+        output_dir: directory handed to every stage; created when missing.
+        dlrm: model; its emb_l is read by the embedding stage.
+        train_ld, test_ld: data loaders passed to the data-plot stage.
+        train_data: data set; its X_cat is read by the embedding and the
+            categorical stages.
+        skip_embedding: skip categorical counts and embedding UMAP/t-SNE plots.
+        use_tsne: additionally plot embeddings with t-SNE (embedding stage only).
+        max_umap_size, max_tsne_size: size limits forwarded to the plot helpers.
+        skip_categorical_analysis: skip analyse_categorical_data().
+        skip_data_plots: skip visualize_all_data_umap().
+        umap_metric: metric name forwarded to the UMAP helpers.
+        data_randomize: randomization mode the data was loaded with; the
+            categorical stage only runs for "none". It is a parameter so the
+            function no longer depends on a module-level `args` object, which
+            only exists when this file is executed as a script.
+    """
+
+    # exist_ok avoids the check-then-create race of exists() + makedirs()
+    os.makedirs(output_dir, exist_ok=True)
+
+    if not skip_embedding:
+        cat_counts = analyse_categorical_counts(X_cat=train_data.X_cat, emb_l=dlrm.emb_l, output_dir=output_dir)
+
+        visualize_embeddings_umap(emb_l       = dlrm.emb_l,
+                                  output_dir  = output_dir,
+                                  max_size    = max_umap_size,
+                                  umap_metric = umap_metric,
+                                  cat_counts  = cat_counts)
+
+        if use_tsne:
+            visualize_embeddings_tsne(emb_l      = dlrm.emb_l,
+                                      output_dir = output_dir,
+                                      max_size   = max_tsne_size)
+
+    # data visualization and analysis
+    if not skip_data_plots:
+        visualize_all_data_umap(dlrm=dlrm, train_ld=train_ld, test_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir, umap_metric=umap_metric)
+
+    # analyse categorical variables (only when the data was not randomized)
+    if not skip_categorical_analysis and data_randomize == "none":
+        analyse_categorical_data(X_cat=train_data.X_cat, n_days=10, output_dir=output_dir)
+
+
+
+if __name__ == "__main__":
+
+    ### parse arguments ###
+    parser = argparse.ArgumentParser(
+        description="Exploratory DLRM analysis"
+    )
+
+    parser.add_argument("--load-model", type=str, default="")
+    parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset")
+#    parser.add_argument("--dataset-path", required=True, help="path to the dataset")
+    parser.add_argument("--max-ind-range", type=int, default=-1)
+#    parser.add_argument("--mlperf-bin-loader", action="store_true", default=False)
+    parser.add_argument("--output-dir", type=str, default="")
+    parser.add_argument("--skip-embedding", action="store_true", default=False)
+    parser.add_argument("--umap-metric", type=str, default="euclidean")
+    parser.add_argument("--skip-data-plots", action="store_true", default=False)
+    parser.add_argument("--skip-categorical-analysis", action="store_true", default=False)
+
+    # umap related
+    parser.add_argument("--max-umap-size", type=int, default=50000)
+    # tsne related
+    parser.add_argument("--use-tsne", action="store_true", default=False)
+    parser.add_argument("--max-tsne-size", type=int, default=1000)
+    # data file related
+    parser.add_argument("--raw-data-file", type=str, default="")
+    parser.add_argument("--processed-data-file", type=str, default="")
+    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
+    parser.add_argument("--data-randomize", type=str, default="total")  # none, total or day
+    parser.add_argument("--memory-map", action="store_true", default=False)
+    parser.add_argument("--mini-batch-size", type=int, default=1)
+    parser.add_argument("--num-workers", type=int, default=0)
+    parser.add_argument("--test-mini-batch-size", type=int, default=1)
+    parser.add_argument("--test-num-workers", type=int, default=0)
+    parser.add_argument("--num-batches", type=int, default=0)
+    # mlperf logging (disables other output and stops early)
+    parser.add_argument("--mlperf-logging", action="store_true", default=False)
+
+    args = parser.parse_args()
+
+    print("command line args: ", json.dumps(vars(args)))
+
+    # Honour --output-dir; derive a name from the data set and the model file
+    # only when it was not given. format() (rather than "+") keeps a missing
+    # --data-set from raising TypeError here, so the ValueError below is reported.
+    output_dir = args.output_dir
+    if output_dir == "":
+        output_dir = "{}-{}-vis_all".format(args.data_set, os.path.split(args.load_model)[-1])
+    print("output_dir:", output_dir)
+
+    if args.data_set == "kaggle":
+        # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
+        m_spa=16
+        ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
+        ln_bot=np.array([13,512,256,64,16])
+        ln_top=np.array([367,512,256,1])
+
+    elif args.data_set == "terabyte":
+
+        if args.max_ind_range == 10000000:
+            # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-ind-range=10000000)
+            m_spa=64
+            ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
+            ln_bot=np.array([13,512,256,64])
+            ln_top=np.array([415,512,512,256,1])
+        elif args.max_ind_range == 40000000:
+            # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-ind-range=40000000)
+            m_spa=128
+            ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
+            ln_bot=np.array([13,512,256,128])
+            ln_top=np.array([479,1024,1024,512,256,1])
+        else:
+            raise ValueError("only --max-ind-range 10M or 40M is supported")
+    else:
+        raise ValueError("only kaggle|terabyte dataset options are supported")
+
+    # check input parameters
+    if args.data_randomize != "none" and args.skip_categorical_analysis is not True:
+        print("Incorrect option for categoricat analysis, use:  --data-randomize=none")
+        sys.exit(-1)
+
+    # Every analysis stage reads the training data or the loaders, so fail early
+    # (before the model is built) when no data file is given and a stage is enabled.
+    has_data = args.raw_data_file != "" or args.processed_data_file != ""
+    all_stages_skipped = (args.skip_embedding
+                          and args.skip_data_plots
+                          and args.skip_categorical_analysis)
+    if not has_data and not all_stages_skipped:
+        raise ValueError("no input data, use --raw-data-file or --processed-data-file")
+
+    dlrm = DLRM_Net(
+            m_spa,
+            ln_emb,
+            ln_bot,
+            ln_top,
+            arch_interaction_op="dot",
+            arch_interaction_itself=False,
+            sigmoid_bot=-1,
+            sigmoid_top=ln_top.size - 2,
+            sync_dense_params=True,
+            loss_threshold=0.0,
+            ndevices=-1,
+            qr_flag=False,
+            qr_operation=None,
+            qr_collisions=None,
+            qr_threshold=None,
+            md_flag=False,
+            md_threshold=None,
+        )
+
+    # Load the model if one is specified
+    if args.load_model != "":
+        print("Loading saved model {}".format(args.load_model))
+
+        # NOTE(security): torch.load unpickles the file; only load checkpoints
+        # that come from a trusted source.
+        ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
+        dlrm.load_state_dict(ld_model["state_dict"])
+
+        print("Model loaded", args.load_model)
+        #print(dlrm)
+
+    for i, top_layer in enumerate(dlrm.top_l):
+        print("z", i, top_layer)
+
+    # load data; all four names are pre-bound so the call below never hits an
+    # unbound variable when no data file was given
+    train_data = None
+    train_ld   = None
+    test_data  = None
+    test_ld    = None
+
+    if has_data:
+        train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)
+
+    analyze_model_data(output_dir                = output_dir,
+                       dlrm                      = dlrm,
+                       train_ld                  = train_ld,
+                       test_ld                   = test_ld,
+                       train_data                = train_data,
+                       skip_embedding            = args.skip_embedding,
+                       use_tsne                  = args.use_tsne,
+                       max_umap_size             = args.max_umap_size,
+                       max_tsne_size             = args.max_tsne_size,
+                       skip_categorical_analysis = args.skip_categorical_analysis,
+                       skip_data_plots           = args.skip_data_plots,
+                       umap_metric               = args.umap_metric)
+
diff --git a/tricks/md_embedding_bag.py b/tricks/md_embedding_bag.py
index 53c9f7af..7c4071a2 100644
--- a/tricks/md_embedding_bag.py
+++ b/tricks/md_embedding_bag.py
@@ -34,7 +34,10 @@ def md_solver(n, alpha, d0=None, B=None, round_dim=True, k=None):
     d = alpha_power_rule(n.type(torch.float) / k, alpha, d0=d0, B=B)
     if round_dim:
         d = pow_2_round(d)
-    return d
+    undo_sort = [0] * len(indices)
+    for i, v in enumerate(indices):
+        undo_sort[v] = i
+    return d[undo_sort]
 
 
 def alpha_power_rule(n, alpha, d0=None, B=None):
diff --git a/tt.py b/tt.py
new file mode 100644
index 00000000..357ac3e4
--- /dev/null
+++ b/tt.py
@@ -0,0 +1,1616 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Description: an implementation of a deep learning recommendation model (DLRM)
+# The model input consists of dense and sparse features. The former is a vector
+# of floating point values. The latter is a list of sparse indices into
+# embedding tables, which consist of vectors of floating point values.
+# The selected vectors are passed to mlp networks denoted by triangles,
+# in some cases the vectors are interacted through operators (Ops).
+#
+# output:
+#                         vector of values
+# model:                        |
+#                              /\
+#                             /__\
+#                               |
+#       _____________________> Op  <___________________
+#     /                         |                      \
+#    /\                        /\                      /\
+#   /__\                      /__\           ...      /__\
+#    |                          |                       |
+#    |                         Op                      Op
+#    |                    ____/__\_____           ____/__\____
+#    |                   |_Emb_|____|__|    ...  |_Emb_|__|___|
+# input:
+# [ dense features ]     [sparse indices] , ..., [sparse indices]
+#
+# More precise definition of model layers:
+# 1) fully connected layers of an mlp
+# z = f(y)
+# y = Wx + b
+#
+# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk])
+# z = Op(e1,...,ek)
+# obtain vectors e1=E[:,p1], ..., ek=E[:,pk]
+#
+# 3) Operator Op can be one of the following
+# Sum(e1,...,ek) = e1 + ... + ek
+# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek]
+# Cat(e1,...,ek) = [e1', ..., ek']'
+# where ' denotes transpose operation
+#
+# References:
+# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang,
+# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu,
+# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii,
+# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko,
+# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong,
+# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and
+# Recommendation Systems", CoRR, arXiv:1906.00091, 2019
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+# miscellaneous
+import builtins
+import functools
+# import bisect
+# import shutil
+import sys
+import time
+import json
+# data generation
+import dlrm_data_pytorch as dp
+
+# numpy
+import numpy as np
+import socket
+
+# onnx
+# The onnx import causes deprecation warnings every time workers
+# are spawned during testing. So, we filter out those warnings.
+import warnings
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore", category=DeprecationWarning)
+## import onnx
+
+# pytorch
+import torch
+from torch import onnx
+import torch.nn as nn
+from torch.nn.parallel.parallel_apply import parallel_apply
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.scatter_gather import gather, scatter
+
+# For distributed run
+import extend_distributed as ext_dist
+
+# quotient-remainder trick
+from tricks.qr_embedding_bag import QREmbeddingBag
+# mixed-dimension trick
+from tricks.md_embedding_bag import PrEmbeddingBag, md_solver
+
+import sklearn.metrics
+
+import uuid
+import project
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+import dlrm_data as dd
+
+# Add dlrm self profiling timers
+import profile as tm
+# import pyprof
+# pyprof.init()  # causing errors, some symbols not found
+
+# import synthetic_data_loader as fb_syn_data
+
+# from torchviz import make_dot
+# import torch.nn.functional as Functional
+# from torch.nn.parameter import Parameter
+
+from torch.optim.lr_scheduler import _LRScheduler
+
+# Exception object looked up by name on `builtins`.
+# NOTE(review): the getattr default is the *string* "FileNotFoundError", not an
+# exception class, so `exc` would be a str if `builtins.IOError` were missing.
+# Confirm how `exc` is consumed before relying on that fallback.
+exc = getattr(builtins, "IOError", "FileNotFoundError")
+
+class LRPolicyScheduler(_LRScheduler):
+    """Learning-rate schedule with linear warmup, a frozen plateau and quadratic decay.
+
+    Phases, selected by the scheduler's step counter:
+      * step < num_warmup_steps: base lr scaled linearly by step / num_warmup_steps;
+      * decay_start_step <= step < decay_end_step: base lr scaled by
+        ((num_decay_steps - decayed_steps) / num_decay_steps) ** 2, floored at 1e-7;
+      * otherwise: the most recently computed lr is kept when decay is enabled
+        (num_decay_steps > 0), else the base lr is returned.
+
+    Args:
+        optimizer: wrapped optimizer, passed to the base scheduler.
+        num_warmup_steps: number of warmup steps (0 disables warmup).
+        decay_start_step: step at which decay begins.
+        num_decay_steps: length of the decay phase (0 disables decay).
+
+    The constructor terminates the program through sys.exit() when
+    decay_start_step < num_warmup_steps.
+    """
+
+    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
+        self.num_warmup_steps = num_warmup_steps
+        self.decay_start_step = decay_start_step
+        self.decay_end_step = decay_start_step + num_decay_steps
+        self.num_decay_steps = num_decay_steps
+        # Must exist before the base constructor runs: that constructor may
+        # already call get_lr(), and the "freeze" branch reads last_lr. Without
+        # warmup and with a later decay start nothing has assigned it yet.
+        self.last_lr = None
+
+        if self.decay_start_step < self.num_warmup_steps:
+            sys.exit("Learning rate warmup must finish before the decay starts")
+
+        super(LRPolicyScheduler, self).__init__(optimizer)
+
+    def get_lr(self):
+        """Return the list of learning rates (one per base lr) for the current step."""
+        step_count = self._step_count
+        if step_count < self.num_warmup_steps:
+            # warmup
+            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
+            lr = [base_lr * scale for base_lr in self.base_lrs]
+            self.last_lr = lr
+        elif self.decay_start_step <= step_count < self.decay_end_step:
+            # decay
+            decayed_steps = step_count - self.decay_start_step
+            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
+            min_lr = 0.0000001
+            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
+            self.last_lr = lr
+        elif self.num_decay_steps > 0 and self.last_lr is not None:
+            # freeze at last, either because we're after decay
+            # or because we're between warmup and decay
+            lr = self.last_lr
+        else:
+            # do not adjust: decay is disabled, or no warmup/decay step has
+            # produced a value to freeze at yet
+            lr = self.base_lrs
+        return lr
+
+### define dlrm in PyTorch ###
+class DLRM_Net(nn.Module):
+    def create_mlp(self, ln, sigmoid_layer):
+        """Assemble an MLP whose layer widths are given by the array ln.
+
+        Layer idx maps ln[idx] inputs to ln[idx + 1] outputs and is followed by
+        a Sigmoid when idx == sigmoid_layer, otherwise by a ReLU. Weights and
+        biases are drawn from numpy's global random generator (weights first,
+        then biases, layer by layer) and the stack is returned as a
+        torch.nn.Sequential.
+        """
+        modules = nn.ModuleList()
+        num_linear = ln.size - 1
+        for idx in range(num_linear):
+            fan_in, fan_out = ln[idx], ln[idx + 1]
+
+            # fully connected operator
+            linear = nn.Linear(int(fan_in), int(fan_out), bias=True)
+
+            # zero-mean normal init: two-sided Xavier std for the weights,
+            # output-sized std for the bias
+            zero_mean = 0.0
+            weight_std = np.sqrt(2 / (fan_out + fan_in))
+            bias_std = np.sqrt(1 / fan_out)
+            weights = np.random.normal(
+                zero_mean, weight_std, size=(fan_out, fan_in)
+            ).astype(np.float32)
+            biases = np.random.normal(
+                zero_mean, bias_std, size=fan_out
+            ).astype(np.float32)
+            linear.weight.data = torch.tensor(weights, requires_grad=True)
+            linear.bias.data = torch.tensor(biases, requires_grad=True)
+            modules.append(linear)
+
+            # non-linearity following the linear layer
+            activation = nn.Sigmoid() if idx == sigmoid_layer else nn.ReLU()
+            modules.append(activation)
+
+        # wrap all layers in a Sequential container
+        return torch.nn.Sequential(*modules)
+
+    def create_emb(self, m, ln):
+        """Create the embedding tables whose row counts are listed in ln.
+
+        When ext_dist.my_size > 1 only the tables listed in
+        self.local_emb_indices are built. Each table re-seeds numpy's global
+        generator with self.l_emb_seeds[i] before its weights are drawn; the
+        generator state found on entry is restored before returning.
+
+        Table type per entry:
+          * QREmbeddingBag when qr_flag is set and the table exceeds qr_threshold
+            (no numpy-drawn weights are assigned on this path);
+          * PrEmbeddingBag when md_flag is set (m is then indexed per table);
+          * nn.EmbeddingBag otherwise.
+
+        Returns an nn.ModuleList with the created tables in index order.
+        """
+        tables = nn.ModuleList()
+        # remember the numpy random state so per-table seeding stays local
+        saved_rng_state = np.random.get_state()
+
+        for table_id in range(ln.size):
+            # in a distributed run, skip tables owned by other ranks
+            if ext_dist.my_size > 1 and table_id not in self.local_emb_indices:
+                continue
+
+            # per-table seed for the embedding initialization
+            np.random.seed(self.l_emb_seeds[table_id])
+            num_rows = ln[table_id]
+
+            if self.qr_flag and num_rows > self.qr_threshold:
+                table = QREmbeddingBag(num_rows, m, self.qr_collisions,
+                    operation=self.qr_operation, mode="sum", sparse=True)
+            elif self.md_flag:
+                base = max(m)
+                dim = m[table_id] if num_rows > self.md_threshold else base
+                table = PrEmbeddingBag(num_rows, dim, base)
+                # numpy initialization, consistent with the plain path below
+                init = np.random.uniform(
+                    low=-np.sqrt(1 / num_rows),
+                    high=np.sqrt(1 / num_rows),
+                    size=(num_rows, dim),
+                ).astype(np.float32)
+                table.embs.weight.data = torch.tensor(init, requires_grad=True)
+            else:
+                init = np.random.uniform(
+                    low=-np.sqrt(1 / num_rows),
+                    high=np.sqrt(1 / num_rows),
+                    size=(num_rows, m),
+                ).astype(np.float32)
+                table = nn.EmbeddingBag(
+                    num_rows, m, mode="sum", sparse=True,
+                    _weight=torch.tensor(init, requires_grad=True),
+                )
+
+            # non-local tables never reach this point (see the skip above)
+            tables.append(table)
+
+        # restore the numpy random state
+        np.random.set_state(saved_rng_state)
+        return tables
+
+    def __init__(
+        self,
+        m_spa=None,
+        ln_emb=None,
+        ln_bot=None,
+        ln_top=None,
+        proj_size = 0,
+        arch_interaction_op=None,
+        arch_interaction_itself=False,
+        sigmoid_bot=-1,
+        sigmoid_top=-1,
+        sync_dense_params=True,
+        loss_threshold=0.0,
+        ndevices=-1,
+        qr_flag=False,
+        qr_operation="mult",
+        qr_collisions=0,
+        qr_threshold=200,
+        md_flag=False,
+        md_threshold=200,
+    ):
+        """Store the configuration and build the model's operators.
+
+        Nothing beyond nn.Module initialisation happens unless m_spa, ln_emb,
+        ln_bot, ln_top and arch_interaction_op are all given. Otherwise the
+        arguments are saved, per-table numpy seeds are drawn, and the bottom and
+        top MLPs are created. Embedding tables are created here only when
+        ndevices <= 1, and a projection is created only when proj_size > 0.
+        In a distributed run (ext_dist.my_size > 1) the slice of embedding
+        tables owned by this rank is recorded first.
+        """
+        super(DLRM_Net, self).__init__()
+
+        # an incompletely specified model stays an empty module
+        required = (m_spa, ln_emb, ln_bot, ln_top, arch_interaction_op)
+        if any(arg is None for arg in required):
+            return
+
+        # save arguments
+        self.proj_size = proj_size
+        self.ndevices = ndevices
+        self.output_d = 0
+        self.parallel_model_batch_size = -1
+        self.parallel_model_is_not_prepared = True
+        self.arch_interaction_op = arch_interaction_op
+        self.arch_interaction_itself = arch_interaction_itself
+        self.sync_dense_params = sync_dense_params
+        self.loss_threshold = loss_threshold
+
+        # QR embedding settings are only stored when the trick is enabled
+        self.qr_flag = qr_flag
+        if qr_flag:
+            self.qr_collisions = qr_collisions
+            self.qr_operation = qr_operation
+            self.qr_threshold = qr_threshold
+
+        # MD embedding settings are only stored when the trick is enabled
+        self.md_flag = md_flag
+        if md_flag:
+            self.md_threshold = md_threshold
+
+        # one numpy seed per embedding table, used during table initialization
+        self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb))
+
+        # if running distributed, record the local slice of embedding tables
+        if ext_dist.my_size > 1:
+            n_emb = len(ln_emb)
+            self.n_global_emb = n_emb
+            self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb)
+            self.local_emb_slice = ext_dist.get_my_slice(n_emb)
+            self.local_emb_indices = list(range(n_emb))[self.local_emb_slice]
+
+        # create operators
+        if ndevices <= 1:
+            self.emb_l = self.create_emb(m_spa, ln_emb)
+        self.bot_l = self.create_mlp(ln_bot, sigmoid_bot)
+        self.top_l = self.create_mlp(ln_top, sigmoid_top)
+        if proj_size > 0:
+            self.proj_l = project.create_proj(len(ln_emb)+1, proj_size)
+
+    def apply_mlp(self, x, layers):
+        """Return layers(x), i.e. the MLP `layers` applied to the input x.
+
+        `layers` is invoked as a single callable; create_mlp() returns a
+        torch.nn.Sequential, so one call runs every layer in order.
+        """
+        # approach 1: use ModuleList
+        # for layer in layers:
+        #     x = layer(x)
+        # return x
+        # approach 2: use Sequential container to wrap all layers
+        return layers(x)
+
+    def apply_proj(self, x, layers):
+        """Return layers(x), i.e. the projection `layers` applied to the input x.
+
+        NOTE(review): `layers` is presumably the object built by
+        project.create_proj() and stored as self.proj_l -- confirm against callers.
+        """
+        # approach 1: use ModuleList
+        # for layer in layers:
+        #     x = layer(x)
+        # return x
+        # approach 2: use Sequential container to wrap all layers
+        return layers(x)
+
+    def apply_emb(self, lS_o, lS_i, emb_l):
+        """Look up every batch of sparse indices in its embedding table.
+
+        For each position k, emb_l[k] is called with (lS_i[k], lS_o[k]) and the
+        results are returned as a list in the same order.
+
+        WARNING: the whole batch is processed at once. The data is implicitly
+        assumed to be laid out such that
+          1. each embedding is indexed with a group of sparse indices,
+             corresponding to a single lookup,
+          2. for each embedding the lookups are further organized into a batch,
+          3. for a list of embedding tables there is a list of batched lookups.
+        """
+        # The tables are EmbeddingBag-style operators which implicitly sum the
+        # looked-up rows of each bag, giving one row vector per lookup.
+        return [
+            emb_l[k](index_batch, lS_o[k])
+            for k, index_batch in enumerate(lS_i)
+        ]
+
+    def interact_features(self, x, ly):
+        """Combine the dense feature x with the list of embedding outputs ly.
+
+        * "cat": returns x and all entries of ly concatenated along dim 1.
+        * "dot": stacks x and ly into T of shape (batch, -1, d) with d = x.shape[1].
+          With proj_size > 0 the result of project.project(T, x, self.proj_l)
+          is returned; otherwise the pairwise dot products T @ T^T are formed,
+          their lower triangle is selected (including the diagonal only when
+          arch_interaction_itself is set) and appended to x.
+        * any other operator terminates the program through sys.exit().
+        """
+        op = self.arch_interaction_op
+
+        if op == "cat":
+            # concatenation features (into a row vector)
+            return torch.cat([x] + ly, dim=1)
+
+        if op != "dot":
+            sys.exit(
+                "ERROR: --arch-interaction-op="
+                + self.arch_interaction_op
+                + " is not supported"
+            )
+
+        # concatenate dense and sparse features
+        (batch_size, d) = x.shape
+        T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d))
+
+        if self.proj_size > 0:
+            return project.project(T, x, self.proj_l)
+
+        # perform a dot product between every pair of feature vectors
+        Z = torch.bmm(T, torch.transpose(T, 1, 2))
+        _, ni, nj = Z.shape
+
+        # keep each unordered pair once: row i contributes columns 0..i-1,
+        # plus column i itself when self-interaction is requested
+        offset = 1 if self.arch_interaction_itself else 0
+        row_idx = []
+        for i in range(ni):
+            row_idx.extend([i] * (i + offset))
+        col_idx = []
+        for i in range(nj):
+            col_idx.extend(range(i + offset))
+        li = torch.tensor(row_idx)
+        lj = torch.tensor(col_idx)
+        Zflat = Z[:, li, lj]
+
+        # concatenate dense features and interactions
+        return torch.cat([x, Zflat], dim=1)
+
+    def forward(self, dense_x, lS_o, lS_i):
+        """Dispatch to the forward implementation matching the run configuration.
+
+        distributed_forward is used when ext_dist.my_size > 1, otherwise
+        sequential_forward when self.ndevices <= 1, otherwise parallel_forward.
+        """
+        if ext_dist.my_size > 1:
+            run = self.distributed_forward
+        elif self.ndevices <= 1:
+            run = self.sequential_forward
+        else:
+            run = self.parallel_forward
+        return run(dense_x, lS_o, lS_i)
+
+    def sequential_forward(self, dense_x, lS_o, lS_i):
+        """Single-process forward pass.
+
+        Applies the bottom MLP to dense_x, looks up the sparse features in
+        self.emb_l, interacts both, and applies the top MLP. When
+        0 < loss_threshold < 1 the output is clamped to
+        [loss_threshold, 1 - loss_threshold]; otherwise it is returned as is.
+        """
+        # dense features through the bottom mlp, resulting in a row vector
+        dense_out = self.apply_mlp(dense_x, self.bot_l)
+
+        # sparse features through the embeddings, resulting in a list of row vectors
+        emb_out = self.apply_emb(lS_o, lS_i, self.emb_l)
+
+        # interact features (dense and sparse)
+        interactions = self.interact_features(dense_out, emb_out)
+
+        # probability of a click (using top mlp)
+        prob = self.apply_mlp(interactions, self.top_l)
+
+        # clamp output only for a threshold strictly inside (0, 1)
+        threshold = self.loss_threshold
+        if not (0.0 < threshold and threshold < 1.0):
+            return prob
+        return torch.clamp(prob, min=threshold, max=(1.0 - threshold))
+
+    def distributed_forward(self, dense_x, lS_o, lS_i):
+        """Forward pass used by forward() when ext_dist.my_size > 1.
+
+        Steps, each bracketed by a timer from the `tm` module:
+          1. embedding lookup on the tables held in self.emb_l,
+          2. ext_dist.alltoall() on the lookup results (a request object is returned),
+          3. bottom MLP on dense_x, executed before waiting on that request,
+          4. wait for the all-to-all result, interact features, top MLP,
+          5. optional clamp to [loss_threshold, 1 - loss_threshold],
+          6. ext_dist.all_gather() of the output.
+
+        The program is terminated through sys.exit() when the number of local
+        embedding tables does not match len(lS_o), len(lS_i) or the number of
+        lookup results.
+
+        Returns the gathered output tensor.
+        """
+        batch_size = dense_x.size()[0]
+        # WARNING: # of ranks must be <= batch size in distributed_forward call
+        # if batch_size < ext_dist.my_size:
+        #    sys.exit("ERROR: batch_size (%d) must be larger than number of ranks (%d)" % (batch_size, ext_dist.my_size))
+        # if batch_size % ext_dist.my_size != 0:
+        #    sys.exit("ERROR: batch_size %d can not split across %d ranks evenly" % (batch_size, ext_dist.my_size))
+
+        ## already handled in input the data
+        ##dense_x = dense_x[ext_dist.get_my_slice(batch_size)]
+        ##lS_o = lS_o[self.local_emb_slice]
+        ##lS_i = lS_i[self.local_emb_slice]
+
+        # the sparse inputs must provide exactly one entry per local table
+        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
+            sys.exit("ERROR: corrupted model input detected in distributed_forward call")
+
+        # embeddings
+        tm.tmEmb.start()
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
+        tm.tmEmb.stop()
+
+        # print("ly: ", ly)
+        # debug prints
+        # print(ly)
+
+        # WARNING: Note that at this point we have the result of the embedding lookup
+        # for the entire batch on each rank. We would like to obtain partial results
+        # corresponding to all embedding lookups, but part of the batch on each rank.
+        # Therefore, matching the distribution of output of bottom mlp, so that both
+        # could be used for subsequent interactions on each device.
+        if len(self.emb_l) != len(ly):
+            sys.exit("ERROR: corrupted intermediate result in distributed_forward call")
+
+        # start the exchange; the result is only collected via wait() further below
+        tm.tmA2A.start()
+        a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank)
+        tm.tmA2A.stop()
+
+        # bottom mlp runs between issuing the all-to-all and waiting for it
+        tm.tmBot.start()
+        x = self.apply_mlp(dense_x, self.bot_l)
+        tm.tmBot.stop()
+
+        # debug prints
+        # print(x)
+
+        tm.tmA2A1.start()
+        ly = a2a_req.wait()
+        tm.tmA2A1.stop()
+        # print("ly: ", ly)
+        # interact_features concatenates [x] + ly, so ly has to be a list
+        ly = list(ly)
+
+        # interactions
+        tm.tmInt.start()
+        z = self.interact_features(x, ly)
+        tm.tmInt.stop()
+        # debug prints
+        # print(z)
+
+        # top mlp
+        tm.tmTop.start()
+        p = self.apply_mlp(z, self.top_l)
+        tm.tmTop.stop()
+
+        # clamp output if needed
+        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
+            z = torch.clamp(
+                p, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
+            )
+        else:
+            z = p
+
+        ### gather the distributed results on each rank ###
+        # For some reason it requires explicit sync before all_gather call if
+        # tensor is on GPU memory
+        tm.tmAllGa.start()
+        if z.is_cuda: torch.cuda.synchronize()
+        # NOTE(review): batch_size * my_size is presumably the global batch size
+        # (local batch times number of ranks) -- confirm against ext_dist.
+        (_, batch_split_lengths) = ext_dist.get_split_lengths(batch_size * ext_dist.my_size)
+        z = ext_dist.all_gather(z, batch_split_lengths)
+        tm.tmAllGa.stop()
+        #print("Z: %s" % z)
+
+        return z
+
+    def parallel_forward(self, dense_x, lS_o, lS_i):
+        ### prepare model (overwrite) ###
+        # WARNING: # of devices must be >= batch size in parallel_forward call
+        batch_size = dense_x.size()[0]
+        ndevices = min(self.ndevices, batch_size, len(self.emb_l))
+        device_ids = range(ndevices)
+        # WARNING: must redistribute the model if mini-batch size changes(this is common
+        # for last mini-batch, when # of elements in the dataset/batch size is not even
+        if self.parallel_model_batch_size != batch_size:
+            self.parallel_model_is_not_prepared = True
+
+        if self.parallel_model_is_not_prepared or self.sync_dense_params:
+            # replicate mlp (data parallelism)
+            self.bot_l_replicas = replicate(self.bot_l, device_ids)
+            self.top_l_replicas = replicate(self.top_l, device_ids)
+            self.parallel_model_batch_size = batch_size
+
+        if self.parallel_model_is_not_prepared:
+            # distribute embeddings (model parallelism)
+            t_list = []
+            for k, emb in enumerate(self.emb_l):
+                d = torch.device("cuda:" + str(k % ndevices))
+                emb.to(d)
+                t_list.append(emb.to(d))
+            self.emb_l = nn.ModuleList(t_list)
+            self.parallel_model_is_not_prepared = False
+
+        ### prepare input (overwrite) ###
+        # scatter dense features (data parallelism)
+        # print(dense_x.device)
+        dense_x = scatter(dense_x, device_ids, dim=0)
+        # distribute sparse features (model parallelism)
+        if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
+            sys.exit("ERROR: corrupted model input detected in parallel_forward call")
+
+        t_list = []
+        i_list = []
+        for k, _ in enumerate(self.emb_l):
+            d = torch.device("cuda:" + str(k % ndevices))
+            t_list.append(lS_o[k].to(d))
+            i_list.append(lS_i[k].to(d))
+        lS_o = t_list
+        lS_i = i_list
+
+        ### compute results in parallel ###
+        # bottom mlp
+        # WARNING: Note that the self.bot_l is a list of bottom mlp modules
+        # that have been replicated across devices, while dense_x is a tuple of dense
+        # inputs that has been scattered across devices on the first (batch) dimension.
+        # The output is a list of tensors scattered across devices according to the
+        # distribution of dense_x.
+        x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids)
+        # debug prints
+        # print(x)
+
+        # embeddings
+        ly = self.apply_emb(lS_o, lS_i, self.emb_l)
+        # debug prints
+        # print(ly)
+
+        # butterfly shuffle (implemented inefficiently for now)
+        # WARNING: Note that at this point we have the result of the embedding lookup
+        # for the entire batch on each device. We would like to obtain partial results
+        # corresponding to all embedding lookups, but part of the batch on each device.
+        # Therefore, matching the distribution of output of bottom mlp, so that both
+        # could be used for subsequent interactions on each device.
+        if len(self.emb_l) != len(ly):
+            sys.exit("ERROR: corrupted intermediate result in parallel_forward call")
+
+        t_list = []
+        for k, _ in enumerate(self.emb_l):
+            d = torch.device("cuda:" + str(k % ndevices))
+            y = scatter(ly[k], device_ids, dim=0)
+            t_list.append(y)
+        # adjust the list to be ordered per device
+        ly = list(map(lambda y: list(y), zip(*t_list)))
+        # debug prints
+        # print(ly)
+
+        # interactions
+        z = []
+        for k in range(ndevices):
+            zk = self.interact_features(x[k], ly[k])
+            z.append(zk)
+        # debug prints
+        # print(z)
+
+        # top mlp
+        # WARNING: Note that the self.top_l is a list of top mlp modules that
+        # have been replicated across devices, while z is a list of interaction results
+        # that by construction are scattered across devices on the first (batch) dim.
+        # The output is a list of tensors scattered across devices according to the
+        # distribution of z.
+        p = parallel_apply(self.top_l_replicas, z, None, device_ids)
+
+        ### gather the distributed results ###
+        p0 = gather(p, self.output_d, dim=0)
+
+        # clamp output if needed
+        if 0.0 < self.loss_threshold and self.loss_threshold < 1.0:
+            z0 = torch.clamp(
+                p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
+            )
+        else:
+            z0 = p0
+
+        return z0
+
+
+def dash_separated_ints(value):
+    vals = value.split('-')
+    for val in vals:
+        try:
+            int(val)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                "%s is not a valid dash separated list of ints" % value)
+
+    return value
+
+
+def dash_separated_floats(value):
+    vals = value.split('-')
+    for val in vals:
+        try:
+            float(val)
+        except ValueError:
+            raise argparse.ArgumentTypeError(
+                "%s is not a valid dash separated list of floats" % value)
+
+    return value
+
+
+if __name__ == "__main__":
+    ### import packages ###
+    import sys
+    import os
+    import argparse
+
+    ### parse arguments ###
+    parser = argparse.ArgumentParser(
+        description="Train Deep Learning Recommendation Model (DLRM)"
+    )
+    # model related parameters
+    parser.add_argument("--arch-sparse-feature-size", type=int, default=2)
+
+    parser.add_argument(
+        "--arch-embedding-size", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument("--arch-project-size", type=int, default=0)
+
+    # j will be replaced with the table number
+    parser.add_argument(
+        "--arch-mlp-bot", type=dash_separated_ints, default="4-3-2")
+    parser.add_argument(
+        "--arch-mlp-top", type=dash_separated_ints, default="4-2-1")
+    parser.add_argument(
+        "--arch-interaction-op", type=str, choices=['dot', 'cat'], default="dot")
+    parser.add_argument("--arch-interaction-itself", action="store_true", default=False)
+    # embedding table options
+    parser.add_argument("--md-flag", action="store_true", default=False)
+    parser.add_argument("--md-threshold", type=int, default=200)
+    parser.add_argument("--md-temperature", type=float, default=0.3)
+    parser.add_argument("--md-round-dims", action="store_true", default=False)
+    parser.add_argument("--qr-flag", action="store_true", default=False)
+    parser.add_argument("--qr-threshold", type=int, default=200)
+    parser.add_argument("--qr-operation", type=str, default="mult")
+    parser.add_argument("--qr-collisions", type=int, default=4)
+    # activations and loss
+    parser.add_argument("--activation-function", type=str, default="relu")
+    parser.add_argument("--loss-function", type=str, default="mse")  # or bce or wbce
+    parser.add_argument(
+        "--loss-weights", type=dash_separated_floats, default="1.0-1.0")  # for wbce
+    parser.add_argument("--loss-threshold", type=float, default=0.0)  # 1.0e-7
+    parser.add_argument("--round-targets", type=bool, default=False)
+    # data
+    parser.add_argument("--data-size", type=int, default=1)
+    parser.add_argument("--num-batches", type=int, default=0)
+    parser.add_argument(
+        "--data-generation", type=str, default="random"
+    )  # synthetic or dataset
+    parser.add_argument("--synthetic-data-folder", type=str,
+        default="./synthetic_data/syn_data_bs65536")
+    # add Gaussian distribution
+    parser.add_argument("--rand-data-dist", type=str, default="uniform")  # uniform or gaussian
+    parser.add_argument("--rand-data-min", type=float, default=0)
+    parser.add_argument("--rand-data-max", type=float, default=1)
+    parser.add_argument("--rand-data-mu", type=float, default=-1)
+    parser.add_argument("--rand-data-sigma", type=float, default=1)
+
+    parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log")
+    parser.add_argument("--data-set", type=str, default="kaggle")  # or terabyte
+    parser.add_argument("--raw-data-file", type=str, default="")
+    parser.add_argument("--processed-data-file", type=str, default="")
+    parser.add_argument("--data-randomize", type=str, default="total")  # or day or none
+    parser.add_argument("--data-trace-enable-padding", type=bool, default=False)
+    parser.add_argument("--max-ind-range", type=int, default=-1)
+    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
+    parser.add_argument("--num-indices-per-lookup", type=int, default=10)
+    parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False)
+    parser.add_argument("--num-workers", type=int, default=0)
+    parser.add_argument("--memory-map", action="store_true", default=False)
+    # training
+    parser.add_argument("--mini-batch-size", type=int, default=1)
+    parser.add_argument("--nepochs", type=int, default=1)
+    parser.add_argument("--learning-rate", type=float, default=0.01)
+    parser.add_argument("--print-precision", type=int, default=5)
+    parser.add_argument("--numpy-rand-seed", type=int, default=123)
+    parser.add_argument("--sync-dense-params", type=bool, default=True)
+    # inference
+    parser.add_argument("--inference-only", action="store_true", default=False)
+    # onnx
+    parser.add_argument("--save-onnx", action="store_true", default=False)
+    # gpu
+    parser.add_argument("--use-gpu", action="store_true", default=False)
+    # distributed run
+    parser.add_argument("--dist-backend", type=str, default="")
+    # debugging and profiling
+    parser.add_argument("--print-freq", type=int, default=1)
+    parser.add_argument("--test-freq", type=int, default=-1)
+    parser.add_argument("--test-mini-batch-size", type=int, default=-1)
+    parser.add_argument("--test-num-workers", type=int, default=-1)
+    parser.add_argument("--print-time", action="store_true", default=False)
+    parser.add_argument("--debug-mode", action="store_true", default=False)
+    parser.add_argument("--enable-profiling", action="store_true", default=False)
+    parser.add_argument("--plot-compute-graph", action="store_true", default=False)
+    # store/load model
+    parser.add_argument("--out-dir", type=str, default=".")
+    parser.add_argument("--save-model", type=str, default="")
+    parser.add_argument("--load-model", type=str, default="")
+    # mlperf logging (disables other output and stops early)
+    parser.add_argument("--mlperf-logging", action="store_true", default=False)
+    # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107
+    parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0)
+    # stop at target AUC Terabyte (no subsampling) 0.8025
+    parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0)
+    parser.add_argument("--mlperf-bin-loader", action='store_true', default=False)
+    parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False)
+
+    # LR policy
+    parser.add_argument("--lr-num-warmup-steps", type=int, default=0)
+    parser.add_argument("--lr-decay-start-step", type=int, default=0)
+    parser.add_argument("--lr-num-decay-steps", type=int, default=0)
+
+    args = parser.parse_args()
+
+    print(socket.gethostname())
+
+    ext_dist.init_distributed(backend=args.dist_backend)
+
+    # print("success size= ", ext_dist.my_size, ext_dist.my_rank)
+
+    ext_dist.barrier()
+
+    if args.mlperf_logging:
+        print('command line args: ', json.dumps(vars(args)))
+
+    ### some basic setup ###
+    np.random.seed(args.numpy_rand_seed)
+    np.set_printoptions(precision=args.print_precision)
+    torch.set_printoptions(precision=args.print_precision)
+    torch.manual_seed(args.numpy_rand_seed)
+
+    if (args.test_mini_batch_size < 0):
+        # if the parameter is not set, use the training batch size
+        args.test_mini_batch_size = args.mini_batch_size
+    if (args.test_num_workers < 0):
+        # if the parameter is not set, use the same parameter for training
+        args.test_num_workers = args.num_workers
+    if args.mini_batch_size % ext_dist.my_size !=0 or args.test_mini_batch_size % ext_dist.my_size != 0:
+        print("Either test minibatch (%d) or train minibatch (%d) does not split across %d ranks" % (args.test_mini_batch_size, args.mini_batch_size, ext_dist.my_size))
+        sys.exit(1)
+
+    use_gpu = args.use_gpu and torch.cuda.is_available()
+    if use_gpu:
+        torch.cuda.manual_seed_all(args.numpy_rand_seed)
+        torch.backends.cudnn.deterministic = True
+        if ext_dist.my_size > 1:
+            ngpus = torch.cuda.device_count()  # 1
+            if ext_dist.my_local_size > torch.cuda.device_count():
+                print("Not sufficient GPUs available... local_size = %d, ngpus = %d" % (ext_dist.my_local_size, ngpus))
+                sys.exit(1)
+            ngpus = 1
+            device = torch.device("cuda", ext_dist.my_local_rank)
+        else:
+            device = torch.device("cuda", 0)
+            ngpus = torch.cuda.device_count()  # 1
+            ngpus=1
+        print("Using {} GPU(s)...".format(ngpus))
+    else:
+        device = torch.device("cpu")
+        print("Using CPU...")
+
+    ### prepare training data ###
+    ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-")
+    # input data
+    if (args.data_generation == "dataset"):
+
+        train_data, train_ld, test_data, test_ld = \
+            dp.make_criteo_data_and_loaders(args)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+        nbatches_test = len(test_ld)
+
+        ln_emb = train_data.counts
+        # enforce maximum limit on number of vectors per embedding
+        if args.max_ind_range > 0:
+            ln_emb = np.array(list(map(
+                lambda x: x if x < args.max_ind_range else args.max_ind_range,
+                ln_emb
+            )))
+        m_den = train_data.m_den
+        ln_bot[0] = m_den
+
+    elif args.data_generation == "synthetic":
+        # input and target at random
+        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
+        m_den = ln_bot[0]
+        train_data, train_ld = dd.data_loader(args, ln_emb, m_den)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+        table_feature_map = None #  {idx : idx for idx in range(len(ln_emb))}
+
+    else:
+        # input and target at random
+        ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-")
+        m_den = ln_bot[0]
+        train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den)
+        nbatches = args.num_batches if args.num_batches > 0 else len(train_ld)
+
+    ### parse command line arguments ###
+    m_spa = args.arch_sparse_feature_size
+    num_fea = ln_emb.size + 1  # num sparse + num dense features
+    m_den_out = ln_bot[ln_bot.size - 1]
+    if args.arch_interaction_op == "dot":
+        # approach 1: all
+        # num_int = num_fea * num_fea + m_den_out
+        # approach 2: unique
+        if (args.arch_project_size > 0):
+            num_int = num_fea * args.arch_project_size + m_den_out
+        else:
+            if args.arch_interaction_itself:
+                num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out
+            else:
+                num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out
+    elif args.arch_interaction_op == "cat":
+        num_int = num_fea * m_den_out
+    else:
+        sys.exit(
+            "ERROR: --arch-interaction-op="
+            + args.arch_interaction_op
+            + " is not supported"
+        )
+    arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top
+    ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-")
+
+    # sanity check: feature sizes and mlp dimensions must match
+    if m_den != ln_bot[0]:
+        sys.exit(
+            "ERROR: arch-dense-feature-size "
+            + str(m_den)
+            + " does not match first dim of bottom mlp "
+            + str(ln_bot[0])
+        )
+    if args.qr_flag:
+        if args.qr_operation == "concat" and 2 * m_spa != m_den_out:
+            sys.exit(
+                "ERROR: 2 arch-sparse-feature-size "
+                + str(2 * m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+                + " (note that the last dim of bottom mlp must be 2x the embedding dim)"
+            )
+        if args.qr_operation != "concat" and m_spa != m_den_out:
+            sys.exit(
+                "ERROR: arch-sparse-feature-size "
+                + str(m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+            )
+    else:
+        if m_spa != m_den_out:
+            sys.exit(
+                "ERROR: arch-sparse-feature-size "
+                + str(m_spa)
+                + " does not match last dim of bottom mlp "
+                + str(m_den_out)
+            )
+    if num_int != ln_top[0]:
+        sys.exit(
+            "ERROR: # of feature interactions "
+            + str(num_int)
+            + " does not match first dimension of top mlp "
+            + str(ln_top[0])
+        )
+
+    # assign mixed dimensions if applicable
+    if args.md_flag:
+        m_spa = md_solver(
+            torch.tensor(ln_emb),
+            args.md_temperature,  # alpha
+            d0=m_spa,
+            round_dim=args.md_round_dims
+        ).tolist()
+
+    # test prints (model arch)
+    if args.debug_mode:
+        print("model arch:")
+        print(
+            "mlp top arch "
+            + str(ln_top.size - 1)
+            + " layers, with input to output dimensions:"
+        )
+        print(ln_top)
+        print("# of interactions")
+        print(num_int)
+        print(
+            "mlp bot arch "
+            + str(ln_bot.size - 1)
+            + " layers, with input to output dimensions:"
+        )
+        print(ln_bot)
+        print("# of features (sparse and dense)")
+        print(num_fea)
+        print("dense feature size")
+        print(m_den)
+        print("sparse feature size")
+        print(m_spa)
+        print(
+            "# of embeddings (= # of sparse features) "
+            + str(ln_emb.size)
+            + ", with dimensions "
+            + str(m_spa)
+            + "x:"
+        )
+        print(ln_emb)
+
+        print("data (inputs and targets):")
+        for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
+            # early exit if nbatches was set by the user and has been exceeded
+            if nbatches > 0 and j >= nbatches:
+                break
+
+            print("mini-batch: %d" % j)
+            print(X.detach().cpu().numpy())
+            # transform offsets to lengths when printing
+            print(
+                [
+                    np.diff(
+                        S_o.detach().cpu().tolist() + list(lS_i[i].shape)
+                    ).tolist()
+                    for i, S_o in enumerate(lS_o)
+                ]
+            )
+            print([S_i.detach().cpu().tolist() for S_i in lS_i])
+            print(T.detach().cpu().numpy())
+
+    ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1
+
+    ### construct the neural network specified above ###
+    # WARNING: to obtain exactly the same initialization for
+    # the weights we need to start from the same random seed.
+    # np.random.seed(args.numpy_rand_seed)
+    dlrm = DLRM_Net(
+        m_spa,
+        ln_emb,
+        ln_bot,
+        ln_top,
+        args.arch_project_size,
+        arch_interaction_op=args.arch_interaction_op,
+        arch_interaction_itself=args.arch_interaction_itself,
+        sigmoid_bot=-1,
+        sigmoid_top=ln_top.size - 2,
+        sync_dense_params=args.sync_dense_params,
+        loss_threshold=args.loss_threshold,
+        ndevices=ndevices,
+        qr_flag=args.qr_flag,
+        qr_operation=args.qr_operation,
+        qr_collisions=args.qr_collisions,
+        qr_threshold=args.qr_threshold,
+        md_flag=args.md_flag,
+        md_threshold=args.md_threshold,
+    )
+    # test prints
+    if args.debug_mode:
+        print("initial parameters (weights and bias):")
+        for param in dlrm.parameters():
+            print(param.detach().cpu().numpy())
+        # print(dlrm)
+
+    if use_gpu:
+        # Custom Model-Data Parallel
+        # the mlps are replicated and use data parallelism, while
+        # the embeddings are distributed and use model parallelism
+        dlrm = dlrm.to(device)  # .cuda()
+        if dlrm.ndevices > 1:
+            dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb)
+
+    if ext_dist.my_size > 1:
+        if use_gpu:
+            device_ids = [ext_dist.my_local_rank]
+            dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids)
+            dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids)
+        else:
+            dlrm.bot_l = DDP(dlrm.bot_l)
+            dlrm.top_l = DDP(dlrm.top_l)
+
+    # specify the loss function
+    if args.loss_function == "mse":
+        loss_fn = torch.nn.MSELoss(reduction="mean")
+    elif args.loss_function == "bce":
+        loss_fn = torch.nn.BCELoss(reduction="mean")
+    elif args.loss_function == "wbce":
+        loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-"))
+        loss_fn = torch.nn.BCELoss(reduction="none")
+    else:
+        sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported")
+
+    if not args.inference_only:
+        # specify the optimizer algorithm
+
+        if ext_dist.my_size == 1:
+            optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate)
+            #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step,
+            #                                 args.lr_num_decay_steps)
+        else:
+            optimizer = torch.optim.SGD([
+                {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr" : args.learning_rate},
+                {"params": dlrm.bot_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size},
+                {"params": dlrm.top_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size}
+            ], lr=args.learning_rate)
+
+    ### main loop ###
+    def time_wrap(use_gpu):
+        if use_gpu:
+            torch.cuda.synchronize()
+        return time.time()
+
+    def dlrm_wrap(X, lS_o, lS_i, use_gpu, device):
+        if use_gpu:  # .cuda()
+            # lS_i can be either a list of tensors or a stacked tensor.
+            # Handle each case below:
+            tm.tmH2D.start()
+            #lS_i = [S_i.to(device) for S_i in lS_i] if isinstance(lS_i, list) \
+            #    else lS_i.to(device)
+            #lS_o = [S_o.to(device) for S_o in lS_o] if isinstance(lS_o, list) \
+            #    else lS_o.to(device)
+            #X = X.to(device)
+            tm.tmH2D.stop()
+
+            return dlrm(
+                X,
+                lS_o,
+                lS_i
+            )
+        else:
+            return dlrm(X, lS_o, lS_i)
+
+    def loss_fn_wrap(Z, T, use_gpu, device):
+        if args.loss_function == "mse" or args.loss_function == "bce":
+            if use_gpu:
+                # return loss_fn(Z, T.to(device))
+                return loss_fn(Z, T)
+            else:
+                return loss_fn(Z, T)
+        elif args.loss_function == "wbce":
+            if use_gpu:
+                loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T).to(device)
+                loss_fn_ = loss_fn(Z, T.to(device))
+            else:
+                loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T)
+                loss_fn_ = loss_fn(Z, T.to(device))
+            loss_sc_ = loss_ws_ * loss_fn_
+            # debug prints
+            # print(loss_ws_)
+            # print(loss_fn_)
+            return loss_sc_.mean()
+
+    # training or inference
+    best_gA_test = 0
+    best_auc_test = 0
+    skip_upto_epoch = 0
+    skip_upto_batch = 0
+    total_time = 0
+    total_loss = 0
+    total_accu = 0
+    total_iter = 0
+    total_samp = 0
+    k = 0
+
+    # Load model is specified
+    if not (args.load_model == ""):
+        print("Loading saved model {}".format(args.load_model))
+        if use_gpu:
+            if dlrm.ndevices > 1:
+                # NOTE: when targeting inference on multiple GPUs,
+                # load the model as is on CPU or GPU, with the move
+                # to multiple GPUs to be done in parallel_forward
+                ld_model = torch.load(args.load_model)
+            else:
+                # NOTE: when targeting inference on single GPU,
+                # note that the call to .to(device) has already happened
+                ld_model = torch.load(
+                    args.load_model,
+                    map_location=torch.device('cuda')
+                    # map_location=lambda storage, loc: storage.cuda(0)
+                )
+        else:
+            # when targeting inference on CPU
+            ld_model = torch.load(args.load_model, map_location=torch.device('cpu'))
+        dlrm.load_state_dict(ld_model["state_dict"])
+        ld_j = ld_model["iter"]
+        ld_k = ld_model["epoch"]
+        ld_nepochs = ld_model["nepochs"]
+        ld_nbatches = ld_model["nbatches"]
+        ld_nbatches_test = ld_model["nbatches_test"]
+        ld_gA = ld_model["train_acc"]
+        ld_gL = ld_model["train_loss"]
+        ld_total_loss = ld_model["total_loss"]
+        ld_total_accu = ld_model["total_accu"]
+        ld_gA_test = ld_model["test_acc"]
+        ld_gL_test = ld_model["test_loss"]
+        if not args.inference_only:
+            optimizer.load_state_dict(ld_model["opt_state_dict"])
+            best_gA_test = ld_gA_test
+            total_loss = ld_total_loss
+            total_accu = ld_total_accu
+            skip_upto_epoch = ld_k  # epochs
+            skip_upto_batch = ld_j  # batches
+        else:
+            args.print_freq = ld_nbatches
+            args.test_freq = 0
+
+        print(
+            "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format(
+                ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test
+            )
+        )
+        print(
+            "Training state: loss = {:.6f}, accuracy = {:3.3f} %".format(
+                ld_gL, ld_gA * 100
+            )
+        )
+        print(
+            "Testing state: loss = {:.6f}, accuracy = {:3.3f} %".format(
+                ld_gL_test, ld_gA_test * 100
+            )
+        )
+
+    ext_dist.barrier()
+    startTime = time.time()
+    startTime0 = startTime
+    skipped = 0
+
+    #print("Processing data")
+    #t1 = time.time()
+    syndatasetlen = min(65536 // args.mini_batch_size, nbatches)
+    #myobj = list(enumerate(train_ld))
+    #t2 = time.time()
+    #print("Processing data takes {} seconds with len={} {} {} {}".format(t2-t1, len(myobj), nbatches, args.mini_batch_size, syndatasetlen))
+    print("time/loss/accuracy (if enabled):")
+    with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof:
+    # with torch.autograd.profiler.emit_nvtx():
+
+        while k < args.nepochs:
+            if k < skip_upto_epoch:
+                continue
+
+            if use_gpu:
+                tm.tmSync1.start()
+                torch.cuda.synchronize()
+                tm.tmSync1.stop()
+            accum_time_begin = time.time()
+
+            if args.mlperf_logging:
+                previous_iteration_time = None
+
+            # for j, (X, lS_o, lS_i, T) in enumerate(train_ld):
+            for j in range(nbatches):
+
+                if (skipped == 2):
+                    ext_dist.barrier()
+                    startTime = time.time()
+                    ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank)
+                    # torch.cuda.profiler.cudart().cudaProfilerStart()
+                    if use_gpu:
+                        torch.cuda.profiler.start()
+                    tm.tmClear()
+                skipped = skipped + 1
+
+                tm.tmGetData.start()
+                if j==0 and use_gpu:
+                    # X, lS_o, lS_i, T = train_data.__getitem__(j%syndatasetlen)
+                    X, lS_o, lS_i, T = next(enumerate(train_ld)
+
+                    print("BB0 X size {} lS_i[0] size {}".format(X.size(), lS_i[0].size()))
+                    mybatch_size = X.size()[0]
+                    if ext_dist.my_size > 1:
+                        X = X[ext_dist.get_my_slice(mybatch_size)]
+                        lS_o = lS_o[dlrm.local_emb_slice]
+                        lS_i = lS_i[dlrm.local_emb_slice]
+
+                    lS_i = [S_i.to(device) for S_i in lS_i] if isinstance(lS_i, list) \
+                        else lS_i.to(device)
+                    lS_o = [S_o.to(device) for S_o in lS_o] if isinstance(lS_o, list) \
+                        else lS_o.to(device)
+                    X = X.to(device)
+                    T = T.to(device)
+                    print("BBB X size {} lS_i[0] size {}".format(X.size(), lS_i[0].size()))
+
+                tm.tmGetData.stop()
+
+                if j == 0 and args.save_onnx:
+                    (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i)
+
+                if j < skip_upto_batch:
+                    continue
+
+                if args.mlperf_logging:
+                    current_time = time_wrap(use_gpu)
+                    if previous_iteration_time:
+                        iteration_time = current_time - previous_iteration_time
+                    else:
+                        iteration_time = 0
+                    previous_iteration_time = current_time
+                else:
+                    if use_gpu:
+                        tm.tmSync2.start()
+                        torch.cuda.synchronize()
+                        tm.tmSync2.stop()
+                    t1 = time.time()
+
+                # early exit if nbatches was set by the user and has been exceeded
+                if nbatches > 0 and j >= nbatches:
+                    break
+                '''
+                # debug prints
+                print("input and targets")
+                print(X.detach().cpu().numpy())
+                print([np.diff(S_o.detach().cpu().tolist()
+                       + list(lS_i[i].shape)).tolist() for i, S_o in enumerate(lS_o)])
+                print([S_i.detach().cpu().numpy().tolist() for S_i in lS_i])
+                print(T.detach().cpu().numpy())
+                '''
+                # Skip the batch if batch size not multiple of total ranks
+                if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0:
+                    print("Warning: Skiping the batch %d with size %d" % (j, X.size(0)))
+                    continue
+
+
+                # forward pass
+                tm.tmFwd.start()
+                Z = dlrm_wrap(X, lS_o, lS_i, use_gpu, device)
+                tm.tmFwd.stop()
+
+                # loss
+                tm.tmLoss.start()
+                E = loss_fn_wrap(Z, T, use_gpu, device)
+                '''
+                # debug prints
+                print("output and loss")
+                print(Z.detach().cpu().numpy())
+                print(E.detach().cpu().numpy())
+                '''
+                # compute loss and accuracy
+                L = E.detach().cpu().numpy()  # numpy array
+                S = Z.detach().cpu().numpy()  # numpy array
+                T0 = T.detach().cpu().numpy()  # numpy array
+                mbs = T0.shape[0]  # = args.mini_batch_size except maybe for last
+                A = np.sum((np.round(S, 0) == T0).astype(np.uint8))
+                tm.tmLoss.stop()
+
+                if not args.inference_only:
+                    # scaled error gradient propagation
+                    # (where we do not accumulate gradients across mini-batches)
+                    tm.tmZero.start()
+                    optimizer.zero_grad()
+                    tm.tmZero.stop()
+
+                    # backward pass
+                    tm.tmBwd.start()
+                    E.backward()
+                    tm.tmBwd.stop()
+
+                    # debug prints (check gradient norm)
+                    # for l in mlp.layers:
+                    #     if hasattr(l, 'weight'):
+                    #          print(l.weight.grad.norm().item())
+
+                    # optimizer
+                    tm.tmOpt.start()
+                    optimizer.step()
+                    tm.tmOpt.stop()
+
+                    ### lr_scheduler.step()
+
+                if args.mlperf_logging:
+                    total_time += iteration_time
+                else:
+                    if use_gpu:
+                        tm.tmSync3.start()
+                        torch.cuda.synchronize()
+                        tm.tmSync3.stop()
+                    t2 = time.time()
+                    total_time += t2 - t1
+
+                total_accu += A
+                total_loss += L * mbs
+                total_iter += 1
+                total_samp += mbs
+
+                should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches)
+                should_test = (
+                    (args.test_freq > 0)
+                    and (args.data_generation == "dataset")
+                    and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches))
+                )
+
+                # print time, loss and accuracy
+                if should_print or should_test:
+                    gT = 1000.0 * total_time / total_iter if args.print_time else -1
+                    total_time = 0
+
+                    gA = total_accu / total_samp
+                    total_accu = 0
+
+                    gL = total_loss / total_samp
+                    total_loss = 0
+
+                    str_run_type = "inference" if args.inference_only else "training"
+                    print(
+                        "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format(
+                            str_run_type, j + 1, nbatches, k, gT
+                        )
+                        + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL,
+                            gA * 100, total_iter, ext_dist.my_rank)
+                    )
+                    # Print the wall-clock time accumulated since accum_time_begin (ranks below 2 only)
+                    if ext_dist.my_rank < 2:
+                      tt1 = time.time()
+                      ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \
+                       .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1))
+                    total_iter = 0
+                    total_samp = 0
+
+                # testing
+                if should_test and not args.inference_only:
+                    # don't measure training iter time in a test iteration
+                    if args.mlperf_logging:
+                        previous_iteration_time = None
+
+                    test_accu = 0
+                    test_loss = 0
+                    test_samp = 0
+
+                    accum_test_time_begin = time_wrap(use_gpu)
+                    if args.mlperf_logging:
+                        scores = []
+                        targets = []
+
+                    for i, (X_test, lS_o_test, lS_i_test, T_test) in enumerate(test_ld):
+                        # early exit if nbatches was set by the user and was exceeded
+                        if nbatches > 0 and i >= nbatches:
+                            break
+
+                        # Skip the batch if batch size not multiple of total ranks
+                        if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0:
+                            print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0)))
+                            continue
+
+                        t1_test = time_wrap(use_gpu)
+
+                        # forward pass
+                        Z_test = dlrm_wrap(
+                            X_test, lS_o_test, lS_i_test, use_gpu, device
+                        )
+                        if args.mlperf_logging:
+                            S_test = Z_test.detach().cpu().numpy()  # numpy array
+                            T_test = T_test.detach().cpu().numpy()  # numpy array
+                            scores.append(S_test)
+                            targets.append(T_test)
+                        else:
+                            # loss
+                            E_test = loss_fn_wrap(Z_test, T_test, use_gpu, device)
+
+                            # compute loss and accuracy
+                            L_test = E_test.detach().cpu().numpy()  # numpy array
+                            S_test = Z_test.detach().cpu().numpy()  # numpy array
+                            T_test = T_test.detach().cpu().numpy()  # numpy array
+                            mbs_test = T_test.shape[0]  # = mini_batch_size except last
+                            A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8))
+                            test_accu += A_test
+                            test_loss += L_test * mbs_test
+                            test_samp += mbs_test
+
+                        t2_test = time_wrap(use_gpu)
+
+                    if args.mlperf_logging:
+                        scores = np.concatenate(scores, axis=0)
+                        targets = np.concatenate(targets, axis=0)
+
+                        metrics = {
+                            'loss' : sklearn.metrics.log_loss,
+                            'recall' : lambda y_true, y_score:
+                            sklearn.metrics.recall_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'precision' : lambda y_true, y_score:
+                            sklearn.metrics.precision_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'f1' : lambda y_true, y_score:
+                            sklearn.metrics.f1_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            'ap' : sklearn.metrics.average_precision_score,
+                            'roc_auc' : sklearn.metrics.roc_auc_score,
+                            'accuracy' : lambda y_true, y_score:
+                            sklearn.metrics.accuracy_score(
+                                y_true=y_true,
+                                y_pred=np.round(y_score)
+                            ),
+                            # 'pre_curve' : sklearn.metrics.precision_recall_curve,
+                            # 'roc_curve' :  sklearn.metrics.roc_curve,
+                        }
+
+                        # print("Compute time for validation metric : ", end="")
+                        # first_it = True
+                        validation_results = {}
+                        for metric_name, metric_function in metrics.items():
+                            # if first_it:
+                            #     first_it = False
+                            # else:
+                            #     print(", ", end="")
+                            # metric_compute_start = time_wrap(False)
+                            validation_results[metric_name] = metric_function(
+                                targets,
+                                scores
+                            )
+                            # metric_compute_end = time_wrap(False)
+                            # met_time = metric_compute_end - metric_compute_start
+                            # print("{} {:.4f}".format(metric_name, 1000 * (met_time)),
+                            #      end="")
+                        # print(" ms")
+                        gA_test = validation_results['accuracy']
+                        gL_test = validation_results['loss']
+                    else:
+                        gA_test = test_accu / test_samp
+                        gL_test = test_loss / test_samp
+
+                    is_best = gA_test > best_gA_test
+                    if is_best:
+                        best_gA_test = gA_test
+                        if not (args.save_model == ""):
+                            print("Saving model to {}".format(args.save_model))
+                            torch.save(
+                                {
+                                    "epoch": k,
+                                    "nepochs": args.nepochs,
+                                    "nbatches": nbatches,
+                                    "nbatches_test": nbatches_test,
+                                    "iter": j + 1,
+                                    "state_dict": dlrm.state_dict(),
+                                    "train_acc": gA,
+                                    "train_loss": gL,
+                                    "test_acc": gA_test,
+                                    "test_loss": gL_test,
+                                    "total_loss": total_loss,
+                                    "total_accu": total_accu,
+                                    "opt_state_dict": optimizer.state_dict(),
+                                },
+                                args.save_model,
+                            )
+
+                    if args.mlperf_logging:
+                        is_best = validation_results['roc_auc'] > best_auc_test
+                        if is_best:
+                            best_auc_test = validation_results['roc_auc']
+                        # MLPerf run: report every validation metric plus the best AUC / accuracy so far
+                        print(
+                            "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)
+                            + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format(
+                                validation_results['loss'],
+                                validation_results['recall'],
+                                validation_results['precision']
+                            )
+                            + " f1 {:.4f}, ap {:.4f},".format(
+                                validation_results['f1'],
+                                validation_results['ap'],
+                            )
+                            + " auc {:.4f}, best auc {:.4f},".format(
+                                validation_results['roc_auc'],
+                                best_auc_test
+                            )
+                            + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format(
+                                validation_results['accuracy'] * 100,
+                                best_gA_test * 100
+                            )
+                        )
+                    else:  # plain run: report loss and accuracy only
+                        print(
+                            "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k)  # k = current epoch, as in the MLPerf branch
+                            + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format(
+                                gL_test, gA_test * 100, best_gA_test * 100
+                            )
+                        )
+                    # Uncomment the line below to print out the total time with overhead
+                    # print("Total test time for this group: {}" \
+                    # .format(time_wrap(use_gpu) - accum_test_time_begin))
+
+                    if (args.mlperf_logging
+                        and (args.mlperf_acc_threshold > 0)
+                        and (best_gA_test > args.mlperf_acc_threshold)):
+                        print("MLPerf testing accuracy threshold "
+                              + str(args.mlperf_acc_threshold)
+                              + " reached, stop training")
+                        break
+
+                    if (args.mlperf_logging
+                        and (args.mlperf_auc_threshold > 0)
+                        and (best_auc_test > args.mlperf_auc_threshold)):
+                        print("MLPerf testing auc threshold "
+                              + str(args.mlperf_auc_threshold)
+                              + " reached, stop training")
+                        break
+
+                #if (ext_dist.my_rank == 0 and should_print):
+                #    print("ITER : ", j, " from nvidia-smi")
+                #    os.system("nvidia-smi")
+
+            k += 1  # nepochs
+
+    #if (ext_dist.my_rank == 0):
+    #    # print(torch.cuda.memory_allocated(0))
+    #    print(torch.cuda.memory_summary(0))
+    #    # print("from nvidia-smi")
+    #    os.system("nvidia-smi")
+
+    tt2 = time.time()  # local end of work, before waiting on the other ranks
+    endTime = tt2 - startTime
+    ext_dist.barrier()
+    tt3 = time.time()  # end of work including the barrier wait
+    finalTime = tt3 - startTime
+    if use_gpu:  # the CUDA profiler is only started on GPU runs, so only stop it there
+        torch.cuda.profiler.stop()
+    if (skipped > 2):
+        skipped -= 2
+    ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \
+        iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0,
+        finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True)
+    if (ext_dist.my_rank < 2):
+        tm.tmSummary(ext_dist.my_rank)
+
+    file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank)
+    # profiling
+    if args.enable_profiling:
+        os.makedirs(args.out_dir, exist_ok=True)
+        with open("TT"+str(uuid.uuid4().hex), "w") as prof_f:
+            prof_f.write(prof.key_averages(group_by_input_shape=True).table(
+                sort_by="self_cpu_time_total",
+            ))
+
+#        with open("%s.prof" % file_prefix, "w") as prof_f:
+#            prof_f.write(prof.key_averages().table(sort_by="cpu_time_total"))
+#            prof.export_chrome_trace("./%s.json" % file_prefix)
+#            print(prof.key_averages().table(sort_by="cpu_time_total"))
+
+    # plot compute graph
+    if args.plot_compute_graph:
+        sys.exit(
+            "ERROR: Please install pytorchviz package in order to use the"
+            + " visualization. Then, uncomment its import above as well as"
+            + " three lines below and run the code again."
+        )
+        # os.makedirs(args.out_dir, exist_ok=True)
+        # V = Z.mean() if args.inference_only else E
+        # dot = make_dot(V, params=dict(dlrm.named_parameters()))
+        # dot.render('%s_graph' % file_prefix) # write .pdf file
+
+    # test prints
+    if not args.inference_only and args.debug_mode:
+        print("updated parameters (weights and bias):")
+        for param in dlrm.parameters():
+            print(param.detach().cpu().numpy())
+
+    # export the model in onnx
+    if args.save_onnx:
+        # trace with the inputs captured on the first training iteration (X_onnx, lS_o_onnx, lS_i_onnx)
+        dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx"
+        torch.onnx.export(
+            dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True
+        )
+
+        # recover the model back from the file just written (the per-rank file_prefix path is never exported)
+        dlrm_pytorch_onnx = onnx.load(dlrm_pytorch_onnx_file)
+        # check the onnx model
+        onnx.checker.check_model(dlrm_pytorch_onnx)