diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..a81c8ee1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/Dockerfile b/Dockerfile index c4b67626..0e4b7500 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,5 +9,7 @@ FROM ${FROM_IMAGE_NAME} ADD requirements.txt . RUN pip install -r requirements.txt +RUN pip install torch==1.3.1 + WORKDIR /code ADD . . diff --git a/README.md b/README.md index 8cc3c108..0dd02696 100644 --- a/README.md +++ b/README.md @@ -311,23 +311,24 @@ Benchmarking ./bench/dlrm_s_criteo_terabyte.sh ["--test-freq=10240 --memory-map --data-sub-sample-rate=0.875"] ``` - Corresponding pre-trained model is available under [CC-BY-NC license](https://creativecommons.org/licenses/by-nc/2.0/) and can be downloaded here - [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) + + [dlrm_emb64_subsample0.875_maxindrange10M_pretrained.pt](https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt) *NOTE: Benchmarking scripts accept extra arguments which will be passed along to the model, such as --num-batches=100 to limit the number of data samples* -4) The code supports interface with [MLPerf benchmark](https://mlperf.org). +4) The code supports interface with [MLPerf benchmark](https://mlperf.org). 
- Please refer to the following training parameters ``` --mlperf-logging that keeps track of multiple metrics, including area under the curve (AUC) - + --mlperf-acc-threshold that allows early stopping based on accuracy metric - + --mlperf-auc-threshold that allows early stopping based on AUC metric - + --mlperf-bin-loader that enables preprocessing of data into a single binary file - + --mlperf-bin-shuffle that controls whether a random shuffle of mini-batches is performed ``` - The MLPerf training model is completely specified and can be trained using the following script @@ -367,6 +368,8 @@ pydot (*optional*) torchviz (*optional*) +tqdm + License ------- diff --git a/README.params.md b/README.params.md new file mode 100644 index 00000000..86a43f71 --- /dev/null +++ b/README.params.md @@ -0,0 +1,51 @@ + +# DLRM Distributed Branch + +Extend the PyTorch implementation to run DLRM on multiple nodes on distributed platforms. +The distributed version will be needed when data model becomes large. + +It inherits all the parameters from the master DLRM implementation. +The distributed version adds one more parameter: + +**--dist-backend**: + The backend support for the distributed version. As in the torch.distributed package, + it can be "nccl", "mpi", or "gloo". + +In addition, it introduces the following new parameter: +**--arch-project-size** : + Reducing the number of interaction features for the dot operation. + A projection operation is applied to the dotted features to reduce its dimension size. + This is mainly due to the memory concern. It reduces the memory size needed for top MLP. + A side effect is that it may also improve the model accuracy. + +## Usage + +Currently, it is launched with mpirun on multi-nodes. A hostfile needs to be created or +a host list should be given. The DLRM parameters should be given in the same way as single +node master branch. +```bash +mpirun -np 128 -hostfile hostfile python dlrm_s_pytorch.py ... 
+``` + +## Example +```bash +python dlrm_s_pytorch.py + --arch-sparse-feature-size=128 + --arch-mlp-bot="2000-1024-1024-128" + --arch-mlp-top="4096-4096-4096-1" + --arch-embedding-size=$large_arch_emb + --data-generation=random + --loss-function=bce + --round-targets=True + --learning-rate=0.1 + --mini-batch-size=2048 + --print-freq=10240 + --print-time + --test-mini-batch-size=16384 + --test-num-workers=16 + --num-indices-per-lookup-fixed=1 + --num-indices-per-lookup=100 + --arch-projection-size 30 + --use-gpu +``` + diff --git a/data_loader_terabyte.py b/data_loader_terabyte.py index b520fc96..d1a38efa 100644 --- a/data_loader_terabyte.py +++ b/data_loader_terabyte.py @@ -325,7 +325,7 @@ def _test_bin(): original_dataset = CriteoDataset( dataset='terabyte', - max_ind_range=-1, + max_ind_range=10 * 1000 * 1000, sub_sample_rate=1, randomize=True, split=args.split, diff --git a/dlrm_data.py b/dlrm_data.py new file mode 100644 index 00000000..28afd8da --- /dev/null +++ b/dlrm_data.py @@ -0,0 +1,277 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +# +# Description: delivering inputs and targets for the dlrm benchmark +# The inpts and outputs are used according to the following two option(s) +# 1) random distribution, generated and loaded based on uniform distribution +# 2) synthetic data, the synthetic pre-generated data would be loaded. 
class RandomDataset(Dataset):
    """Map-style dataset that synthesizes one random mini-batch per index.

    Every ``dataset[i]`` call generates a fresh batch ``(X, lS_o, lS_i, T)``
    through ``generate_uniform_input_batch`` and
    ``generate_random_output_batch``, so ``len(dataset)`` is the number of
    *batches*, not the number of samples.

    Args:
        m_den: number of dense features per sample.
        ln_emb: iterable with the number of rows of each embedding table.
        data_size: total number of samples; recomputed as
            ``num_batches * mini_batch_size`` when ``num_batches`` is non-zero.
        num_batches: explicit number of batches, or 0 to derive it from
            ``data_size``.
        mini_batch_size: number of samples per generated batch.
        num_indices_per_lookup: (maximum) number of sparse indices per lookup.
        num_indices_per_lookup_fixed: when truthy every lookup uses exactly
            ``num_indices_per_lookup`` indices before de-duplication.
        num_targets: number of target columns per sample.
        round_targets: round the random targets to 0/1 when truthy.
        data_generation: only ``"random"`` can be served by this class.
        trace_file: stored on the instance; not used by this class.
        enable_padding: stored on the instance; not used by this class.
        reset_seed_on_access: re-seed numpy whenever batch 0 is requested.
        rand_seed: seed used for that re-seeding.
    """

    def __init__(
        self,
        m_den,
        ln_emb,
        data_size,
        num_batches,
        mini_batch_size,
        num_indices_per_lookup,
        num_indices_per_lookup_fixed,
        num_targets=1,
        round_targets=False,
        data_generation="random",
        trace_file="",
        enable_padding=False,
        reset_seed_on_access=False,
        rand_seed=0
    ):
        # compute batch size
        nbatches = int(np.ceil((data_size * 1.0) / mini_batch_size))
        if num_batches != 0:
            # an explicit batch count wins and redefines the data size
            nbatches = num_batches
            data_size = nbatches * mini_batch_size

        # save args (recompute data_size if needed)
        self.m_den = m_den
        self.ln_emb = ln_emb
        self.data_size = data_size
        self.num_batches = nbatches
        self.mini_batch_size = mini_batch_size
        self.num_indices_per_lookup = num_indices_per_lookup
        self.num_indices_per_lookup_fixed = num_indices_per_lookup_fixed
        self.num_targets = num_targets
        self.round_targets = round_targets
        self.data_generation = data_generation
        self.trace_file = trace_file
        self.enable_padding = enable_padding
        self.reset_seed_on_access = reset_seed_on_access
        self.rand_seed = rand_seed

    def reset_numpy_seed(self, numpy_rand_seed):
        """Re-seed numpy's global random generator."""
        np.random.seed(numpy_rand_seed)

    def __getitem__(self, index):
        """Return batch ``index`` as ``(X, lS_o, lS_i, T)``.

        A slice returns the list of the corresponding batches.

        Raises:
            ValueError: if ``data_generation`` is not ``"random"``.
        """
        if isinstance(index, slice):
            # slice.indices() normalizes negative and out-of-range bounds
            return [self[idx] for idx in range(*index.indices(len(self)))]

        # Fail with a clear message instead of hitting unbound locals below.
        if self.data_generation != "random":
            raise ValueError(
                "RandomDataset only supports data_generation='random', got "
                + repr(self.data_generation)
            )

        # WARNING: reset seed on access to first element
        # (e.g. if same random samples needed across epochs)
        if self.reset_seed_on_access and index == 0:
            self.reset_numpy_seed(self.rand_seed)

        # number of data points in a batch (the last batch may be shorter)
        n = min(self.mini_batch_size, self.data_size - (index * self.mini_batch_size))

        # generate a batch of dense and sparse features
        (X, lS_o, lS_i) = generate_uniform_input_batch(
            self.m_den,
            self.ln_emb,
            n,
            self.num_indices_per_lookup,
            self.num_indices_per_lookup_fixed
        )

        # generate a batch of target (probability of a click)
        T = generate_random_output_batch(n, self.num_targets, self.round_targets)

        return (X, lS_o, lS_i, T)

    def __len__(self):
        # WARNING: note that we produce batches of outputs in __getitem__
        # therefore we should use num_batches rather than data_size below
        return self.num_batches
class SyntheticDataset(Dataset):
    """Serve mini-batches sliced out of one pre-generated synthetic shard.

    The constructor loads ``X_0.pt``, ``lS_o_0.pt``, ``lS_i_0.pt`` and
    ``T_0.pt`` from ``synthetic_data_folder``. ``dataset[i]`` then returns the
    ``i``-th window of ``mini_batch_size`` samples, wrapping around so that the
    shard is reused when more batches are requested than it contains.

    Args:
        mini_batch_size: number of samples per returned batch.
        ln_emb: stored on the instance; not used by this class.
        nbatches: value reported by ``len(dataset)``.
        synthetic_data_folder: directory holding the four ``*_0.pt`` files.
    """

    def __init__(
        self,
        mini_batch_size,
        ln_emb,
        nbatches=1,
        synthetic_data_folder="./synthetic_data/syn_data_bs65536/",
    ):
        self.synthetic_data_folder = synthetic_data_folder
        self.num_batches = nbatches
        self.mini_batch_size = mini_batch_size
        self.ln_emb = ln_emb

        self.X = torch.load(f"{self.synthetic_data_folder}/X_0.pt")
        self.lS_o = torch.load(f"{self.synthetic_data_folder}/lS_o_0.pt")
        self.lS_i = torch.load(f"{self.synthetic_data_folder}/lS_i_0.pt")
        self.T = torch.load(f"{self.synthetic_data_folder}/T_0.pt")

    def __getitem__(self, index):
        """Return batch ``index`` (modulo the shard size) as ``(X, lS_o, lS_i, T)``.

        Offsets in ``lS_o`` are rebased so that each returned offset tensor
        starts at 0 and addresses the matching slice of ``lS_i``.
        """
        # Number of whole batches the shard can provide. Checked up front:
        # taking ``index % 0`` would raise ZeroDivisionError otherwise.
        batches_available = len(self.X) // self.mini_batch_size
        if batches_available == 0:
            sys.exit(f' mini_batch_size({self.mini_batch_size}) has to be'
                     f' no larger than size of data({len(self.X)})'
                     )

        # modulo the index so the shard is reused
        index = index % batches_available
        sInd = index * self.mini_batch_size
        eInd = sInd + self.mini_batch_size

        X = self.X[sInd:eInd]
        lS_o = [offsets[sInd:eInd] - offsets[sInd] for offsets in self.lS_o]

        lS_i = []
        for offsets, indices in zip(self.lS_o, self.lS_i):
            first = offsets[sInd]
            if eInd < len(offsets):
                # the next batch's first offset marks the end of this batch
                lS_i.append(indices[first:offsets[eInd]])
            else:
                # last window of the shard: take everything that is left
                lS_i.append(indices[first:])

        T = self.T[sInd:eInd]
        return (X, lS_o, lS_i, T)

    def __len__(self):
        return self.num_batches
def data_loader(args, ln_emb, m_den):
    """Build the training dataset and its loader for ``args.data_generation``.

    ``"random"`` dispatches to ``make_random_data_and_loader`` and
    ``"synthetic"`` to ``synthetic_data_loader``; any other value raises
    ``KeyError`` from the lookup. Returns ``(train_data, train_loader)``.
    """
    factories = dict(
        random=make_random_data_and_loader,
        synthetic=synthetic_data_loader,
    )
    build = factories[args.data_generation]
    dataset, loader = build(args, ln_emb, m_den)
    return dataset, loader
# random data from uniform or gaussian distribution (input data)
def generate_dist_input_batch(
    m_den,
    ln_emb,
    n,
    num_indices_per_lookup,
    num_indices_per_lookup_fixed,
    rand_data_dist,
    rand_data_min,
    rand_data_max,
    rand_data_mu,
    rand_data_sigma,
):
    """Generate one random batch of dense features and sparse lookups.

    Args:
        m_den: number of dense features per sample.
        ln_emb: iterable with the number of rows of each embedding table.
        n: number of samples in the batch.
        num_indices_per_lookup: (maximum) number of indices per lookup.
        num_indices_per_lookup_fixed: when truthy every lookup draws exactly
            ``num_indices_per_lookup`` indices (before de-duplication);
            otherwise the count is drawn at random.
        rand_data_dist: ``"uniform"`` or ``"gaussian"``.
        rand_data_min: lower clip bound for gaussian indices.
        rand_data_max: upper clip bound for gaussian indices.
        rand_data_mu: gaussian mean; ``-1`` selects the midpoint of
            ``[rand_data_min, rand_data_max]``.
        rand_data_sigma: gaussian standard deviation.

    Returns:
        ``(Xt, lS_emb_offsets, lS_emb_indices)``: the dense feature tensor of
        shape ``(n, m_den)`` and, per embedding table, a tensor of ``n``
        offsets and a tensor with the concatenated indices of all lookups.

    Raises:
        ValueError: if ``rand_data_dist`` is not a supported distribution.
    """
    # Validate once up front. The original ``raise(<tuple>)`` could never
    # produce this message: Python 3 rejects raising a tuple with TypeError.
    if rand_data_dist not in ("uniform", "gaussian"):
        raise ValueError(
            "{} distribution is not supported. "
            "please select uniform or gaussian".format(rand_data_dist)
        )

    # the default gaussian mean is loop invariant, resolve it once
    if rand_data_dist == "gaussian" and rand_data_mu == -1:
        rand_data_mu = (rand_data_max + rand_data_min) / 2.0

    # dense feature
    Xt = torch.tensor(ra.rand(n, m_den).astype(np.float32))

    # sparse feature (sparse indices)
    lS_emb_offsets = []
    lS_emb_indices = []
    # for each embedding generate a list of n lookups,
    # where each lookup is composed of multiple sparse indices
    for size in ln_emb:
        lS_batch_offsets = []
        lS_batch_indices = []
        offset = 0
        for _ in range(n):
            # num of sparse indices to be used per embedding
            if num_indices_per_lookup_fixed:
                sparse_group_size = np.int64(num_indices_per_lookup)
            else:
                # random between [1,num_indices_per_lookup])
                r = ra.random(1)
                sparse_group_size = np.int64(
                    np.round(max([1.0], r * min(size, num_indices_per_lookup)))
                )
            # sparse indices to be used per embedding
            if rand_data_dist == "gaussian":
                r = ra.normal(rand_data_mu, rand_data_sigma, sparse_group_size)
                clipped = np.clip(r, rand_data_min, rand_data_max)
                # Cast to integer indices BEFORE de-duplicating; unique() on
                # the raw floats leaves duplicate indices after truncation.
                sparse_group = np.unique(clipped.astype(np.int64))
            else:
                r = ra.random(sparse_group_size)
                sparse_group = np.unique(np.round(r * (size - 1)).astype(np.int64))

            # reset sparse_group_size in case some index duplicates were removed
            sparse_group_size = np.int64(sparse_group.size)
            # store lengths and indices
            lS_batch_offsets += [offset]
            lS_batch_indices += sparse_group.tolist()
            # update offset for next iteration
            offset += sparse_group_size
        lS_emb_offsets.append(torch.tensor(lS_batch_offsets))
        lS_emb_indices.append(torch.tensor(lS_batch_indices))

    return (Xt, lS_emb_offsets, lS_emb_indices)
The former is a vector +# of floating point values. The latter is a list of sparse indices into +# embedding tables, which consist of vectors of floating point values. +# The selected vectors are passed to mlp networks denoted by triangles, +# in some cases the vectors are interacted through operators (Ops). +# +# output: +# vector of values +# model: | +# /\ +# /__\ +# | +# _____________________> Op <___________________ +# / | \ +# /\ /\ /\ +# /__\ /__\ ... /__\ +# | | | +# | Op Op +# | ____/__\_____ ____/__\____ +# | |_Emb_|____|__| ... |_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. 
class LRPolicyScheduler(_LRScheduler):
    """Learning-rate policy with linear warmup followed by quadratic decay.

    * steps ``< num_warmup_steps``: the base rate is scaled linearly by
      ``step / num_warmup_steps``;
    * steps in ``[decay_start_step, decay_start_step + num_decay_steps)``:
      the base rate is scaled by the squared remaining fraction of the decay
      window, floored at ``1e-7``;
    * any other step: the most recently computed rate is kept when a decay
      window exists, otherwise the base rates are used unchanged.

    Args:
        optimizer: wrapped optimizer, forwarded to ``_LRScheduler``.
        num_warmup_steps: number of warmup steps (0 disables warmup).
        decay_start_step: first step of the decay window.
        num_decay_steps: length of the decay window (0 disables decay).
    """

    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps
        # Must exist before the base-class constructor runs, because that
        # constructor already calls get_lr(); without warmup the "freeze"
        # branch would otherwise read an attribute that was never assigned.
        self.last_lr = None

        if self.decay_start_step < self.num_warmup_steps:
            # sys is not among the imports at the top of this file
            import sys
            sys.exit("Learning rate warmup must finish before the decay starts")

        super(LRPolicyScheduler, self).__init__(optimizer)

    def get_lr(self):
        """Return the list of learning rates for the current step count."""
        step_count = self._step_count
        if step_count < self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step_count and step_count < self.decay_end_step:
            # decay
            decayed_steps = step_count - self.decay_start_step
            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
            self.last_lr = lr
        else:
            if self.num_decay_steps > 0:
                # freeze at last, either because we're after decay
                # or because we're between warmup and decay
                if self.last_lr is None:
                    # nothing computed yet (no warmup): start from base rates
                    lr = list(self.base_lrs)
                else:
                    lr = self.last_lr
            else:
                # do not adjust
                lr = self.base_lrs
        return lr
+ # approach 1 + LL.weight.data = torch.tensor(W, requires_grad=True) + LL.bias.data = torch.tensor(bt, requires_grad=True) + # approach 2 + # LL.weight.data.copy_(torch.tensor(W)) + # LL.bias.data.copy_(torch.tensor(bt)) + # approach 3 + # LL.weight = Parameter(torch.tensor(W),requires_grad=True) + # LL.bias = Parameter(torch.tensor(bt),requires_grad=True) + layers.append(LL) + + # construct sigmoid or relu operator + if i == sigmoid_layer: + layers.append(nn.Sigmoid()) + else: + layers.append(nn.ReLU()) + + # approach 1: use ModuleList + # return layers + # approach 2: use Sequential container to wrap all layers + return torch.nn.Sequential(*layers) + + def create_emb(self, m, ln): + emb_l = nn.ModuleList() + # save the numpy random state + np_rand_state = np.random.get_state() + for i in range(0, ln.size): + if ext_dist.my_size > 1: + if not i in self.local_emb_indices: continue + # Use per table random seed for Embedding initialization + np.random.seed(self.l_emb_seeds[i]) + n = ln[i] + # construct embedding operator + if self.qr_flag and n > self.qr_threshold: + EE = QREmbeddingBag(n, m, self.qr_collisions, + operation=self.qr_operation, mode="sum", sparse=True) + elif self.md_flag: + base = max(m) + _m = m[i] if n > self.md_threshold else base + EE = PrEmbeddingBag(n, _m, base) + # use np initialization as below for consistency... 
+ W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) + ).astype(np.float32) + EE.embs.weight.data = torch.tensor(W, requires_grad=True) + + else: + #_weight = torch.empty([n, m]).uniform_(-np.sqrt(1 / n), np.sqrt(1 / n)) + #EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight= _weight) + #EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + + # initialize embeddings + # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) + ).astype(np.float32) + # approach 1 + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True)) + #EE.weight.data = torch.tensor(W, requires_grad=True) + # approach 2 + # EE.weight.data.copy_(torch.tensor(W)) + # approach 3 + # EE.weight = Parameter(torch.tensor(W),requires_grad=True) + + if ext_dist.my_size > 1: + if i in self.local_emb_indices: + emb_l.append(EE) + else: + emb_l.append(EE) + + # Restore the numpy random state + np.random.set_state(np_rand_state) + return emb_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + proj_size = 0, + arch_interaction_op=None, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + sync_dense_params=True, + loss_threshold=0.0, + ndevices=-1, + qr_flag=False, + qr_operation="mult", + qr_collisions=0, + qr_threshold=200, + md_flag=False, + md_threshold=200, + ): + super(DLRM_Net, self).__init__() + + if ( + (m_spa is not None) + and (ln_emb is not None) + and (ln_bot is not None) + and (ln_top is not None) + and (arch_interaction_op is not None) + ): + + # save arguments + self.proj_size = proj_size + self.ndevices = ndevices + self.output_d = 0 + self.parallel_model_batch_size = -1 + self.parallel_model_is_not_prepared = True + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sync_dense_params = sync_dense_params + 
self.loss_threshold = loss_threshold + # create variables for QR embedding if applicable + self.qr_flag = qr_flag + if self.qr_flag: + self.qr_collisions = qr_collisions + self.qr_operation = qr_operation + self.qr_threshold = qr_threshold + # create variables for MD embedding if applicable + self.md_flag = md_flag + if self.md_flag: + self.md_threshold = md_threshold + + # generate np seeds for Emb table initialization + self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb)) + + #If running distributed, get local slice of embedding tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + self.n_global_emb = n_emb + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb) + self.local_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_emb_indices = list(range(n_emb))[self.local_emb_slice] + #ln_emb = ln_emb[self.local_emb_slice] + + # create operators + if ndevices <= 1: + self.emb_l = self.create_emb(m_spa, ln_emb) + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + if (proj_size > 0): + self.proj_l = project.create_proj(len(ln_emb)+1, proj_size) + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_proj(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_emb(self, lS_o, lS_i, emb_l): + # WARNING: notice that we are processing the batch at once. We implicitly + # assume that the data is laid out such that: + # 1. each embedding is indexed with a group of sparse indices, + # corresponding to a single lookup + # 2. for each embedding the lookups are further organized into a batch + # 3. 
for a list of embedding tables there is a list of batched lookups + + ly = [] + for k, sparse_index_group_batch in enumerate(lS_i): + sparse_offset_group_batch = lS_o[k] + + # embedding lookup + # We are using EmbeddingBag, which implicitly uses sum operator. + # The embeddings are represented as tall matrices, with sum + # happening vertically across 0 axis, resulting in a row vector + E = emb_l[k] + V = E(sparse_index_group_batch, sparse_offset_group_batch) + + ly.append(V) + + # print(ly) + return ly + + def interact_features(self, x, ly): + if self.arch_interaction_op == "dot": + # concatenate dense and sparse features + (batch_size, d) = x.shape + T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) + # perform a dot product + if (self.proj_size > 0): + R = project.project(T, x, self.proj_l) + #TT = torch.transpose(T, 1, 2) + #TS = torch.reshape(TT, (-1, TT.size(2))) + #TC = self.apply_mlp(TS, self.proj_l) + #TR = torch.reshape(TC, (-1, d ,self.proj_size)) + #Z = torch.bmm(T, TR) + #Zflat = Z.view((batch_size, -1)) + #R = torch.cat([x] + [Zflat], dim=1) + else: + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = 0 if self.arch_interaction_itself else -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 1 if self.arch_interaction_itself else 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) + elif self.arch_interaction_op == "cat": + # concatenation features (into a row vector) + R = torch.cat([x] + ly, dim=1) + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + self.arch_interaction_op + + " is not supported" + ) 
def forward(self, dense_x, lS_o, lS_i):
    """Dispatch to the forward variant that matches the execution setup.

    More than one rank (``ext_dist.my_size > 1``) selects the distributed
    path; otherwise more than one device selects the parallel path, and
    everything else runs sequentially.
    """
    if ext_dist.my_size > 1:
        return self.distributed_forward(dense_x, lS_o, lS_i)
    if self.ndevices > 1:
        return self.parallel_forward(dense_x, lS_o, lS_i)
    return self.sequential_forward(dense_x, lS_o, lS_i)


def sequential_forward(self, dense_x, lS_o, lS_i):
    """Run the whole model in a single call chain.

    The dense input goes through the bottom MLP, the sparse inputs through
    the embeddings, both are combined by ``interact_features`` and the
    result is fed to the top MLP. The output is clamped to
    ``[loss_threshold, 1 - loss_threshold]`` only when ``loss_threshold``
    lies strictly between 0 and 1.
    """
    # dense branch (bottom mlp)
    dense_out = self.apply_mlp(dense_x, self.bot_l)

    # sparse branch (embedding lookups)
    emb_out = self.apply_emb(lS_o, lS_i, self.emb_l)

    # dense/sparse interaction followed by the top mlp
    interactions = self.interact_features(dense_out, emb_out)
    prob = self.apply_mlp(interactions, self.top_l)

    # clamping is optional and disabled for thresholds outside (0, 1)
    threshold = self.loss_threshold
    if not (0.0 < threshold < 1.0):
        return prob
    return torch.clamp(prob, min=threshold, max=(1.0 - threshold))
def distributed_forward(self, dense_x, lS_o, lS_i):
    """Forward pass for runs with more than one rank.

    Each rank keeps its slice of the dense batch and the sparse inputs of
    its local embedding tables, exchanges the embedding results through
    ``ext_dist.alltoall``, runs interaction and top MLP on its share and
    finally collects the outputs of all ranks with ``ext_dist.all_gather``.
    Every stage is bracketed by the corresponding ``tm`` timer.

    Raises:
        SystemExit: if the batch cannot be split evenly across the ranks
            or the number of sparse inputs / lookup results does not match
            the number of local embedding tables.
    """
    n_ranks = ext_dist.my_size
    batch_size = dense_x.size(0)
    # WARNING: # of ranks must be <= batch size in distributed_forward call
    if batch_size < n_ranks:
        sys.exit(
            "ERROR: batch_size (%d) must be larger than number of ranks (%d)"
            % (batch_size, n_ranks)
        )
    if batch_size % n_ranks != 0:
        sys.exit(
            "ERROR: batch_size %d can not split across %d ranks evenly"
            % (batch_size, n_ranks)
        )

    # restrict the inputs to what this rank is responsible for
    dense_x = dense_x[ext_dist.get_my_slice(batch_size)]
    lS_o = lS_o[self.local_emb_slice]
    lS_i = lS_i[self.local_emb_slice]

    n_tables = len(self.emb_l)
    if not (n_tables == len(lS_o) and n_tables == len(lS_i)):
        sys.exit("ERROR: corrupted model input detected in distributed_forward call")

    # embedding lookups for the local tables
    tm.tmEmb.start()
    emb_out = self.apply_emb(lS_o, lS_i, self.emb_l)
    tm.tmEmb.stop()

    # At this point every rank holds the lookups of its own tables for the
    # entire batch. The exchange below turns that into all lookups for this
    # rank's part of the batch, matching the layout of the bottom mlp output.
    if len(self.emb_l) != len(emb_out):
        sys.exit("ERROR: corrupted intermediate result in distributed_forward call")

    tm.tmA2A.start()
    pending = ext_dist.alltoall(emb_out, self.n_emb_per_rank)
    tm.tmA2A.stop()

    # bottom mlp runs between issuing the exchange and waiting for it
    tm.tmBot.start()
    dense_out = self.apply_mlp(dense_x, self.bot_l)
    tm.tmBot.stop()

    tm.tmA2A1.start()
    exchanged = pending.wait()
    tm.tmA2A1.stop()
    exchanged = list(exchanged)

    # interactions
    tm.tmInt.start()
    interactions = self.interact_features(dense_out, exchanged)
    tm.tmInt.stop()

    # top mlp
    tm.tmTop.start()
    prob = self.apply_mlp(interactions, self.top_l)
    tm.tmTop.stop()

    # clamp output if needed
    threshold = self.loss_threshold
    if 0.0 < threshold < 1.0:
        out = torch.clamp(prob, min=threshold, max=(1.0 - threshold))
    else:
        out = prob

    ### gather the distributed results on each rank ###
    # an explicit sync is issued before all_gather when the tensor lives
    # in GPU memory
    tm.tmAllGa.start()
    if out.is_cuda:
        torch.cuda.synchronize()
    (_, batch_split_lengths) = ext_dist.get_split_lengths(batch_size)
    out = ext_dist.all_gather(out, batch_split_lengths)
    tm.tmAllGa.stop()

    return out
def parallel_forward(self, dense_x, lS_o, lS_i):
    """Forward pass spread over several CUDA devices of one process.

    The MLPs are replicated on every device and fed a slice of the batch
    (data parallelism); embedding table ``k`` is placed on device
    ``k % ndevices`` (model parallelism). Replicas and table placement are
    cached on ``self`` and rebuilt when the batch size changes.

    Returns:
        The top-MLP outputs gathered on ``self.output_d`` along dim 0,
        clamped to ``[loss_threshold, 1 - loss_threshold]`` when
        ``loss_threshold`` lies strictly between 0 and 1.

    Raises:
        SystemExit: if the number of sparse inputs or lookup results does
            not match the number of embedding tables.
    """
    ### prepare model (overwrite) ###
    # WARNING: # of devices must be <= batch size (and <= number of tables)
    # in parallel_forward call, hence the min() below
    batch_size = dense_x.size()[0]
    ndevices = min(self.ndevices, batch_size, len(self.emb_l))
    device_ids = range(ndevices)
    # WARNING: must redistribute the model if the mini-batch size changes (this
    # is common for the last mini-batch, when the dataset size is not a
    # multiple of the batch size)
    if self.parallel_model_batch_size != batch_size:
        self.parallel_model_is_not_prepared = True

    if self.parallel_model_is_not_prepared or self.sync_dense_params:
        # replicate mlp (data parallelism)
        self.bot_l_replicas = replicate(self.bot_l, device_ids)
        self.top_l_replicas = replicate(self.top_l, device_ids)
        self.parallel_model_batch_size = batch_size

    if self.parallel_model_is_not_prepared:
        # distribute embeddings (model parallelism); Module.to moves the
        # module in place and returns it, so one call per table suffices
        self.emb_l = nn.ModuleList(
            [
                emb.to(torch.device("cuda:" + str(k % ndevices)))
                for k, emb in enumerate(self.emb_l)
            ]
        )
        self.parallel_model_is_not_prepared = False

    ### prepare input (overwrite) ###
    # scatter dense features (data parallelism)
    dense_x = scatter(dense_x, device_ids, dim=0)
    # distribute sparse features (model parallelism)
    if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)):
        sys.exit("ERROR: corrupted model input detected in parallel_forward call")

    # move the inputs of table k to the device that holds table k
    t_list = []
    i_list = []
    for k, _ in enumerate(self.emb_l):
        d = torch.device("cuda:" + str(k % ndevices))
        t_list.append(lS_o[k].to(d))
        i_list.append(lS_i[k].to(d))
    lS_o = t_list
    lS_i = i_list

    ### compute results in parallel ###
    # bottom mlp
    # WARNING: self.bot_l_replicas holds the bottom mlp replicated across
    # devices, while dense_x is a tuple of dense inputs scattered across
    # devices on the first (batch) dimension. The output is a list of tensors
    # distributed like dense_x.
    x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids)

    # embeddings
    ly = self.apply_emb(lS_o, lS_i, self.emb_l)

    # butterfly shuffle (implemented inefficiently for now)
    # WARNING: at this point each device holds the lookup results of its
    # tables for the entire batch. We need all lookups for part of the batch
    # on each device, matching the distribution of the bottom mlp output, so
    # that both can be used for the interactions on each device.
    if len(self.emb_l) != len(ly):
        sys.exit("ERROR: corrupted intermediate result in parallel_forward call")

    per_table = [scatter(y, device_ids, dim=0) for y in ly]
    # adjust the list to be ordered per device
    ly = [list(per_device) for per_device in zip(*per_table)]

    # interactions
    z = [self.interact_features(x[k], ly[k]) for k in range(ndevices)]

    # top mlp
    # WARNING: self.top_l_replicas holds the top mlp replicated across
    # devices, while z is a list of interaction results that by construction
    # are scattered across devices on the first (batch) dim. The output is a
    # list of tensors distributed like z.
    p = parallel_apply(self.top_l_replicas, z, None, device_ids)

    ### gather the distributed results ###
    p0 = gather(p, self.output_d, dim=0)

    # clamp output if needed
    if 0.0 < self.loss_threshold < 1.0:
        z0 = torch.clamp(
            p0, min=self.loss_threshold, max=(1.0 - self.loss_threshold)
        )
    else:
        z0 = p0

    return z0
def _check_dash_separated(value, cast, kind):
    """Return *value* if every ``-``-separated token is accepted by *cast*.

    Raises:
        argparse.ArgumentTypeError: if any token makes *cast* raise
            ``ValueError``; the original error is chained as the cause.
    """
    for token in value.split("-"):
        try:
            cast(token)
        except ValueError as err:
            raise argparse.ArgumentTypeError(
                "%s is not a valid dash separated list of %s" % (value, kind)
            ) from err

    return value


def dash_separated_ints(value):
    """argparse ``type`` callable for values such as ``"4-3-2"``.

    The string is only validated; it is returned unchanged so the caller
    can parse it later.

    Raises:
        argparse.ArgumentTypeError: if a token is not accepted by ``int``.
    """
    return _check_dash_separated(value, int, "ints")


def dash_separated_floats(value):
    """argparse ``type`` callable for values such as ``"1.0-1.0"``.

    The string is only validated; it is returned unchanged so the caller
    can parse it later.

    Raises:
        argparse.ArgumentTypeError: if a token is not accepted by ``float``.
    """
    return _check_dash_separated(value, float, "floats")
action="store_true", default=False) + parser.add_argument("--qr-flag", action="store_true", default=False) + parser.add_argument("--qr-threshold", type=int, default=200) + parser.add_argument("--qr-operation", type=str, default="mult") + parser.add_argument("--qr-collisions", type=int, default=4) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce + parser.add_argument( + "--loss-weights", type=dash_separated_floats, default="1.0-1.0") # for wbce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument( + "--data-generation", type=str, default="random" + ) # synthetic or dataset + parser.add_argument("--synthetic-data-folder", type=str, + default="./synthetic_data/syn_data_bs65536") + # add Gaussian distribution + parser.add_argument("--rand-data-dist", type=str, default="uniform") # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + 
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # onnx + parser.add_argument("--save-onnx", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + # distributed run + parser.add_argument("--dist-backend", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + # store/load model + parser.add_argument("--out-dir", type=str, default=".") + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", 
action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) + parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) + + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + + args = parser.parse_args() + + print(socket.gethostname()) + + ext_dist.init_distributed(backend=args.dist_backend) + + # print("success size= ", ext_dist.my_size, ext_dist.my_rank) + + ext_dist.barrier() + + if args.mlperf_logging: + print('command line args: ', json.dumps(vars(args))) + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if (args.test_mini_batch_size < 0): + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if (args.test_num_workers < 0): + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + if args.mini_batch_size % ext_dist.my_size !=0 or args.test_mini_batch_size % ext_dist.my_size != 0: + print("Either test minibatch (%d) or train minibatch (%d) does not split across %d ranks" % (args.test_mini_batch_size, args.mini_batch_size, ext_dist.my_size)) + sys.exit(1) + + use_gpu = args.use_gpu and torch.cuda.is_available() + if use_gpu: + torch.cuda.manual_seed_all(args.numpy_rand_seed) + torch.backends.cudnn.deterministic = True + if ext_dist.my_size > 1: + ngpus = 
torch.cuda.device_count() # 1 + if ext_dist.my_local_size > torch.cuda.device_count(): + print("Not sufficient GPUs available... local_size = %d, ngpus = %d" % (ext_dist.my_local_size, ngpus)) + sys.exit(1) + ngpus = 1 + device = torch.device("cuda", ext_dist.my_local_rank) + else: + device = torch.device("cuda", 0) + ngpus = torch.cuda.device_count() # 1 + ngpus=1 + print("Using {} GPU(s)...".format(ngpus)) + else: + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + if (args.data_generation == "dataset"): + + train_data, train_ld, test_data, test_ld = \ + dp.make_criteo_data_and_loaders(args) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array(list(map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb + ))) + m_den = train_data.m_den + ln_bot[0] = m_den + + elif args.data_generation == "synthetic": + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld = dd.data_loader(args, ln_emb, m_den) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + table_feature_map = None # {idx : idx for idx in range(len(ln_emb))} + + else: + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + num_fea = ln_emb.size + 1 # num sparse + num dense features + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + 
# num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if (args.arch_project_size > 0): + num_int = num_fea * args.arch_project_size + m_den_out + else: + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + args.arch_interaction_op + + " is not supported" + ) + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit( + "ERROR: arch-dense-feature-size " + + str(m_den) + + " does not match first dim of bottom mlp " + + str(ln_bot[0]) + ) + if args.qr_flag: + if args.qr_operation == "concat" and 2 * m_spa != m_den_out: + sys.exit( + "ERROR: 2 arch-sparse-feature-size " + + str(2 * m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + + " (note that the last dim of bottom mlp must be 2x the embedding dim)" + ) + if args.qr_operation != "concat" and m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + else: + if m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + if num_int != ln_top[0]: + sys.exit( + "ERROR: # of feature interactions " + + str(num_int) + + " does not match first dimension of top mlp " + + str(ln_top[0]) + ) + + # assign mixed dimensions if applicable + if args.md_flag: + m_spa = md_solver( + torch.tensor(ln_emb), + args.md_temperature, # alpha + d0=m_spa, + round_dim=args.md_round_dims + ).tolist() + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print( + "mlp top arch " + + str(ln_top.size - 
1) + + " layers, with input to output dimensions:" + ) + print(ln_top) + print("# of interactions") + print(num_int) + print( + "mlp bot arch " + + str(ln_bot.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print( + "# of embeddings (= # of sparse features) " + + str(ln_emb.size) + + ", with dimensions " + + str(m_spa) + + "x:" + ) + print(ln_emb) + + print("data (inputs and targets):") + for j, (X, lS_o, lS_i, T) in enumerate(train_ld): + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + print("mini-batch: %d" % j) + print(X.detach().cpu().numpy()) + # transform offsets to lengths when printing + print( + [ + np.diff( + S_o.detach().cpu().tolist() + list(lS_i[i].shape) + ).tolist() + for i, S_o in enumerate(lS_o) + ] + ) + print([S_i.detach().cpu().tolist() for S_i in lS_i]) + print(T.detach().cpu().numpy()) + + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. 
+ # np.random.seed(args.numpy_rand_seed) + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + args.arch_project_size, + arch_interaction_op=args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + sync_dense_params=args.sync_dense_params, + loss_threshold=args.loss_threshold, + ndevices=ndevices, + qr_flag=args.qr_flag, + qr_operation=args.qr_operation, + qr_collisions=args.qr_collisions, + qr_threshold=args.qr_threshold, + md_flag=args.md_flag, + md_threshold=args.md_threshold, + ) + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + for param in dlrm.parameters(): + print(param.detach().cpu().numpy()) + # print(dlrm) + + if use_gpu: + # Custom Model-Data Parallel + # the mlps are replicated and use data parallelism, while + # the embeddings are distributed and use model parallelism + dlrm = dlrm.to(device) # .cuda() + if dlrm.ndevices > 1: + dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb) + + if ext_dist.my_size > 1: + if use_gpu: + device_ids = [ext_dist.my_local_rank] + dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids) + dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids) + else: + dlrm.bot_l = DDP(dlrm.bot_l) + dlrm.top_l = DDP(dlrm.top_l) + + # specify the loss function + if args.loss_function == "mse": + loss_fn = torch.nn.MSELoss(reduction="mean") + elif args.loss_function == "bce": + loss_fn = torch.nn.BCELoss(reduction="mean") + elif args.loss_function == "wbce": + loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-")) + loss_fn = torch.nn.BCELoss(reduction="none") + else: + sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported") + + if not args.inference_only: + # specify the optimizer algorithm + + if ext_dist.my_size == 1: + optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) + #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step, 
def time_wrap(use_gpu):
    """Return the current wall-clock time, syncing CUDA first when on GPU."""
    if use_gpu:
        torch.cuda.synchronize()
    now = time.time()
    return now


def dlrm_wrap(X, lS_o, lS_i, use_gpu, device):
    """Call ``dlrm`` on the inputs, moving them to *device* first on GPU runs.

    ``lS_i`` and ``lS_o`` may each be a list of tensors or a single
    (stacked) tensor; both forms are handled. The host-to-device copies
    are bracketed by the ``tm.tmH2D`` timer.
    """
    if not use_gpu:
        return dlrm(X, lS_o, lS_i)

    def _to_device(batch):
        # a list is moved element by element, anything else in one call
        if isinstance(batch, list):
            return [item.to(device) for item in batch]
        return batch.to(device)

    tm.tmH2D.start()
    lS_i = _to_device(lS_i)
    lS_o = _to_device(lS_o)
    X = X.to(device)
    tm.tmH2D.stop()

    return dlrm(X, lS_o, lS_i)


def loss_fn_wrap(Z, T, use_gpu, device):
    """Evaluate the configured loss for predictions *Z* and targets *T*.

    For ``"mse"`` and ``"bce"`` the result of ``loss_fn`` is returned
    directly. For ``"wbce"`` the element-wise loss is scaled by the
    ``loss_ws`` entry selected by each target value and then averaged.
    Any other ``args.loss_function`` falls through and yields ``None``.
    """
    kind = args.loss_function
    if kind in ("mse", "bce"):
        # targets are moved only on GPU runs
        target = T.to(device) if use_gpu else T
        return loss_fn(Z, target)
    if kind == "wbce":
        # per-sample weight looked up by the target value used as an index
        weights = loss_ws[T.data.view(-1).long()].view_as(T)
        if use_gpu:
            weights = weights.to(device)
        per_sample = loss_fn(Z, T.to(device))
        scaled = weights * per_sample
        return scaled.mean()
multiple GPUs to be done in parallel_forward + ld_model = torch.load(args.load_model) + else: + # NOTE: when targeting inference on single GPU, + # note that the call to .to(device) has already happened + ld_model = torch.load( + args.load_model, + map_location=torch.device('cuda') + # map_location=lambda storage, loc: storage.cuda(0) + ) + else: + # when targeting inference on CPU + ld_model = torch.load(args.load_model, map_location=torch.device('cpu')) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_gA = ld_model["train_acc"] + ld_gL = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + ld_total_accu = ld_model["total_accu"] + ld_gA_test = ld_model["test_acc"] + ld_gL_test = ld_model["test_loss"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_gA_test = ld_gA_test + total_loss = ld_total_loss + total_accu = ld_total_accu + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) + print( + "Training state: loss = {:.6f}, accuracy = {:3.3f} %".format( + ld_gL, ld_gA * 100 + ) + ) + print( + "Testing state: loss = {:.6f}, accuracy = {:3.3f} %".format( + ld_gL_test, ld_gA_test * 100 + ) + ) + + ext_dist.barrier() + startTime = time.time() + startTime0 = startTime + skipped = 0 + + #print("Processing data") + #t1 = time.time() + syndatasetlen = min(65536 // args.mini_batch_size, nbatches) + #myobj = list(enumerate(train_ld)) + #t2 = time.time() + #print("Processing data takes {} seconds with len={} {} {} {}".format(t2-t1, len(myobj), nbatches, args.mini_batch_size, syndatasetlen)) + print("time/loss/accuracy (if 
enabled):") + with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof: + # with torch.autograd.profiler.emit_nvtx(): + + while k < args.nepochs: + if k < skip_upto_epoch: + continue + + if use_gpu: + tm.tmSync1.start() + torch.cuda.synchronize() + tm.tmSync1.stop() + accum_time_begin = time.time() + + if args.mlperf_logging: + previous_iteration_time = None + + # for j, (X, lS_o, lS_i, T) in enumerate(train_ld): + for j in range(nbatches): + tm.tmGetData.start() + # X, lS_o, lS_i, T = myobj[j%syndatasetlen][1] + X, lS_o, lS_i, T = train_data.__getitem__(j%syndatasetlen) + tm.tmGetData.stop() + + if j == 0 and args.save_onnx: + (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i) + + if j < skip_upto_batch: + continue + + if (skipped == 2): + ext_dist.barrier() + startTime = time.time() + ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank) + # torch.cuda.profiler.cudart().cudaProfilerStart() + torch.cuda.profiler.start() + tm.tmClear() + skipped = skipped + 1 + + if args.mlperf_logging: + current_time = time_wrap(use_gpu) + if previous_iteration_time: + iteration_time = current_time - previous_iteration_time + else: + iteration_time = 0 + previous_iteration_time = current_time + else: + if use_gpu: + tm.tmSync2.start() + torch.cuda.synchronize() + tm.tmSync2.stop() + t1 = time.time() + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + ''' + # debug prints + print("input and targets") + print(X.detach().cpu().numpy()) + print([np.diff(S_o.detach().cpu().tolist() + + list(lS_i[i].shape)).tolist() for i, S_o in enumerate(lS_o)]) + print([S_i.detach().cpu().numpy().tolist() for S_i in lS_i]) + print(T.detach().cpu().numpy()) + ''' + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d 
with size %d" % (j, X.size(0))) + continue + + + # forward pass + tm.tmFwd.start() + Z = dlrm_wrap(X, lS_o, lS_i, use_gpu, device) + tm.tmFwd.stop() + + # loss + tm.tmLoss.start() + E = loss_fn_wrap(Z, T, use_gpu, device) + ''' + # debug prints + print("output and loss") + print(Z.detach().cpu().numpy()) + print(E.detach().cpu().numpy()) + ''' + # compute loss and accuracy + L = E.detach().cpu().numpy() # numpy array + S = Z.detach().cpu().numpy() # numpy array + T = T.detach().cpu().numpy() # numpy array + mbs = T.shape[0] # = args.mini_batch_size except maybe for last + A = np.sum((np.round(S, 0) == T).astype(np.uint8)) + tm.tmLoss.stop() + + if not args.inference_only: + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + tm.tmZero.start() + optimizer.zero_grad() + tm.tmZero.stop() + + # backward pass + tm.tmBwd.start() + E.backward() + tm.tmBwd.stop() + + # debug prints (check gradient norm) + # for l in mlp.layers: + # if hasattr(l, 'weight'): + # print(l.weight.grad.norm().item()) + + # optimizer + tm.tmOpt.start() + optimizer.step() + tm.tmOpt.stop() + + ### lr_scheduler.step() + + if args.mlperf_logging: + total_time += iteration_time + else: + if use_gpu: + tm.tmSync3.start() + torch.cuda.synchronize() + tm.tmSync3.stop() + t2 = time.time() + total_time += t2 - t1 + + total_accu += A + total_loss += L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches) + should_test = ( + (args.test_freq > 0) + and (args.data_generation == "dataset") + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + gA = total_accu / total_samp + total_accu = 0 + + gL = total_loss / total_samp + total_loss = 0 + + str_run_type = "inference" if args.inference_only else "training" + print( + "Finished {} 
it {}/{} of epoch {}, {:.2f} ms/it, ".format( + str_run_type, j + 1, nbatches, k, gT + ) + + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL, + gA * 100, total_iter, ext_dist.my_rank) + ) + # Uncomment the line below to print out the total time with overhead + if ext_dist.my_rank < 2: + tt1 = time.time() + ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \ + .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1)) + total_iter = 0 + total_samp = 0 + + # testing + if should_test and not args.inference_only: + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + + test_accu = 0 + test_loss = 0 + test_samp = 0 + + accum_test_time_begin = time_wrap(use_gpu) + if args.mlperf_logging: + scores = [] + targets = [] + + for i, (X_test, lS_o_test, lS_i_test, T_test) in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0))) + continue + + t1_test = time_wrap(use_gpu) + + # forward pass + Z_test = dlrm_wrap( + X_test, lS_o_test, lS_i_test, use_gpu, device + ) + if args.mlperf_logging: + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + else: + # loss + E_test = loss_fn_wrap(Z_test, T_test, use_gpu, device) + + # compute loss and accuracy + L_test = E_test.detach().cpu().numpy() # numpy array + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + test_accu += A_test + test_loss += L_test * 
mbs_test + test_samp += mbs_test + + t2_test = time_wrap(use_gpu) + + if args.mlperf_logging: + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + 'loss' : sklearn.metrics.log_loss, + 'recall' : lambda y_true, y_score: + sklearn.metrics.recall_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'precision' : lambda y_true, y_score: + sklearn.metrics.precision_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'f1' : lambda y_true, y_score: + sklearn.metrics.f1_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'ap' : sklearn.metrics.average_precision_score, + 'roc_auc' : sklearn.metrics.roc_auc_score, + 'accuracy' : lambda y_true, y_score: + sklearn.metrics.accuracy_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + # 'pre_curve' : sklearn.metrics.precision_recall_curve, + # 'roc_curve' : sklearn.metrics.roc_curve, + } + + # print("Compute time for validation metric : ", end="") + # first_it = True + validation_results = {} + for metric_name, metric_function in metrics.items(): + # if first_it: + # first_it = False + # else: + # print(", ", end="") + # metric_compute_start = time_wrap(False) + validation_results[metric_name] = metric_function( + targets, + scores + ) + # metric_compute_end = time_wrap(False) + # met_time = metric_compute_end - metric_compute_start + # print("{} {:.4f}".format(metric_name, 1000 * (met_time)), + # end="") + # print(" ms") + gA_test = validation_results['accuracy'] + gL_test = validation_results['loss'] + else: + gA_test = test_accu / test_samp + gL_test = test_loss / test_samp + + is_best = gA_test > best_gA_test + if is_best: + best_gA_test = gA_test + if not (args.save_model == ""): + print("Saving model to {}".format(args.save_model)) + torch.save( + { + "epoch": k, + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + "iter": j + 1, + "state_dict": dlrm.state_dict(), + "train_acc": gA, + "train_loss": gL, + "test_acc": 
gA_test, + "test_loss": gL_test, + "total_loss": total_loss, + "total_accu": total_accu, + "opt_state_dict": optimizer.state_dict(), + }, + args.save_model, + ) + + if args.mlperf_logging: + is_best = validation_results['roc_auc'] > best_auc_test + if is_best: + best_auc_test = validation_results['roc_auc'] + + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format( + validation_results['loss'], + validation_results['recall'], + validation_results['precision'] + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results['f1'], + validation_results['ap'], + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results['roc_auc'], + best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results['accuracy'] * 100, + best_gA_test * 100 + ) + ) + else: + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0) + + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format( + gL_test, gA_test * 100, best_gA_test * 100 + ) + ) + # Uncomment the line below to print out the total time with overhead + # print("Total test time for this group: {}" \ + # .format(time_wrap(use_gpu) - accum_test_time_begin)) + + if (args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_gA_test > args.mlperf_acc_threshold)): + print("MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training") + break + + if (args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold)): + print("MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training") + break + + #if (ext_dist.my_rank == 0 and should_print): + # print("ITER : ", j, " from nvidia-smi") + # os.system("nvidia-smi") + + k += 1 # nepochs + + #if (ext_dist.my_rank == 0): + # # print(torch.cuda.memory_allocated(0)) + # print(torch.cuda.memory_summary(0)) + # # print("from nvidia-smi") + 
# os.system("nvidia-smi") + + tt2 = time.time() + endTime = tt2 - startTime + ext_dist.barrier() + tt3 = time.time() + finalTime = tt3 - startTime + # torch.cuda.profiler.cudart().cudaProfilerStop() + torch.cuda.profiler.stop() + if (skipped > 2): + skipped -= 2 + ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \ + iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0, + finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True) + if (ext_dist.my_rank < 2): + tm.tmSummary(ext_dist.my_rank) + + file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank) + # profiling + if args.enable_profiling: + os.makedirs(args.out_dir, exist_ok=True) + with open("TT"+str(uuid.uuid4().hex), "w") as prof_f: + prof_f.write(prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total", + )) + +# with open("%s.prof" % file_prefix, "w") as prof_f: +# prof_f.write(prof.key_averages().table(sort_by="cpu_time_total")) +# prof.export_chrome_trace("./%s.json" % file_prefix) +# print(prof.key_averages().table(sort_by="cpu_time_total")) + + # plot compute graph + if args.plot_compute_graph: + sys.exit( + "ERROR: Please install pytorchviz package in order to use the" + + " visualization. Then, uncomment its import above as well as" + + " three lines below and run the code again." 
+ ) + # os.makedirs(args.out_dir, exist_ok=True) + # V = Z.mean() if args.inference_only else E + # dot = make_dot(V, params=dict(dlrm.named_parameters())) + # dot.render('%s_graph' % file_prefix) # write .pdf file + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + for param in dlrm.parameters(): + print(param.detach().cpu().numpy()) + + # export the model in onnx + if args.save_onnx: + + dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" + torch.onnx.export( + dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True + ) + + # recover the model back + dlrm_pytorch_onnx = onnx.load("%s.onnx" % file_prefix) + # check the onnx model + onnx.checker.check_model(dlrm_pytorch_onnx) diff --git a/dlrm_s_caffe2.py b/dlrm_s_caffe2.py index 47b27d61..eb3e3638 100644 --- a/dlrm_s_caffe2.py +++ b/dlrm_s_caffe2.py @@ -79,6 +79,7 @@ # caffe2 from caffe2.proto import caffe2_pb2 from caffe2.python import brew, core, dyndep, model_helper, net_drawer, workspace +# from caffe2.python.predictor import mobile_exporter """ # auxiliary routine used to split input on the mini-bacth dimension @@ -607,6 +608,9 @@ def create_model(self, X, S_lengths, S_indices, T): tril_indices = np.array([j + i * num_fea for i in range(num_fea) for j in range(i + offset)]) self.FeedBlobWrapper(self.tint + "_tril_indices", tril_indices) + if self.save_onnx: + tish = tril_indices.shape + self.onnx_tsd[self.tint + "_tril_indices"] = (onnx.TensorProto.INT32, tish) # create compute graph if T is not None: diff --git a/dlrm_s_pytorch.py b/dlrm_s_pytorch.py index 3aeeec0c..5129a39b 100644 --- a/dlrm_s_pytorch.py +++ b/dlrm_s_pytorch.py @@ -65,6 +65,7 @@ # numpy import numpy as np +import socket # onnx # The onnx import causes deprecation warnings every time workers @@ -72,10 +73,11 @@ import warnings with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) -import 
onnx +## import onnx # pytorch import torch +from torch import onnx import torch.nn as nn from torch.nn.parallel.parallel_apply import parallel_apply from torch.nn.parallel.replicate import replicate @@ -91,12 +93,57 @@ import sklearn.metrics +import uuid +import project +from torch.nn.parallel import DistributedDataParallel as DDP + +import dlrm_data as dd + +# import synthetic_data_loader as fb_syn_data + # from torchviz import make_dot # import torch.nn.functional as Functional # from torch.nn.parameter import Parameter +from torch.optim.lr_scheduler import _LRScheduler + exc = getattr(builtins, "IOError", "FileNotFoundError") +class LRPolicyScheduler(_LRScheduler): + def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps): + self.num_warmup_steps = num_warmup_steps + self.decay_start_step = decay_start_step + self.decay_end_step = decay_start_step + num_decay_steps + self.num_decay_steps = num_decay_steps + + if self.decay_start_step < self.num_warmup_steps: + sys.exit("Learning rate warmup must finish before the decay starts") + + super(LRPolicyScheduler, self).__init__(optimizer) + + def get_lr(self): + step_count = self._step_count + if step_count < self.num_warmup_steps: + # warmup + scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps + lr = [base_lr * scale for base_lr in self.base_lrs] + self.last_lr = lr + elif self.decay_start_step <= step_count and step_count < self.decay_end_step: + # decay + decayed_steps = step_count - self.decay_start_step + scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2 + min_lr = 0.0000001 + lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs] + self.last_lr = lr + else: + if self.num_decay_steps > 0: + # freeze at last, either because we're after decay + # or because we're between warmup and decay + lr = self.last_lr + else: + # do not adjust + lr = self.base_lrs + return lr ### define dlrm in PyTorch ### class DLRM_Net(nn.Module): @@ 
-154,9 +201,9 @@ def create_emb(self, m, ln): if self.qr_flag and n > self.qr_threshold: EE = QREmbeddingBag(n, m, self.qr_collisions, operation=self.qr_operation, mode="sum", sparse=True) - elif self.md_flag and n > self.md_threshold: - _m = m[i] + elif self.md_flag: base = max(m) + _m = m[i] if n > self.md_threshold else base EE = PrEmbeddingBag(n, _m, base) # use np initialization as below for consistency... W = np.random.uniform( @@ -198,6 +245,7 @@ def __init__( ln_emb=None, ln_bot=None, ln_top=None, + proj_size = 0, arch_interaction_op=None, arch_interaction_itself=False, sigmoid_bot=-1, @@ -223,6 +271,7 @@ def __init__( ): # save arguments + self.proj_size = proj_size self.ndevices = ndevices self.output_d = 0 self.parallel_model_batch_size = -1 @@ -259,6 +308,8 @@ def __init__( self.emb_l = self.create_emb(m_spa, ln_emb) self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) self.top_l = self.create_mlp(ln_top, sigmoid_top) + if (proj_size > 0): + self.proj_l = project.create_proj(len(ln_emb)+1, proj_size) def apply_mlp(self, x, layers): # approach 1: use ModuleList @@ -268,6 +319,14 @@ def apply_mlp(self, x, layers): # approach 2: use Sequential container to wrap all layers return layers(x) + def apply_proj(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + def apply_emb(self, lS_o, lS_i, emb_l): # WARNING: notice that we are processing the batch at once. 
We implicitly # assume that the data is laid out such that: @@ -298,22 +357,32 @@ def interact_features(self, x, ly): (batch_size, d) = x.shape T = torch.cat([x] + ly, dim=1).view((batch_size, -1, d)) # perform a dot product - Z = torch.bmm(T, torch.transpose(T, 1, 2)) - # append dense feature with the interactions (into a row vector) - # approach 1: all - # Zflat = Z.view((batch_size, -1)) - # approach 2: unique - _, ni, nj = Z.shape - # approach 1: tril_indices - # offset = 0 if self.arch_interaction_itself else -1 - # li, lj = torch.tril_indices(ni, nj, offset=offset) - # approach 2: custom - offset = 1 if self.arch_interaction_itself else 0 - li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) - lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) - Zflat = Z[:, li, lj] - # concatenate dense features and interactions - R = torch.cat([x] + [Zflat], dim=1) + if (self.proj_size > 0): + R = project.project(T, x, self.proj_l) + #TT = torch.transpose(T, 1, 2) + #TS = torch.reshape(TT, (-1, TT.size(2))) + #TC = self.apply_mlp(TS, self.proj_l) + #TR = torch.reshape(TC, (-1, d ,self.proj_size)) + #Z = torch.bmm(T, TR) + #Zflat = Z.view((batch_size, -1)) + #R = torch.cat([x] + [Zflat], dim=1) + else: + Z = torch.bmm(T, torch.transpose(T, 1, 2)) + # append dense feature with the interactions (into a row vector) + # approach 1: all + # Zflat = Z.view((batch_size, -1)) + # approach 2: unique + _, ni, nj = Z.shape + # approach 1: tril_indices + # offset = 0 if self.arch_interaction_itself else -1 + # li, lj = torch.tril_indices(ni, nj, offset=offset) + # approach 2: custom + offset = 1 if self.arch_interaction_itself else 0 + li = torch.tensor([i for i in range(ni) for j in range(i + offset)]) + lj = torch.tensor([j for i in range(nj) for j in range(i + offset)]) + Zflat = Z[:, li, lj] + # concatenate dense features and interactions + R = torch.cat([x] + [Zflat], dim=1) elif self.arch_interaction_op == "cat": # concatenation features (into a 
row vector) R = torch.cat([x] + ly, dim=1) @@ -417,14 +486,15 @@ def distributed_forward(self, dense_x, lS_o, lS_i): z = p ### gather the distributed results on each rank ### - # For some reason it requires explicit sync before all_gather call if + # For some reason it requires explicit sync before all_gather call if # tensor is on GPU memory if z.is_cuda: torch.cuda.synchronize() (_, batch_split_lengths) = ext_dist.get_split_lengths(batch_size) z = ext_dist.all_gather(z, batch_split_lengths) #print("Z: %s" % z) + return z - + def parallel_forward(self, dense_x, lS_o, lS_i): ### prepare model (overwrite) ### # WARNING: # of devices must be >= batch size in parallel_forward call @@ -534,6 +604,30 @@ def parallel_forward(self, dense_x, lS_o, lS_i): return z0 +def dash_separated_ints(value): + vals = value.split('-') + for val in vals: + try: + int(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of ints" % value) + + return value + + +def dash_separated_floats(value): + vals = value.split('-') + for val in vals: + try: + float(val) + except ValueError: + raise argparse.ArgumentTypeError( + "%s is not a valid dash separated list of floats" % value) + + return value + + if __name__ == "__main__": ### import packages ### import sys @@ -546,11 +640,18 @@ def parallel_forward(self, dense_x, lS_o, lS_i): ) # model related parameters parser.add_argument("--arch-sparse-feature-size", type=int, default=2) - parser.add_argument("--arch-embedding-size", type=str, default="4-3-2") + + parser.add_argument( + "--arch-embedding-size", type=dash_separated_ints, default="4-3-2") + parser.add_argument("--arch-project-size", type=int, default=0) + # j will be replaced with the table number - parser.add_argument("--arch-mlp-bot", type=str, default="4-3-2") - parser.add_argument("--arch-mlp-top", type=str, default="4-2-1") - parser.add_argument("--arch-interaction-op", type=str, default="dot") + parser.add_argument( + 
"--arch-mlp-bot", type=dash_separated_ints, default="4-3-2") + parser.add_argument( + "--arch-mlp-top", type=dash_separated_ints, default="4-2-1") + parser.add_argument( + "--arch-interaction-op", type=str, choices=['dot', 'cat'], default="dot") parser.add_argument("--arch-interaction-itself", action="store_true", default=False) # embedding table options parser.add_argument("--md-flag", action="store_true", default=False) @@ -564,7 +665,8 @@ def parallel_forward(self, dense_x, lS_o, lS_i): # activations and loss parser.add_argument("--activation-function", type=str, default="relu") parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce - parser.add_argument("--loss-weights", type=str, default="1.0-1.0") # for wbce + parser.add_argument( + "--loss-weights", type=dash_separated_floats, default="1.0-1.0") # for wbce parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 parser.add_argument("--round-targets", type=bool, default=False) # data @@ -573,6 +675,15 @@ def parallel_forward(self, dense_x, lS_o, lS_i): parser.add_argument( "--data-generation", type=str, default="random" ) # synthetic or dataset + parser.add_argument("--synthetic-data-folder", type=str, + default="./synthetic_data/syn_data_bs65536") + # add Gaussian distribution + parser.add_argument("--rand-data-dist", type=str, default="uniform") # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte parser.add_argument("--raw-data-file", type=str, default="") @@ -621,10 +732,22 @@ def parallel_forward(self, dense_x, lS_o, lS_i): parser.add_argument("--mlperf-auc-threshold", 
type=float, default=0.0) parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) + + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + args = parser.parse_args() + print(socket.gethostname()) + ext_dist.init_distributed(backend=args.dist_backend) + # print("success size= ", ext_dist.my_size, ext_dist.my_rank) + + ext_dist.barrier() + if args.mlperf_logging: print('command line args: ', json.dumps(vars(args))) @@ -658,6 +781,7 @@ def parallel_forward(self, dense_x, lS_o, lS_i): else: device = torch.device("cuda", 0) ngpus = torch.cuda.device_count() # 1 + ngpus=1 print("Using {} GPU(s)...".format(ngpus)) else: device = torch.device("cpu") @@ -682,11 +806,20 @@ def parallel_forward(self, dense_x, lS_o, lS_i): ))) m_den = train_data.m_den ln_bot[0] = m_den + + elif args.data_generation == "synthetic": + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld = dd.data_loader(args, ln_emb, m_den) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + table_feature_map = None # {idx : idx for idx in range(len(ln_emb))} + else: # input and target at random ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") m_den = ln_bot[0] - train_data, train_ld = dp.make_random_data_and_loader(args, ln_emb, m_den) + train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den) nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) ### parse command line arguments ### @@ -697,10 +830,13 @@ def parallel_forward(self, dense_x, lS_o, lS_i): # approach 1: all # num_int = num_fea * num_fea + m_den_out # approach 2: unique - if args.arch_interaction_itself: - num_int = 
(num_fea * (num_fea + 1)) // 2 + m_den_out + if (args.arch_project_size > 0): + num_int = num_fea * args.arch_project_size + m_den_out else: - num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out elif args.arch_interaction_op == "cat": num_int = num_fea * m_den_out else: @@ -824,6 +960,7 @@ def parallel_forward(self, dense_x, lS_o, lS_i): ln_emb, ln_bot, ln_top, + args.arch_project_size, arch_interaction_op=args.arch_interaction_op, arch_interaction_itself=args.arch_interaction_itself, sigmoid_bot=-1, @@ -852,15 +989,15 @@ def parallel_forward(self, dense_x, lS_o, lS_i): dlrm = dlrm.to(device) # .cuda() if dlrm.ndevices > 1: dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb) - + if ext_dist.my_size > 1: if use_gpu: device_ids = [ext_dist.my_local_rank] - dlrm.bot_l = ext_dist.DDP(dlrm.bot_l, device_ids=device_ids) - dlrm.top_l = ext_dist.DDP(dlrm.top_l, device_ids=device_ids) + dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids) + dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids) else: - dlrm.bot_l = ext_dist.DDP(dlrm.bot_l) - dlrm.top_l = ext_dist.DDP(dlrm.top_l) + dlrm.bot_l = DDP(dlrm.bot_l) + dlrm.top_l = DDP(dlrm.top_l) # specify the loss function if args.loss_function == "mse": @@ -875,8 +1012,11 @@ def parallel_forward(self, dense_x, lS_o, lS_i): if not args.inference_only: # specify the optimizer algorithm + if ext_dist.my_size == 1: optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) + #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step, + # args.lr_num_decay_steps) else: optimizer = torch.optim.SGD([ {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr" : args.learning_rate}, @@ -997,8 +1137,12 @@ def loss_fn_wrap(Z, T, use_gpu, device): ) ext_dist.barrier() + startTime = time.time() + startTime0 = startTime + skipped = 0 + 
print("time/loss/accuracy (if enabled):") - with torch.autograd.profiler.profile(args.enable_profiling, use_gpu) as prof: + with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof: while k < args.nepochs: if k < skip_upto_epoch: continue @@ -1009,9 +1153,18 @@ def loss_fn_wrap(Z, T, use_gpu, device): previous_iteration_time = None for j, (X, lS_o, lS_i, T) in enumerate(train_ld): + if j == 0 and args.save_onnx: + (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i) + if j < skip_upto_batch: continue + if (skipped == 2): + ext_dist.barrier() + startTime = time.time() + ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank) + skipped = skipped + 1 + if args.mlperf_logging: current_time = time_wrap(use_gpu) if previous_iteration_time: @@ -1071,6 +1224,7 @@ def loss_fn_wrap(Z, T, use_gpu, device): # optimizer optimizer.step() + ### lr_scheduler.step() if args.mlperf_logging: total_time += iteration_time @@ -1105,11 +1259,14 @@ def loss_fn_wrap(Z, T, use_gpu, device): "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format( str_run_type, j + 1, nbatches, k, gT ) - + "loss {:.6f}, accuracy {:3.3f} %".format(gL, gA * 100) + + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL, + gA * 100, total_iter, ext_dist.my_rank) ) # Uncomment the line below to print out the total time with overhead - # print("Accumulated time so far: {}" \ - # .format(time_wrap(use_gpu) - accum_time_begin)) + if ext_dist.my_rank < 2: + tt1 = time_wrap(use_gpu) + ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \ + .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1)) total_iter = 0 total_samp = 0 @@ -1297,16 +1454,42 @@ def loss_fn_wrap(Z, T, use_gpu, device): + " reached, stop training") break + #if (ext_dist.my_rank == 0 and should_print): + # print("ITER : ", j, " from nvidia-smi") + # os.system("nvidia-smi") + k += 1 # nepochs + 
#if (ext_dist.my_rank == 0): + # # print(torch.cuda.memory_allocated(0)) + # print(torch.cuda.memory_summary(0)) + # # print("from nvidia-smi") + # os.system("nvidia-smi") + + tt2 = time.time() + endTime = tt2 - startTime + ext_dist.barrier() + tt3 = time.time() + finalTime = tt3 - startTime + if (skipped > 2): + skipped -= 2 + ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \ + iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0, + finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True) + file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank) # profiling if args.enable_profiling: os.makedirs(args.out_dir, exist_ok=True) - with open("%s.prof" % file_prefix, "w") as prof_f: - prof_f.write(prof.key_averages().table(sort_by="cpu_time_total")) - prof.export_chrome_trace("./%s.json" % file_prefix) - # print(prof.key_averages().table(sort_by="cpu_time_total")) + with open("TT"+str(uuid.uuid4().hex), "w") as prof_f: + prof_f.write(prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total", + )) + +# with open("%s.prof" % file_prefix, "w") as prof_f: +# prof_f.write(prof.key_averages().table(sort_by="cpu_time_total")) +# prof.export_chrome_trace("./%s.json" % file_prefix) +# print(prof.key_averages().table(sort_by="cpu_time_total")) # plot compute graph if args.plot_compute_graph: @@ -1328,12 +1511,12 @@ def loss_fn_wrap(Z, T, use_gpu, device): # export the model in onnx if args.save_onnx: - os.makedirs(args.out_dir, exist_ok=True) - with open("%s.onnx" % file_prefix, "w+b") as dlrm_pytorch_onnx_file: - (X, lS_o, lS_i, _) = train_data[0] # get first batch of elements - torch.onnx._export( - dlrm, (X, lS_o, lS_i), dlrm_pytorch_onnx_file, verbose=True - ) + + dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" + torch.onnx.export( + dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, 
use_external_data_format=True + ) + # recover the model back dlrm_pytorch_onnx = onnx.load("%s.onnx" % file_prefix) # check the onnx model diff --git a/extend_distributed.py b/extend_distributed.py index d7fd9dd1..e816654d 100644 --- a/extend_distributed.py +++ b/extend_distributed.py @@ -4,11 +4,15 @@ from torch.autograd import Function from torch.nn.parallel import DistributedDataParallel as DDP import torch.distributed as dist + +import profile as tm + try: import torch_ccl except ImportError as e: #print(e) torch_ccl = False +import time my_rank = -1 my_size = -1 @@ -39,6 +43,47 @@ def get_split_lengths(n): my_len = splits[my_rank] return (my_len, splits) +def get_world_rank_from_env(): + return env2int( + ["RANK", + "PMI_RANK", + "OMPI_COMM_WORLD_RANK", + "MV2_COMM_WORLD_RANK", + "SLURM_PROCID"], + -1 + ) + +def get_world_size_from_env(): + return env2int( + ["WORLD_SIZE", + "PMI_SIZE", + "OMPI_COMM_WORLD_SIZE", + "MV2_COMM_WORLD_SIZE", + "SLURM_NPROCS"], + -1 + ) + +def get_local_rank_from_env(): + return env2int( + ["LOCAL_RANK", + "MPI_LOCALRANKID", + "OMPI_COMM_WORLD_LOCAL_RANK", + "MV2_COMM_WORLD_LOCAL_RANK", + "SLURM_LOCALID", + ], + -1, + ) + +def get_local_size_from_env(): + return env2int( + ["LOCAL_SIZE", + "MPI_LOCALNRANKS", + "OMPI_COMM_WORLD_LOCAL_SIZE", + "MV2_COMM_WORLD_LOCAL_SIZE", + ], + -1, + ) + def init_distributed(rank = -1, size = -1, backend=''): global myreq global my_rank @@ -62,30 +107,74 @@ def init_distributed(rank = -1, size = -1, backend=''): if backend != '': #guess Rank and size if rank == -1: - rank = env2int(['PMI_RANK', 'OMPI_COMM_WORLD_RANK', 'MV2_COMM_WORLD_RANK', 'RANK'], 0) + rank = get_world_rank_from_env() if size == -1: - size = env2int(['PMI_SIZE', 'OMPI_COMM_WORLD_SIZE', 'MV2_COMM_WORLD_SIZE', 'WORLD_SIZE'], 1) + size = get_world_size_from_env() + assert rank >= 0 + assert size > 0 + if not os.environ.get('RANK', None) and rank != -1: os.environ['RANK'] = str(rank) if not os.environ.get('WORLD_SIZE', None) and size 
!= -1: os.environ['WORLD_SIZE'] = str(size) if not os.environ.get('MASTER_PORT', None): os.environ['MASTER_PORT'] = '29500' if not os.environ.get('MASTER_ADDR', None): - local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) - if local_size != size and backend != 'mpi': - print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default") - print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR") - os.environ['MASTER_ADDR'] = '127.0.0.1' + if "SLURM_NODELIST" in os.environ: + master_addr = os.environ["SLURM_NODELIST"].replace('-', ',').split(',')[0].replace("[", "") + elif "HOSTNAME" in os.environ: + # handle other cases ? + master_addr = os.environ["HOSTNAME"] + else: + master_addr = "127.0.0.1" + os.environ["MASTER_ADDR"] = master_addr + +# myenv = os.environ +# for e in myenv: +# print(e, "=", myenv[e]) +# print("=== Done ===") if size > 1: + my_local_rank = get_local_rank_from_env() + my_local_size = get_local_size_from_env() + if my_local_size == -1: + if "SLURM_TASKS_PER_NODE" in os.environ: + locsize = os.environ["SLURM_TASKS_PER_NODE"].split("(")[0] + my_local_size = int(locsize) + + assert(my_local_rank >= 0) + assert(my_local_size >= 0) + print("Check local rank ", my_local_rank, " size ", my_local_size, " global rank ", rank, " global size ", size, os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"]) + dist.init_process_group(backend, rank=rank, world_size=size) my_rank = dist.get_rank() my_size = dist.get_world_size() - my_local_rank = env2int(['MPI_LOCALRANKID', 'OMPI_COMM_WORLD_LOCAL_RANK', 'MV2_COMM_WORLD_LOCAL_RANK'], 0) - my_local_size = env2int(['MPI_LOCALNRANKS', 'OMPI_COMM_WORLD_LOCAL_SIZE', 'MV2_COMM_WORLD_LOCAL_SIZE'], 1) - if my_rank == 0: print("Running on %d ranks using %s backend" % (my_size, backend)) + + if my_rank >= 0: print("Running on %d ranks using %s backend" % (my_size, backend)) if hasattr(dist, 'all_to_all_single'): 
try: - dist.all_to_all_single(torch.empty([0]), torch.empty([0])) + a = torch.arange(my_size) + my_rank * my_size + b = torch.zeros(my_size).to(torch.int64) + c = torch.zeros(my_size).to(torch.int64) + for i in range(my_size): + c[i] = my_rank + i * my_size + + t1 = time.time() + if (torch.cuda.is_available): + dev = torch.device('cuda', my_local_rank) + a = a.to(dev) + b = b.to(dev) + c = c.to(dev) + dist.all_to_all_single(b, a) + if my_rank == 0: + print("alltoall on rank :", my_rank, "a = ", a, " b = ", b) + else: + dist.all_to_all_single(b, a) + t2 = time.time() + + if torch.equal(b, c): alltoall_supported = True + if my_rank == 0: + print("All to all single test passed for rank ", my_rank, " time ", t2 - t1) + else: + print("Failed alltoall single test! for rank= ", my_rank, " time ", t2 - t1) except RuntimeError: pass if a2a_impl == 'alltoall' and alltoall_supported == False: @@ -250,7 +339,8 @@ class All2All_Req(Function): @staticmethod def forward(ctx, a2ai, *inputs): global myreq - #print("All2All_Req:forward") + # print("All2All_Req:forward ", my_rank) + tm.tmA2A10.start() mb_split_lengths = a2ai.gNS if mb_split_lengths: mb_split_lengths = [m * a2ai.E for m in mb_split_lengths] emb_split_lengths = a2ai.gSS @@ -267,12 +357,14 @@ def forward(ctx, a2ai, *inputs): a2ai.emb_split_lengths = emb_split_lengths myreq.a2ai = a2ai ctx.a2ai = a2ai + tm.tmA2A10.stop() return myreq.tensor @staticmethod def backward(ctx, *grad_output): global myreq - #print("All2All_Req:backward") + # print("All2All_Req:backward ", my_rank) + tm.tmA2A12.start() a2ai = ctx.a2ai myreq.req.wait() myreq.req = None @@ -280,6 +372,7 @@ def backward(ctx, *grad_output): grad_inputs = grad_input.view([a2ai.N, -1]).split(a2ai.E, dim=1) grad_inputs = [gin.contiguous() for gin in grad_inputs] myreq.tensor = None + tm.tmA2A12.stop() return (None, *grad_inputs) @@ -287,7 +380,8 @@ class All2All_Wait(Function): @staticmethod def forward(ctx, *output): global myreq - #print("All2All_Wait:forward") + # 
print("All2All_Wait:forward ", my_rank) + tm.tmA2A11.start() a2ai = myreq.a2ai ctx.a2ai = a2ai myreq.req.wait() @@ -296,12 +390,15 @@ def forward(ctx, *output): emb_split_lengths = a2ai.emb_split_lengths if a2ai.emb_split_lengths else a2ai.lS * a2ai.lN * a2ai.E outputs = output[0].split(emb_split_lengths) outputs = tuple([out.view([a2ai.lN, -1]) for out in outputs]) + tm.tmA2A11.stop() + # print("All2All_Wait:forward done") return outputs @staticmethod def backward(ctx, *grad_outputs): global myreq - #print("All2All_Wait:backward") + # print("All2All_Wait:backward ", my_rank) + tm.tmA2A13.start() a2ai = ctx.a2ai grad_outputs = [gout.contiguous().view([-1]) for gout in grad_outputs] grad_output = torch.cat(grad_outputs) @@ -309,6 +406,8 @@ def backward(ctx, *grad_outputs): req = dist.all_to_all_single(grad_input, grad_output, a2ai.mb_split_lengths, a2ai.emb_split_lengths, async_op=True) myreq.req = req myreq.tensor = grad_input + tm.tmA2A13.stop() + # print("All2All_Wait:backward done") return (grad_output,) class AllGather(Function): @@ -374,7 +473,7 @@ def alltoall(inputs, per_rank_split_lengths): a2ai.S = sum(per_rank_split_lengths) if per_rank_split_lengths else a2ai.lS * my_size if a2a_impl == '' and alltoall_supported or a2a_impl == 'alltoall': - #print("Using All2All_Req") + print("Using All2All_Req") output = All2All_Req.apply(a2ai, *inputs) myreq.WaitFunction = All2All_Wait elif a2a_impl == '' or a2a_impl == 'scatter': diff --git a/job.all.sh b/job.all.sh new file mode 100644 index 00000000..cf56aaca --- /dev/null +++ b/job.all.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +#SBATCH --job-name=testdlrm #The name you want the job to have +#SBATCH --output=/private/home/hongzhang/tmp/dlrm/output-%j +#SBATCH --error=/private/home/hongzhang/tmp/dlrm/error-%j +#SBATCH --nodes=1 # -C volta32gb #The number of compute nodes to use +#SBATCH --ntasks=8 #The total number of cpu tasks to run +#SBATCH --time=00:40:00 # max time +#SBATCH --exclusive # exclusive nodes +#SBATCH 
--gres=gpu:volta:8 -C volta32gb +#SBATCH --mem-per-cpu=60GB + +# for mpirun host file +echo $SLURM_NODELIST +echo $SLURM_NODELIST > hostfile1 + +source /private/home/hongzhang/.zshrc +#module purge +#module load anaconda3/2019.07 +#module load cuda/10.1 +#module load cudnn/v7.6.5.32-cuda.10.1 +#module load openmpi/4.0.2/gcc.7.4.0-cuda.10.1 + +#export NCCL_ROOT_DIR=/private/home/hongzhang/codes/nccl/build +#export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NCCL_ROOT_DIR/lib +#export CUDA_PATH=$CUDA_HOME +#export CUDNN_PATH=$CUDNN_ROOT_DIR +#export MPI_PATH=$MPI_HOME +#export NCCL_PATH=$NCCL_ROOT_DIR + +conda activate mytorch + +which python3 + +# large_arch_emb="2600-2600-2600-2600-2600-2600-2600-2600" +# large_arch_emb="26000000-26000000-26000000-26000000-26000000-26000000-26000000-26000000" +large_arch_emb_usr=$(printf '260%.0s' {1..815}) +large_arch_emb_usr=${large_arch_emb_usr//"02"/"0-2"} +large_arch_emb_ads=$(printf '140%.0s' {1..544}) +large_arch_emb_ads=${large_arch_emb_ads//"01"/"0-1"} +large_arch_emb="$large_arch_emb_usr-$large_arch_emb_ads" + +# --hostfile hostfile1 +# random +# /public/apps/openmpi/4.0.2/gcc.7.4.0/bin/mpirun -prefix /public/apps/openmpi/4.0.2/gcc.7.4.0/ -v -np 8 python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 --arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=random --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=30 --num-batches=4 --arch-project-size=30 + +# fb_synthetic +/public/apps/openmpi/4.0.2/gcc.7.4.0/bin/mpirun -prefix /public/apps/openmpi/4.0.2/gcc.7.4.0/ -v -np 8 python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 
--arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=synthetic --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=2048 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=28 --num-batches=4 --arch-project-size=30 + +# srun --label /private/home/hongzhang/.conda/envs/mytorch/bin/python3 dlrm_s_pytorch.py --arch-sparse-feature-size=64 --arch-mlp-bot="2000-1024-1024-1024-1024-1024-1024-1024-1024-1024-1024-512-64" --arch-mlp-top="4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-4096-1" --arch-embedding-size=$large_arch_emb --data-generation=random --loss-function=bce --round-targets=True --learning-rate=0.1 --mini-batch-size=128 --print-freq=1 --print-time --test-mini-batch-size=10240 --test-num-workers=16 --use-gpu --dist-backend='nccl' --num-indices-per-lookup-fixed=1 --num-indices-per-lookup=30 --num-batches=4 diff --git a/profile.py b/profile.py new file mode 100644 index 00000000..bb29033a --- /dev/null +++ b/profile.py @@ -0,0 +1,139 @@ +# Add some self profiling information +# Allow nested timer exists + +import time + +class TimerError(Exception): + """Exception in ProfTimer class""" + +class ProfTimer: + def __init__(self, timername="Timer for DLRM Activity"): + self._name = timername + self._start = 0.0 + self._count = 0 + self._elapsed = 0.0 + + def start(self): + """Start a new timer""" + self._start = time.perf_counter() + + def stop(self): + if self._start == 0.0: + raise TimerError("Timer is not running.") + self._elapsed += time.perf_counter() - self._start + self._count += 1 + self._start = 0.0 + + def count(self): + return self._count + + def reset(self): + self._elapsed = 0.0 + self._count = 0 + + def elapsed(self): + return 
self._elapsed + + def output(self, level): + if level == 0: + print(f"{self._name }: {self._elapsed:0.6f} seconds with counts {self._count}") + else: + print(f" {self._name }: {self._elapsed:0.6f} seconds with counts {self._count}") + +alltimers = [] +tmGetData = ProfTimer("GetData") +tmFwd = ProfTimer("Forword") +tmLoss = ProfTimer("Loss ") +tmZero = ProfTimer("Zero ") +tmBwd = ProfTimer("Backwrd") +tmOpt = ProfTimer("Opt ") +tmSync = ProfTimer("CudaSyn") +tmSync1 = ProfTimer("CudaSy1") +tmSync2 = ProfTimer("CudaSy2") +tmSync3 = ProfTimer("CudaSy3") + +tmH2D = ProfTimer("CopyH2D") +tmEmb = ProfTimer("EMB ") +tmA2A = ProfTimer("All2All") +tmA2A1 = ProfTimer("All2All1") +tmBot = ProfTimer("Bottom ") +tmInt = ProfTimer("Inter ") +tmTop = ProfTimer("Top MLP") +tmAllGa = ProfTimer("Allgath") + +tmA2A10 = ProfTimer("All2All10") +tmA2A11 = ProfTimer("All2All11") +tmA2A12 = ProfTimer("All2All12") +tmA2A13 = ProfTimer("All2All13") + +def tmClear(): + + tmGetData.reset() + tmFwd.reset() + tmLoss.reset() + tmZero.reset() + tmBwd.reset() + tmOpt.reset() + tmSync.reset() + tmSync1.reset() + tmSync2.reset() + tmSync3.reset() + + tmH2D.reset() + tmEmb.reset() + tmA2A.reset() + tmA2A1.reset() + tmBot.reset() + tmInt.reset() + tmTop.reset() + tmAllGa.reset() + + tmA2A10.reset() + tmA2A11.reset() + tmA2A12.reset() + tmA2A13.reset() + +def tmSummary(pid): + + print("Summary of the tm timers:") + print("---------{:6d}----------------".format(pid)) + tmGetData.output(0) + tmFwd.output(0) + tmH2D.output(1) + tmEmb.output(1) + tmA2A.output(1) + tmA2A1.output(1) + tmBot.output(1) + tmInt.output(1) + tmTop.output(1) + tmAllGa.output(1) + tmLoss.output(0) + tmZero.output(0) + tmBwd.output(0) + tmOpt.output(0) +# tmSync.output(0) + tmSync1.output(0) + tmSync2.output(0) + tmSync3.output(0) + + tmA2A10.output(1) + tmA2A11.output(1) + tmA2A12.output(1) + tmA2A13.output(1) + print("========={:6d}================".format(pid)) + +if __name__ == "__main__": + + t1 = ProfTimer("Test1") + 
t1.start() + time.sleep(3) + t1.stop() + t1.elapsed() + t1.output(0) + + t1.start() + time.sleep(5) + t1.stop() + t1.output(0) + + diff --git a/project.py b/project.py new file mode 100644 index 00000000..b83fa85e --- /dev/null +++ b/project.py @@ -0,0 +1,67 @@ + +# This feature can be used to reduce the memory size consumed by the feature layer of the top MLP. +# Suppose we have n sparse features, each sparse feature is represented by an embedding of size d, +# then, we can represent the sparse embeddings by a matrix X = (n, d). The dot product between sparse +# features is X(X^T), which is a symmetric matrix of (n, n) and will be fed into the top MLP. +# Actually, we only need the upper or lower triangle to eliminate duplication. If n is large, +# such as, n = 1000, then the number of dot features fed into the MLP will be n^2/2 = 500,000. +# Considering the layer size 4096, the weight parameters will be a matrix (n^2/2, 4096), which +# may consume a large amount of precious memory resources. + +# To reduce the number of dot features, we introduce a parameter called arch-project-size (k) to compress +# the embeddings. We introduce a parameter matrix Y = (n, k) to compute the weighted sum of the +# dot features. The compressed embeddings are represented by (X^T)Y. Then, we compute the compressed dot +# features by X(X^T)Y = (n, k). Therefore, we can reduce the dot features fed into MLP from n*n/2 +# to n*k. 
+ +import sys +import torch +import torch.nn as nn +import numpy as np + +""" +Compute the projected dot features +T: (batch_size, n, d), batched raw embeddings +x: dense features +proj_layer: the projection layer created by create_proj +""" +def project(T, x, proj_layer): + + TT = torch.transpose(T, 1, 2) + # TS = torch.reshape(TT, (-1, TT.size(2))) + # TC = proj_layer(TS) + # TR = torch.reshape(TC, (-1, T.shape[2], k)) + TR = proj_layer(TT) + Z = torch.bmm(T, TR) + Zflat = Z.view((T.shape[0], -1)) + R = torch.cat([x] + [Zflat], dim=1) + + return R + +""" +Create the project layer +n: number of sparse features +m: projection size +""" +def create_proj(n, m): + # build MLP layer by layer + layers = nn.ModuleList() + # construct fully connected operator + LL = nn.Linear(int(n), int(m), bias=True) + + # initialize the weights + # with torch.no_grad(): + # custom Xavier input, output or two-sided fill + mean = 0.0 # std_dev = np.sqrt(variance) + std_dev = np.sqrt(2 / (m + n)) # np.sqrt(1 / m) # np.sqrt(1 / n) + W = np.random.normal(mean, std_dev, size=(m, n)).astype(np.float32) + std_dev = np.sqrt(1 / m) # np.sqrt(2 / (m + 1)) + bt = np.random.normal(mean, std_dev, size=m).astype(np.float32) + # approach 1 + LL.weight.data = torch.tensor(W, requires_grad=True) + LL.bias.data = torch.tensor(bt, requires_grad=True) + # approach 2: constant value ? + layers.append(LL) + + return torch.nn.Sequential(*layers) + diff --git a/requirements.txt b/requirements.txt index c5cad56a..b198a127 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ pydot torch torchviz scikit-learn +tqdm diff --git a/tools/visualize.py b/tools/visualize.py new file mode 100755 index 00000000..f16504cb --- /dev/null +++ b/tools/visualize.py @@ -0,0 +1,1030 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+# +# +# This script performs the visualization of the embedding tables created in +# DLRM during the training procedure. We use two popular techniques for +# visualization: umap (https://umap-learn.readthedocs.io/en/latest/) and +# tsne (https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html). +# These links also provide instructions on how to install these packages +# in different environments. +# +# Warning: the size of the data to be visualized depends on the RAM on your machine. +# +# +# Command line examples: +# +# Full analysis of embeddings and data representations for Criteo Kaggle data: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 +# --raw-data-file=../../criteo/input/train.txt --skip-categorical-analysis +# --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz +# +# +# To run just the analysis of categorical data for Criteo Kaggle data set: +# $python ./tools/visualize.py --data-set=kaggle --load-model=../dlrm-2020-05-25/criteo.pytorch-e-0-i-110591 \ +# --raw-data-file=../../criteo/input/train.txt --data-randomize=none --processed-data-file=../../criteo/input/kaggleAdDisplayChallenge_processed.npz \ +# --skip-embedding --skip-data-plots +# +# +# The following command line arguments are available to the user: +# +# --load-model - DLRM model file +# --data-set - one of ["kaggle", "terabyte"] +# --max-ind-range - max index range used during the training +# --output-dir - output directory, if not specified, it will be created from the model and dataset names +# --max-umap-size - max number of points to visualize using UMAP, default=50000 +# --use-tsne - use T-SNE +# --max-tsne-size - max number of points to visualize using T-SNE, default=1000 +# --skip-embedding - skips analysis of embedding tables +# --umap-metric - metric for UMAP +# --skip-data-plots - skips data plots +# --skip-categorical-analysis - skips categorical analysis +# +# # data 
file related +# --raw-data-file +# --processed-data-file +# --data-sub-sample-rate +# --data-randomize +# --memory-map +# --mini-batch-size +# --num-workers +# --test-mini-batch-size +# --test-num-workers +# --num-batches +# --mlperf-logging + +import os +import sys +import argparse +import numpy as np +import umap +import hdbscan +import json +import torch +import math +import matplotlib +import matplotlib.pyplot as plt +import collections + +from sklearn.metrics import accuracy_score +from sklearn.metrics import f1_score +from sklearn.metrics import precision_score +from sklearn.metrics import recall_score + +from sklearn import manifold + +import dlrm_data_pytorch as dp +from dlrm_s_pytorch import DLRM_Net + + +def visualize_embeddings_umap(emb_l, + output_dir = "", + max_size = 500000, + umap_metric = "euclidean", + cat_counts = None, + use_max_count = True): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu().numpy() + print("umap", E.shape) + + # create histogram of norms + bins = 50 + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] +# plt.hist(norms, bins = bins) +# plt.title("Cat norm hist var. 
"+str(k)) + hist, bins = np.histogram(norms, bins=bins) + logbins = np.logspace(np.log10(bins[0]),np.log10(bins[-1]),len(bins)) + + plt.figure(figsize=(8,8)) + plt.title("Categorical norms: " + str(k) + " cardinality " + str(E.shape[0] if cat_counts is None else len(cat_counts[k]))) + plt.hist(norms, bins=logbins) + plt.xscale("log") +# plt.legend() + plt.savefig(output_dir+"/cat-norm-histogram-"+str(k)+".png") + plt.close() + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + min_cnt = 0 + +# reducer = umap.UMAP(random_state=42, n_neighbors=25, min_dist=0.1) + reducer = umap.UMAP(random_state=42, metric=umap_metric) + + if use_max_count is False or cat_counts is None or n_vis == E.shape[0]: + Y = reducer.fit_transform(E[:n_vis,:]) + else: + + # select values with counts > 1 + done = False + min_cnt = 1 + while done == False: + el_cnt = (cat_counts[k] > min_cnt).sum() + if el_cnt <= max_size: + done = True + else: + min_cnt = min_cnt+1 + + E1= [] + for i in range(0, E.shape[0]): + if cat_counts[k][i] > min_cnt: + E1.append(E[i,:]) + + print("max_count_len", len(E1), "mincount", min_cnt) + Y = reducer.fit_transform(np.array(E1)) + + n_vis = len(E1) + + plt.figure(figsize=(8,8)) + + linewidth = 0 + size = 1 + + if Y.shape[0] < 2500: + linewidth = 1 + size = 5 + + if cat_counts is None: + plt.scatter(-Y[:,0], -Y[:,1], s=size, marker=".", linewidth=linewidth) + else: + #print(cat_counts[k]) + n_disp = min(len(cat_counts[k]), Y.shape[0]) + cur_max = math.log(max(cat_counts[k])) + norm_cat_count = [math.log(cat_counts[k][i]+1)/cur_max for i in range(0, len(cat_counts[k]))] + plt.scatter(-Y[0:n_disp,0], -Y[0:n_disp,1], s=size, marker=".", linewidth=linewidth, c=np.array(norm_cat_count)[0:n_disp], cmap="viridis") + plt.colorbar() + + plt.title("UMAP: categorical var. 
" + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ", min count " + str(min_cnt) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-umap.png") + plt.close() + + +def visualize_embeddings_tsne(emb_l, + output_dir = "", + max_size = 10000): + + for k in range(0, len(emb_l)): + + E = emb_l[k].weight.detach().cpu() + print("tsne", E.shape) + + if E.shape[0] < 20: + print("Skipping small embedding") + continue + + n_vis = min(max_size, E.shape[0]) + + tsne = manifold.TSNE(init="pca", random_state=0, method="exact") + + Y = tsne.fit_transform(E[:n_vis,:]) + + plt.figure(figsize=(8, 8)) + + linewidth = 0 + if Y.shape[0] < 5000: + linewidth = 1 + + plt.scatter(-Y[:,0], -Y[:,1], s=1, marker=".", linewidth=linewidth) + + plt.title("TSNE: categorical var. " + str(k) + " (" + str(n_vis) + " of " + str(E.shape[0]) + ")") + plt.savefig(output_dir + "/cat-" + str(k) + "-" + str(n_vis) + "-of-" + str(E.shape[0]) + "-tsne.png") + plt.close() + + +def analyse_categorical_data(X_cat, n_days=10, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + n_days = n_days + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + day_size = all_cat.shape[0]/n_days + + for i in range(0,n_cat): + l_d = [] + l_s1 = [] + l_s2 = [] + l_int = [] + l_rem = [] + + cat = all_cat[:,i] + print("cat", i, cat.shape) + for d in range(1,n_days): + offset = int(d*day_size) + #print(offset) + cat1 = cat[:offset] + cat2 = cat[offset:] + + s1 = set(cat1) + s2 = set(cat2) + + intersect = list(s1 & s2) + #print(intersect) + l_d.append(d) + l_s1.append(len(s1)) + l_s2.append(len(s2)) + l_int.append(len(intersect)) + l_rem.append((len(s1)-len(intersect))) + + print(d, ",", len(s1), ",", len(s2), ",", len(intersect), ",", (len(s1)-len(intersect))) + + print("spit", l_d) + print("before", l_s1) + print("after", 
l_s2) + print("inters.", l_int) + print("removed", l_rem) + + plt.figure(figsize=(8,8)) + plt.plot(l_d, l_s1, "g", label="before") + plt.plot(l_d, l_s2, "r", label="after") + plt.plot(l_d, l_int, "b", label="intersect") + plt.plot(l_d, l_rem, "y", label="removed") + plt.title("categorical var. "+str(i)) + plt.legend() + plt.savefig(output_dir+"/cat-"+str(i).zfill(3)+".png") + plt.close() + + +def analyse_categorical_counts(X_cat, emb_l=None, output_dir=""): + + # analyse categorical variables + n_vec = len(X_cat) + n_cat = len(X_cat[0]) + + print("n_vec", n_vec, "n_cat", n_cat) +# for c in train_data.X_cat: +# print(n_cat, c) + + all_cat = np.array(X_cat) + print("all_cat.shape", all_cat.shape) + + all_counts = [] + + for i in range(0,n_cat): + + cat = all_cat[:,i] + if emb_l is None: + s = set(cat) + counts = np.zeros((len(s))) + print("cat", i, cat.shape, len(s)) + else: + s = emb_l[i].weight.detach().cpu().shape[0] + counts = np.zeros((s)) + print("cat", i, cat.shape, s) + + for d in range(0,n_vec): + cv = int(cat[d]) + counts[cv] = counts[cv]+1 + + all_counts.append(counts) + + if emb_l is None: + plt.figure(figsize=(8,8)) + plt.plot(counts) + plt.title("Categorical var "+str(i) + " cardinality " + str(len(counts))) + # plt.legend() + else: + E = emb_l[i].weight.detach().cpu().numpy() + norms = [np.linalg.norm(E[i], ord=2) for i in range(0,E.shape[0])] + + fig, (ax0, ax1) = plt.subplots(2, 1) + fig.suptitle("Categorical variable: " + str(i)+" cardinality "+str(len(counts))) + + ax0.plot(counts) + ax0.set_yscale("log") + ax0.set_title("Counts", fontsize=10) + + ax1.plot(norms) + ax1.set_title("Norms", fontsize=10) + + plt.savefig(output_dir+"/cat_counts-"+str(i).zfill(3)+".png") + plt.close() + + return all_counts + + +def dlrm_output_wrap(dlrm, X, lS_o, lS_i, T): + + all_feat_vec = [] + all_cat_vec = [] + x_vec = None + t_out = None + c_out = None + z_out = [] + p_out = None + + z_size = len(dlrm.top_l) + + x = dlrm.apply_mlp(X, dlrm.bot_l) + # debug prints + 
#print("intermediate") + #print(x[0].detach().cpu().numpy()) + x_vec = x[0].detach().cpu().numpy() + all_feat_vec.append(x_vec) +# all_X.append(x[0].detach().cpu().numpy()) + + # process sparse features(using embeddings), resulting in a list of row vectors + ly = dlrm.apply_emb(lS_o, lS_i, dlrm.emb_l) + + for e in ly: + #print(e.detach().cpu().numpy()) + all_feat_vec.append(e[0].detach().cpu().numpy()) + all_cat_vec.append(e[0].detach().cpu().numpy()) + + all_feat_vec= np.concatenate(all_feat_vec, axis=0) + all_cat_vec= np.concatenate(all_cat_vec, axis=0) + +# all_features.append(all_feat_vec) +# all_cat.append(all_cat_vec) + t_out = int(T.detach().cpu().numpy()[0,0]) +# all_T.append(int(T.detach().cpu().numpy()[0,0])) + + z = dlrm.interact_features(x, ly) + # print(z.detach().cpu().numpy()) +# z_out = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[0].append(z.detach().cpu().numpy().flatten()) + + # obtain probability of a click (using top mlp) +# print(dlrm.top_l) +# p = dlrm.apply_mlp(z, dlrm.top_l) + + for i in range(0, z_size): + z = dlrm.top_l[i](z) + +# if i < z_size-1: +# curr_z = z.detach().cpu().numpy().flatten() + z_out.append(z.detach().cpu().numpy().flatten()) +# all_z[i+1].append(curr_z) +# print("z append", i) + +# print("z",i, z.detach().cpu().numpy().flatten().shape) + + p = z + + # clamp output if needed + if 0.0 < dlrm.loss_threshold and dlrm.loss_threshold < 1.0: + z = torch.clamp(p, min=dlrm.loss_threshold, max=(1.0 - dlrm.loss_threshold)) + else: + z = p + + class_thresh = 0.0 #-0.25 + zp = z.detach().cpu().numpy()[0,0]+ class_thresh + + p_out = int(zp+0.5) + if p_out > 1: + p_out = 1 + if p_out < 0: + p_out = 0 + +# all_pred.append(int(z.detach().cpu().numpy()[0,0]+0.5)) + + #print(int(z.detach().cpu().numpy()[0,0]+0.5)) + if int(p_out) == t_out: + c_out = 0 + else: + c_out = 1 + + return all_feat_vec, x_vec, all_cat_vec, t_out, c_out, z_out, p_out + + +def create_umap_data(dlrm, data_ld, 
max_size=50000, offset=0, info=""): + + all_features = [] + all_X = [] + all_cat = [] + all_T = [] + all_c = [] + all_z = [] + all_pred = [] + + z_size = len(dlrm.top_l) + print("z_size", z_size) + for i in range(0, z_size): + all_z.append([]) + + for j, (X, lS_o, lS_i, T) in enumerate(data_ld): + + if j < offset: + continue + + if j >= max_size+offset: + break + + af, x, cat, t, c, z, p = dlrm_output_wrap(dlrm, X, lS_o, lS_i, T) + + all_features.append(af) + all_X.append(x) + all_cat.append(cat) + all_T.append(t) + all_c.append(c) + all_pred.append(p) + + for i in range(0, z_size): + all_z[i].append(z[i]) + +# # calculate classifier metrics + ac = accuracy_score(all_T, all_pred) + f1 = f1_score(all_T, all_pred) + ps = precision_score(all_T, all_pred) + rc = recall_score(all_T, all_pred) + + print(info, "accuracy", ac, "f1", f1, "precision", ps, "recall", rc) + + return all_features, all_X, all_cat, all_T, all_z, all_c, all_pred + + +def plot_all_data_3(umap_Y, + umap_T, + train_Y = None, + train_T = None, + test_Y = None, + test_T = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + colors = ["red","green"] + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: " + info + " space dim "+str(orig_space_dim)) + + ax0.scatter(umap_Y[:,0], umap_Y[:,1], s=size, c=umap_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax0.set_title("UMAP ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ax1.scatter(train_Y[:,0], train_Y[:,1], s=size, c=train_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + ax1.set_title("Train ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ax2.scatter(test_Y[:,0], test_Y[:,1], s=size, c=test_T, cmap=matplotlib.colors.ListedColormap(colors), marker=".", linewidth=0) + 
ax2.set_title("Test ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def plot_one_class_3(umap_Y, + umap_T, + train_Y, + train_T, + test_Y, + test_T, + target = 0, + col = "red", + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + size = 1 + + fig, (ax0, ax1, ax2) = plt.subplots(1, 3) + fig.suptitle("UMAP: "+ info + " space dim "+str(orig_space_dim)) + + ind_l_umap = [i for i,x in enumerate(umap_T) if x == target] + Y_umap_l = np.array([umap_Y[i,:] for i in ind_l_umap]) + + ax0.scatter(Y_umap_l[:,0], Y_umap_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax0.set_title("UMAP, ("+str(len(umap_T))+" of "+ total_train_size+")", fontsize=7) + + if train_Y is not None and train_T is not None: + ind_l_test = [i for i,x in enumerate(train_T) if x == target] + Y_test_l = np.array([train_Y[i,:] for i in ind_l_test]) + + ax1.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax1.set_title("Train, ("+str(len(train_T))+" of "+ total_train_size+")", fontsize=7) + + if test_Y is not None and test_T is not None: + ind_l_test = [i for i,x in enumerate(test_T) if x == target] + Y_test_l = np.array([test_Y[i,:] for i in ind_l_test]) + + ax2.scatter(Y_test_l[:,0], Y_test_l[:,1], s=size, c=col, marker=".", linewidth=0) + ax2.set_title("Test, ("+str(len(test_T))+" of "+ total_test_size+")", fontsize=7) + + plt.savefig(output_dir+"/"+info+"-umap.png") + plt.close() + + +def visualize_umap_data(umap_Y, + umap_T, + umap_C, + umap_P, + train_Y, + train_T, + train_C, + train_P, + test_Y = None, + test_T = None, + test_C = None, + test_P = None, + total_train_size = "", + total_test_size = "", + info = "", + output_dir = "", + orig_space_dim = 0): + + # all classes + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + total_train_size = 
total_train_size, + total_test_size = total_test_size, + info = info, + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # all predictions + plot_all_data_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+", all-predictions", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + + # class 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info+" class " + str(0), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # class 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_T, + train_Y = train_Y, + train_T = train_T, + test_Y = test_Y, + test_T = test_T, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " class " + str(1), + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # correct classification + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 0, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " correct ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # errors + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_C, + train_Y = train_Y, + train_T = train_C, + test_Y = test_Y, + test_T = test_C, + target = 1, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " errors ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 0 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = 
test_Y, + test_T = test_P, + target = 0, + col = "red", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-0 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + + # prediction 1 + plot_one_class_3(umap_Y = umap_Y, + umap_T = umap_P, + train_Y = train_Y, + train_T = train_P, + test_Y = test_Y, + test_T = test_P, + target = 1, + col = "green", + total_train_size = total_train_size, + total_test_size = total_test_size, + info = info + " predict-1 ", + output_dir = output_dir, + orig_space_dim = orig_space_dim) + +def hdbscan_clustering(umap_data, train_data, test_data, info="", output_dir=""): + + clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500, prediction_data=True) + umap_labels = clusterer.fit_predict(umap_data) + train_labels, _ = hdbscan.approximate_predict(clusterer, train_data) + test_labels, _ = hdbscan.approximate_predict(clusterer, test_data) + + fig, ((ax00, ax01, ax02), (ax10, ax11, ax12)) = plt.subplots(2, 3) + fig.suptitle("HDBSCAN clastering: "+ info ) + + # plot umap data + umap_clustered = (umap_labels >= 0) + umap_coll = collections.Counter(umap_clustered) + print("umap_clustered", umap_coll) +# print("umap_data", umap_data.shape) +# print("~umap_clustered", umap_clustered.count(False), ~umap_clustered) + ax00.scatter(umap_data[~umap_clustered, 0], + umap_data[~umap_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax00.set_title("UMAP Outliers " + str(umap_coll[False]), fontsize=7) + ax10.scatter(umap_data[umap_clustered, 0], + umap_data[umap_clustered, 1], + c=umap_labels[umap_clustered], + s=0.1, + cmap="Spectral") + ax10.set_title("UMAP Inliers " + str(umap_coll[True]), fontsize=7) + + # plot train data + train_clustered = (train_labels >= 0) + train_coll = collections.Counter(train_clustered) + ax01.scatter(train_data[~train_clustered, 0], + train_data[~train_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax01.set_title("Train Outliers " + 
str(train_coll[False]), fontsize=7) + ax11.scatter(train_data[train_clustered, 0], + train_data[train_clustered, 1], + c=train_labels[train_clustered], + s=0.1, + cmap="Spectral") + ax11.set_title("Train Inliers " + str(train_coll[True]), fontsize=7) + + # plot test data + test_clustered = (test_labels >= 0) + test_coll = collections.Counter(test_clustered) + ax02.scatter(test_data[~test_clustered, 0], + test_data[~test_clustered, 1], + c=(0.5, 0.5, 0.5), + s=0.1, + alpha=0.5) + ax02.set_title("Tets Outliers " + str(test_coll[False]), fontsize=7) + ax12.scatter(test_data[test_clustered, 0], + test_data[test_clustered, 1], + c=test_labels[test_clustered], + s=0.1, + cmap="Spectral") + ax12.set_title("Test Inliers " + str(test_coll[True]), fontsize=7) + + plt.savefig(output_dir+"/"+info+"-hdbscan.png") + plt.close() + + +def visualize_all_data_umap(dlrm, + train_ld, + test_ld = None, + max_umap_size = 50000, + output_dir = "", + umap_metric = "euclidean"): + + data_ratio = 1 + + print("creating umap data") + umap_train_feat, umap_train_X, umap_train_cat, umap_train_T, umap_train_z, umap_train_c, umap_train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size, offset=0, info="umap") + + # transform train and test data + train_feat, train_X, train_cat, train_T, train_z, train_c, train_p = create_umap_data(dlrm=dlrm, data_ld=train_ld, max_size=max_umap_size*data_ratio, offset=max_umap_size, info="train") + test_feat, test_X, test_cat, test_T, test_z, test_c, test_p = create_umap_data(dlrm=dlrm, data_ld=test_ld, max_size=max_umap_size*data_ratio, offset=0, info="test") + + print("umap_train_feat", np.array(umap_train_feat).shape) + reducer_all_feat = umap.UMAP(random_state=42, metric=umap_metric) + umap_feat_Y = reducer_all_feat.fit_transform(umap_train_feat) + + train_feat_Y = reducer_all_feat.transform(train_feat) + test_feat_Y = reducer_all_feat.transform(test_feat) + + visualize_umap_data(umap_Y = umap_feat_Y, + umap_T = umap_train_T, + umap_C = 
umap_train_c, + umap_P = umap_train_p, + train_Y = train_feat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_feat_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "all-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_feat).shape[1]) + + hdbscan_clustering(umap_data = umap_feat_Y, + train_data = train_feat_Y, + test_data = test_feat_Y, + info = "umap-all-features", + output_dir = output_dir) + +# hdbscan_clustering(umap_data = np.array(umap_train_feat), +# train_data = np.array(train_feat), +# test_data = np.array(test_feat), +# info = "all-features", +# output_dir = output_dir) + + print("umap_train_X", np.array(umap_train_X).shape) + reducer_X = umap.UMAP(random_state=42, metric=umap_metric) + umap_X_Y = reducer_X.fit_transform(umap_train_X) + + train_X_Y = reducer_X.transform(train_X) + test_X_Y = reducer_X.transform(test_X) + + visualize_umap_data(umap_Y = umap_X_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_X_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_X_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cont-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_X).shape[1]) + + print("umap_train_cat", np.array(umap_train_cat).shape) + reducer_cat = umap.UMAP(random_state=42, metric=umap_metric) + umap_cat_Y = reducer_cat.fit_transform(umap_train_cat) + + train_cat_Y = reducer_cat.transform(train_cat) + test_cat_Y = reducer_cat.transform(test_cat) + + visualize_umap_data(umap_Y = umap_cat_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_cat_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_cat_Y, + test_T = test_T, + 
test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "cat-features", + output_dir = output_dir, + orig_space_dim = np.array(umap_train_cat).shape[1]) + + # UMAP for z data + for i in range(0,len(umap_train_z)): + print("z", i, np.array(umap_train_z[i]).shape) + reducer_z = umap.UMAP(random_state=42, metric=umap_metric) + umap_z_Y = reducer_z.fit_transform(umap_train_z[i]) + + train_z_Y = reducer_z.transform(train_z[i]) + test_z_Y = reducer_z.transform(test_z[i]) + + visualize_umap_data(umap_Y = umap_z_Y, + umap_T = umap_train_T, + umap_C = umap_train_c, + umap_P = umap_train_p, + train_Y = train_z_Y, + train_T = train_T, + train_C = train_c, + train_P = train_p, + test_Y = test_z_Y, + test_T = test_T, + test_C = test_c, + test_P = test_p, + total_train_size = str(len(train_ld)), + total_test_size = str(len(test_ld)), + info = "z-features-"+str(i), + output_dir = output_dir, + orig_space_dim = np.array(umap_train_z[i]).shape[1]) + + +def analyze_model_data(output_dir, + dlrm, + train_ld, + test_ld, + train_data, + skip_embedding = False, + use_tsne = False, + max_umap_size = 50000, + max_tsne_size = 10000, + skip_categorical_analysis = False, + skip_data_plots = False, + umap_metric = "euclidean"): + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if skip_embedding is False: + + cat_counts = None + + cat_counts = analyse_categorical_counts(X_cat=train_data.X_cat, emb_l=dlrm.emb_l, output_dir=output_dir) + + visualize_embeddings_umap(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_umap_size, + umap_metric = umap_metric, + cat_counts = cat_counts) + + if use_tsne is True: + visualize_embeddings_tsne(emb_l = dlrm.emb_l, + output_dir = output_dir, + max_size = max_tsne_size) + + # data visualization and analysis + if skip_data_plots is False: + visualize_all_data_umap(dlrm=dlrm, train_ld=train_ld, test_ld=test_ld, max_umap_size=max_umap_size, output_dir=output_dir, 
if __name__ == "__main__":
    # Script entry point: rebuild the DLRM architecture matching the selected
    # Criteo dataset, optionally load trained weights, load the data and run
    # the requested embedding / data visualizations.

    ### parse arguments ###
    parser = argparse.ArgumentParser(
        description="Exploratory DLRM analysis"
    )

    parser.add_argument("--load-model", type=str, default="")
    parser.add_argument("--data-set", choices=["kaggle", "terabyte"], help="dataset")
    parser.add_argument("--max-ind-range", type=int, default=-1)
    parser.add_argument("--output-dir", type=str, default="")
    parser.add_argument("--skip-embedding", action="store_true", default=False)
    parser.add_argument("--umap-metric", type=str, default="euclidean")
    parser.add_argument("--skip-data-plots", action="store_true", default=False)
    parser.add_argument("--skip-categorical-analysis", action="store_true", default=False)

    # umap related
    parser.add_argument("--max-umap-size", type=int, default=50000)
    # tsne related
    parser.add_argument("--use-tsne", action="store_true", default=False)
    parser.add_argument("--max-tsne-size", type=int, default=1000)
    # data file related
    parser.add_argument("--raw-data-file", type=str, default="")
    parser.add_argument("--processed-data-file", type=str, default="")
    parser.add_argument("--data-sub-sample-rate", type=float, default=0.0)  # in [0, 1]
    parser.add_argument("--data-randomize", type=str, default="total")  # none, total or day
    parser.add_argument("--memory-map", action="store_true", default=False)
    parser.add_argument("--mini-batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=0)
    parser.add_argument("--test-mini-batch-size", type=int, default=1)
    parser.add_argument("--test-num-workers", type=int, default=0)
    parser.add_argument("--num-batches", type=int, default=0)
    # mlperf logging (disables other output and stops early)
    parser.add_argument("--mlperf-logging", action="store_true", default=False)

    args = parser.parse_args()

    print("command line args: ", json.dumps(vars(args)))

    # Honour --output-dir; derive a directory name only when it was not given.
    # (Previously the option was parsed but never read.)
    output_dir = args.output_dir
    if output_dir == "":
        output_dir = args.data_set+"-"+os.path.split(args.load_model)[-1]+"-vis_all"
    print("output_dir:", output_dir)

    if args.data_set == "kaggle":
        # 1. Criteo Kaggle Display Advertisement Challenge Dataset (see ./bench/dlrm_s_criteo_kaggle.sh)
        m_spa=16
        ln_emb=np.array([1460,583,10131227,2202608,305,24,12517,633,3,93145,5683,8351593,3194,27,14992,5461306,10,5652,2173,4,7046547,18,15,286181,105,142572])
        ln_bot=np.array([13,512,256,64,16])
        ln_top=np.array([367,512,256,1])

    # the option is --data-set, so the parsed attribute is data_set (not dataset)
    elif args.data_set == "terabyte":

        if args.max_ind_range == 10000000:
            # 2. Criteo Terabyte (see ./bench/dlrm_s_criteo_terabyte.sh [--sub-sample=0.875] --max-in-range=10000000)
            m_spa=64
            ln_emb=np.array([9980333,36084,17217,7378,20134,3,7112,1442,61, 9758201,1333352,313829,10,2208,11156,122,4,970,14, 9994222, 7267859, 9946608,415421,12420,101, 36])
            ln_bot=np.array([13,512,256,64])
            ln_top=np.array([415,512,512,256,1])
        elif args.max_ind_range == 40000000:
            # 3. Criteo Terabyte MLPerf training (see ./bench/run_and_time.sh --max-in-range=40000000)
            m_spa=128
            ln_emb=np.array([39884406,39043,17289,7420,20263,3,7120,1543,63,38532951,2953546,403346,10,2208,11938,155,4,976,14,39979771,25641295,39664984,585935,12972,108,36])
            ln_bot=np.array([13,512,256,128])
            ln_top=np.array([479,1024,1024,512,256,1])
        else:
            raise ValueError("only --max-in-range 10M or 40M is supported")
    else:
        raise ValueError("only kaggle|terabyte dataset options are supported")

    # check input parameters
    if args.data_randomize != "none" and args.skip_categorical_analysis is not True:
        print("Incorrect option for categoricat analysis, use: --data-randomize=none")
        sys.exit(-1)

    # The analysis below needs the data loaders; without an input file the
    # script used to die later with a NameError on train_ld/test_ld.
    if args.raw_data_file == "" and args.processed_data_file == "":
        print("No input data given, use: --raw-data-file or --processed-data-file")
        sys.exit(-1)

    dlrm = DLRM_Net(
        m_spa,
        ln_emb,
        ln_bot,
        ln_top,
        arch_interaction_op="dot",
        arch_interaction_itself=False,
        sigmoid_bot=-1,
        sigmoid_top=ln_top.size - 2,
        sync_dense_params=True,
        loss_threshold=0.0,
        ndevices=-1,
        qr_flag=False,
        qr_operation=None,
        qr_collisions=None,
        qr_threshold=None,
        md_flag=False,
        md_threshold=None,
    )

    # Load model if specified
    if args.load_model != "":
        print("Loading saved model {}".format(args.load_model))

        # map to CPU so a checkpoint saved on GPU can be analysed anywhere
        ld_model = torch.load(args.load_model, map_location=torch.device("cpu"))
        dlrm.load_state_dict(ld_model["state_dict"])

        print("Model loaded", args.load_model)

    # list the layers of the top MLP
    z_size = len(dlrm.top_l)
    for i in range(0, z_size):
        print("z", i, dlrm.top_l[i])

    # load data (compare strings by value above, never with `is`)
    train_data, train_ld, test_data, test_ld = dp.make_criteo_data_and_loaders(args)

    analyze_model_data(output_dir = output_dir,
                       dlrm = dlrm,
                       train_ld = train_ld,
                       test_ld = test_ld,
                       train_data = train_data,
                       skip_embedding = args.skip_embedding,
                       use_tsne = args.use_tsne,
                       max_umap_size = args.max_umap_size,
                       max_tsne_size = args.max_tsne_size,
                       skip_categorical_analysis = args.skip_categorical_analysis,
                       skip_data_plots = args.skip_data_plots,
                       umap_metric = args.umap_metric)
|_Emb_|__|___| +# input: +# [ dense features ] [sparse indices] , ..., [sparse indices] +# +# More precise definition of model layers: +# 1) fully connected layers of an mlp +# z = f(y) +# y = Wx + b +# +# 2) embedding lookup (for a list of sparse indices p=[p1,...,pk]) +# z = Op(e1,...,ek) +# obtain vectors e1=E[:,p1], ..., ek=E[:,pk] +# +# 3) Operator Op can be one of the following +# Sum(e1,...,ek) = e1 + ... + ek +# Dot(e1,...,ek) = [e1'e1, ..., e1'ek, ..., ek'e1, ..., ek'ek] +# Cat(e1,...,ek) = [e1', ..., ek']' +# where ' denotes transpose operation +# +# References: +# [1] Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, +# Narayanan Sundaram, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, +# Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, +# Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, +# Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, +# Misha Smelyanskiy, "Deep Learning Recommendation Model for Personalization and +# Recommendation Systems", CoRR, arXiv:1906.00091, 2019 + +from __future__ import absolute_import, division, print_function, unicode_literals + +# miscellaneous +import builtins +import functools +# import bisect +# import shutil +import time +import json +# data generation +import dlrm_data_pytorch as dp + +# numpy +import numpy as np +import socket + +# onnx +# The onnx import causes deprecation warnings every time workers +# are spawned during testing. So, we filter out those warnings. 
class LRPolicyScheduler(_LRScheduler):
    """Learning-rate policy: linear warmup, hold, then quadratic decay.

    With ``s`` the scheduler's internal step count:

    * ``s < num_warmup_steps``: each base lr is scaled by
      ``1 - (num_warmup_steps - s) / num_warmup_steps``.
    * ``decay_start_step <= s < decay_start_step + num_decay_steps``: each
      base lr is scaled by ``((num_decay_steps - d) / num_decay_steps) ** 2``
      with ``d = s - decay_start_step``, floored at ``1e-7``.
    * otherwise: if ``num_decay_steps > 0`` the most recently computed lr is
      kept (base lr when nothing was computed yet); else the base lr is used.

    Args:
        optimizer: wrapped optimizer, passed to the base scheduler.
        num_warmup_steps: number of warmup steps.
        decay_start_step: step at which the decay begins; must not be
            smaller than ``num_warmup_steps``.
        num_decay_steps: length of the decay phase in steps.
    """

    def __init__(self, optimizer, num_warmup_steps, decay_start_step, num_decay_steps):
        self.num_warmup_steps = num_warmup_steps
        self.decay_start_step = decay_start_step
        self.decay_end_step = decay_start_step + num_decay_steps
        self.num_decay_steps = num_decay_steps

        if self.decay_start_step < self.num_warmup_steps:
            # local import: this module only imports sys under __main__, so
            # the name would be undefined when the class is used via import
            import sys
            sys.exit("Learning rate warmup must finish before the decay starts")

        # the base constructor performs an initial step, which calls get_lr()
        super(LRPolicyScheduler, self).__init__(optimizer)

    def get_lr(self):
        """Return the list of learning rates for the current step count."""
        step_count = self._step_count
        if step_count < self.num_warmup_steps:
            # warmup
            scale = 1.0 - (self.num_warmup_steps - step_count) / self.num_warmup_steps
            lr = [base_lr * scale for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.decay_start_step <= step_count < self.decay_end_step:
            # decay
            decayed_steps = step_count - self.decay_start_step
            scale = ((self.num_decay_steps - decayed_steps) / self.num_decay_steps) ** 2
            min_lr = 0.0000001
            lr = [max(min_lr, base_lr * scale) for base_lr in self.base_lrs]
            self.last_lr = lr
        elif self.num_decay_steps > 0:
            # freeze at last, either because we're after decay or because
            # we're between warmup and decay. When no lr was computed yet
            # (e.g. zero warmup steps and a later decay start) fall back to
            # the base lr instead of failing on the missing attribute.
            lr = getattr(self, "last_lr", self.base_lrs)
        else:
            # do not adjust
            lr = self.base_lrs
        return lr
range(0, ln.size): + if ext_dist.my_size > 1: + if not i in self.local_emb_indices: continue + # Use per table random seed for Embedding initialization + np.random.seed(self.l_emb_seeds[i]) + n = ln[i] + # construct embedding operator + if self.qr_flag and n > self.qr_threshold: + EE = QREmbeddingBag(n, m, self.qr_collisions, + operation=self.qr_operation, mode="sum", sparse=True) + elif self.md_flag: + base = max(m) + _m = m[i] if n > self.md_threshold else base + EE = PrEmbeddingBag(n, _m, base) + # use np initialization as below for consistency... + W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, _m) + ).astype(np.float32) + EE.embs.weight.data = torch.tensor(W, requires_grad=True) + + else: + #_weight = torch.empty([n, m]).uniform_(-np.sqrt(1 / n), np.sqrt(1 / n)) + #EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight= _weight) + #EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True) + + # initialize embeddings + # nn.init.uniform_(EE.weight, a=-np.sqrt(1 / n), b=np.sqrt(1 / n)) + W = np.random.uniform( + low=-np.sqrt(1 / n), high=np.sqrt(1 / n), size=(n, m) + ).astype(np.float32) + # approach 1 + EE = nn.EmbeddingBag(n, m, mode="sum", sparse=True, _weight=torch.tensor(W, requires_grad=True)) + #EE.weight.data = torch.tensor(W, requires_grad=True) + # approach 2 + # EE.weight.data.copy_(torch.tensor(W)) + # approach 3 + # EE.weight = Parameter(torch.tensor(W),requires_grad=True) + + if ext_dist.my_size > 1: + if i in self.local_emb_indices: + emb_l.append(EE) + else: + emb_l.append(EE) + + # Restore the numpy random state + np.random.set_state(np_rand_state) + return emb_l + + def __init__( + self, + m_spa=None, + ln_emb=None, + ln_bot=None, + ln_top=None, + proj_size = 0, + arch_interaction_op=None, + arch_interaction_itself=False, + sigmoid_bot=-1, + sigmoid_top=-1, + sync_dense_params=True, + loss_threshold=0.0, + ndevices=-1, + qr_flag=False, + qr_operation="mult", + qr_collisions=0, + qr_threshold=200, + md_flag=False, + 
md_threshold=200, + ): + super(DLRM_Net, self).__init__() + + if ( + (m_spa is not None) + and (ln_emb is not None) + and (ln_bot is not None) + and (ln_top is not None) + and (arch_interaction_op is not None) + ): + + # save arguments + self.proj_size = proj_size + self.ndevices = ndevices + self.output_d = 0 + self.parallel_model_batch_size = -1 + self.parallel_model_is_not_prepared = True + self.arch_interaction_op = arch_interaction_op + self.arch_interaction_itself = arch_interaction_itself + self.sync_dense_params = sync_dense_params + self.loss_threshold = loss_threshold + # create variables for QR embedding if applicable + self.qr_flag = qr_flag + if self.qr_flag: + self.qr_collisions = qr_collisions + self.qr_operation = qr_operation + self.qr_threshold = qr_threshold + # create variables for MD embedding if applicable + self.md_flag = md_flag + if self.md_flag: + self.md_threshold = md_threshold + + # generate np seeds for Emb table initialization + self.l_emb_seeds = np.random.randint(low=0, high=100000, size=len(ln_emb)) + + #If running distributed, get local slice of embedding tables + if ext_dist.my_size > 1: + n_emb = len(ln_emb) + self.n_global_emb = n_emb + self.n_local_emb, self.n_emb_per_rank = ext_dist.get_split_lengths(n_emb) + self.local_emb_slice = ext_dist.get_my_slice(n_emb) + self.local_emb_indices = list(range(n_emb))[self.local_emb_slice] + #ln_emb = ln_emb[self.local_emb_slice] + + # create operators + if ndevices <= 1: + self.emb_l = self.create_emb(m_spa, ln_emb) + self.bot_l = self.create_mlp(ln_bot, sigmoid_bot) + self.top_l = self.create_mlp(ln_top, sigmoid_top) + if (proj_size > 0): + self.proj_l = project.create_proj(len(ln_emb)+1, proj_size) + + def apply_mlp(self, x, layers): + # approach 1: use ModuleList + # for layer in layers: + # x = layer(x) + # return x + # approach 2: use Sequential container to wrap all layers + return layers(x) + + def apply_proj(self, x, layers): + # approach 1: use ModuleList + # for layer in 
def apply_emb(self, lS_o, lS_i, emb_l):
    """Perform one batched embedding lookup per table.

    The k-th entry of ``lS_i`` (indices) and of ``lS_o`` (offsets) is passed
    to the k-th operator in ``emb_l`` as ``emb_l[k](indices, offsets)``.

    Returns:
        list with one lookup result per entry of ``lS_i``, in order.
    """
    # NOTE: the whole batch is processed at once; the layout is assumed to be
    # one batched group of lookups per table, in table order.
    return [
        emb_l[table](index_batch, lS_o[table])
        for table, index_batch in enumerate(lS_i)
    ]
def forward(self, dense_x, lS_o, lS_i):
    """Route the call to the distributed, sequential or parallel forward pass.

    The distributed path is taken when ``ext_dist.my_size > 1``; otherwise
    the sequential path when ``self.ndevices <= 1``, else the parallel path.
    """
    if ext_dist.my_size > 1:
        run = self.distributed_forward
    elif self.ndevices <= 1:
        run = self.sequential_forward
    else:
        run = self.parallel_forward
    return run(dense_x, lS_o, lS_i)


def sequential_forward(self, dense_x, lS_o, lS_i):
    """Single-device forward pass.

    Applies the bottom mlp to the dense input, looks up the embeddings,
    interacts both, applies the top mlp and, when ``self.loss_threshold`` lies
    strictly between 0 and 1, clamps the result to
    ``[loss_threshold, 1 - loss_threshold]``.
    """
    # dense features through the bottom mlp
    dense_out = self.apply_mlp(dense_x, self.bot_l)
    # sparse features through the embedding tables
    emb_out = self.apply_emb(lS_o, lS_i, self.emb_l)
    # interaction of dense and sparse features
    interacted = self.interact_features(dense_out, emb_out)
    # top mlp produces the prediction
    pred = self.apply_mlp(interacted, self.top_l)

    threshold = self.loss_threshold
    if not (0.0 < threshold < 1.0):
        # no clamping requested
        return pred
    return torch.clamp(pred, min=threshold, max=(1.0 - threshold))
across %d ranks evenly" % (batch_size, ext_dist.my_size)) + + ## already handled in input the data + ##dense_x = dense_x[ext_dist.get_my_slice(batch_size)] + ##lS_o = lS_o[self.local_emb_slice] + ##lS_i = lS_i[self.local_emb_slice] + + if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)): + sys.exit("ERROR: corrupted model input detected in distributed_forward call") + + # embeddings + tm.tmEmb.start() + ly = self.apply_emb(lS_o, lS_i, self.emb_l) + tm.tmEmb.stop() + + # print("ly: ", ly) + # debug prints + # print(ly) + + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each rank. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each rank. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if len(self.emb_l) != len(ly): + sys.exit("ERROR: corrupted intermediate result in distributed_forward call") + + tm.tmA2A.start() + a2a_req = ext_dist.alltoall(ly, self.n_emb_per_rank) + tm.tmA2A.stop() + + tm.tmBot.start() + x = self.apply_mlp(dense_x, self.bot_l) + tm.tmBot.stop() + + # debug prints + # print(x) + + tm.tmA2A1.start() + ly = a2a_req.wait() + tm.tmA2A1.stop() + # print("ly: ", ly) + ly = list(ly) + + # interactions + tm.tmInt.start() + z = self.interact_features(x, ly) + tm.tmInt.stop() + # debug prints + # print(z) + + # top mlp + tm.tmTop.start() + p = self.apply_mlp(z, self.top_l) + tm.tmTop.stop() + + # clamp output if needed + if 0.0 < self.loss_threshold and self.loss_threshold < 1.0: + z = torch.clamp( + p, min=self.loss_threshold, max=(1.0 - self.loss_threshold) + ) + else: + z = p + + ### gather the distributed results on each rank ### + # For some reason it requires explicit sync before all_gather call if + # tensor is on GPU memory + tm.tmAllGa.start() + if z.is_cuda: torch.cuda.synchronize() + (_, batch_split_lengths) = 
ext_dist.get_split_lengths(batch_size * ext_dist.my_size) + z = ext_dist.all_gather(z, batch_split_lengths) + tm.tmAllGa.stop() + #print("Z: %s" % z) + + return z + + def parallel_forward(self, dense_x, lS_o, lS_i): + ### prepare model (overwrite) ### + # WARNING: # of devices must be >= batch size in parallel_forward call + batch_size = dense_x.size()[0] + ndevices = min(self.ndevices, batch_size, len(self.emb_l)) + device_ids = range(ndevices) + # WARNING: must redistribute the model if mini-batch size changes(this is common + # for last mini-batch, when # of elements in the dataset/batch size is not even + if self.parallel_model_batch_size != batch_size: + self.parallel_model_is_not_prepared = True + + if self.parallel_model_is_not_prepared or self.sync_dense_params: + # replicate mlp (data parallelism) + self.bot_l_replicas = replicate(self.bot_l, device_ids) + self.top_l_replicas = replicate(self.top_l, device_ids) + self.parallel_model_batch_size = batch_size + + if self.parallel_model_is_not_prepared: + # distribute embeddings (model parallelism) + t_list = [] + for k, emb in enumerate(self.emb_l): + d = torch.device("cuda:" + str(k % ndevices)) + emb.to(d) + t_list.append(emb.to(d)) + self.emb_l = nn.ModuleList(t_list) + self.parallel_model_is_not_prepared = False + + ### prepare input (overwrite) ### + # scatter dense features (data parallelism) + # print(dense_x.device) + dense_x = scatter(dense_x, device_ids, dim=0) + # distribute sparse features (model parallelism) + if (len(self.emb_l) != len(lS_o)) or (len(self.emb_l) != len(lS_i)): + sys.exit("ERROR: corrupted model input detected in parallel_forward call") + + t_list = [] + i_list = [] + for k, _ in enumerate(self.emb_l): + d = torch.device("cuda:" + str(k % ndevices)) + t_list.append(lS_o[k].to(d)) + i_list.append(lS_i[k].to(d)) + lS_o = t_list + lS_i = i_list + + ### compute results in parallel ### + # bottom mlp + # WARNING: Note that the self.bot_l is a list of bottom mlp modules + # that have 
been replicated across devices, while dense_x is a tuple of dense + # inputs that has been scattered across devices on the first (batch) dimension. + # The output is a list of tensors scattered across devices according to the + # distribution of dense_x. + x = parallel_apply(self.bot_l_replicas, dense_x, None, device_ids) + # debug prints + # print(x) + + # embeddings + ly = self.apply_emb(lS_o, lS_i, self.emb_l) + # debug prints + # print(ly) + + # butterfly shuffle (implemented inefficiently for now) + # WARNING: Note that at this point we have the result of the embedding lookup + # for the entire batch on each device. We would like to obtain partial results + # corresponding to all embedding lookups, but part of the batch on each device. + # Therefore, matching the distribution of output of bottom mlp, so that both + # could be used for subsequent interactions on each device. + if len(self.emb_l) != len(ly): + sys.exit("ERROR: corrupted intermediate result in parallel_forward call") + + t_list = [] + for k, _ in enumerate(self.emb_l): + d = torch.device("cuda:" + str(k % ndevices)) + y = scatter(ly[k], device_ids, dim=0) + t_list.append(y) + # adjust the list to be ordered per device + ly = list(map(lambda y: list(y), zip(*t_list))) + # debug prints + # print(ly) + + # interactions + z = [] + for k in range(ndevices): + zk = self.interact_features(x[k], ly[k]) + z.append(zk) + # debug prints + # print(z) + + # top mlp + # WARNING: Note that the self.top_l is a list of top mlp modules that + # have been replicated across devices, while z is a list of interaction results + # that by construction are scattered across devices on the first (batch) dim. + # The output is a list of tensors scattered across devices according to the + # distribution of z. 
def _validate_dash_separated(value, cast, kind):
    """Check that every '-'-separated token of *value* is accepted by *cast*.

    Returns *value* unchanged; raises ``argparse.ArgumentTypeError`` naming
    *kind* when a token makes *cast* raise ``ValueError``.
    """
    # local import: this module only imports argparse under __main__, so the
    # name would be undefined when the validators are used via import
    import argparse

    for token in value.split("-"):
        try:
            cast(token)
        except ValueError:
            raise argparse.ArgumentTypeError(
                "%s is not a valid dash separated list of %s" % (value, kind)
            )
    return value


def dash_separated_ints(value):
    """argparse ``type=`` validator for strings such as ``"4-3-2"``.

    Returns the original string (not a parsed list). Because ``-`` is the
    separator, a leading minus sign yields an empty token and is rejected.

    Raises:
        argparse.ArgumentTypeError: if any token is not an int literal.
    """
    return _validate_dash_separated(value, int, "ints")


def dash_separated_floats(value):
    """argparse ``type=`` validator for strings such as ``"1.0-2.5"``.

    Returns the original string (not a parsed list). Because ``-`` is the
    separator, a leading minus sign yields an empty token and is rejected.

    Raises:
        argparse.ArgumentTypeError: if any token is not a float literal.
    """
    return _validate_dash_separated(value, float, "floats")
action="store_true", default=False) + parser.add_argument("--qr-flag", action="store_true", default=False) + parser.add_argument("--qr-threshold", type=int, default=200) + parser.add_argument("--qr-operation", type=str, default="mult") + parser.add_argument("--qr-collisions", type=int, default=4) + # activations and loss + parser.add_argument("--activation-function", type=str, default="relu") + parser.add_argument("--loss-function", type=str, default="mse") # or bce or wbce + parser.add_argument( + "--loss-weights", type=dash_separated_floats, default="1.0-1.0") # for wbce + parser.add_argument("--loss-threshold", type=float, default=0.0) # 1.0e-7 + parser.add_argument("--round-targets", type=bool, default=False) + # data + parser.add_argument("--data-size", type=int, default=1) + parser.add_argument("--num-batches", type=int, default=0) + parser.add_argument( + "--data-generation", type=str, default="random" + ) # synthetic or dataset + parser.add_argument("--synthetic-data-folder", type=str, + default="./synthetic_data/syn_data_bs65536") + # add Gaussian distribution + parser.add_argument("--rand-data-dist", type=str, default="uniform") # uniform or gaussian + parser.add_argument("--rand-data-min", type=float, default=0) + parser.add_argument("--rand-data-max", type=float, default=1) + parser.add_argument("--rand-data-mu", type=float, default=-1) + parser.add_argument("--rand-data-sigma", type=float, default=1) + + parser.add_argument("--data-trace-file", type=str, default="./input/dist_emb_j.log") + parser.add_argument("--data-set", type=str, default="kaggle") # or terabyte + parser.add_argument("--raw-data-file", type=str, default="") + parser.add_argument("--processed-data-file", type=str, default="") + parser.add_argument("--data-randomize", type=str, default="total") # or day or none + parser.add_argument("--data-trace-enable-padding", type=bool, default=False) + parser.add_argument("--max-ind-range", type=int, default=-1) + 
parser.add_argument("--data-sub-sample-rate", type=float, default=0.0) # in [0, 1] + parser.add_argument("--num-indices-per-lookup", type=int, default=10) + parser.add_argument("--num-indices-per-lookup-fixed", type=bool, default=False) + parser.add_argument("--num-workers", type=int, default=0) + parser.add_argument("--memory-map", action="store_true", default=False) + # training + parser.add_argument("--mini-batch-size", type=int, default=1) + parser.add_argument("--nepochs", type=int, default=1) + parser.add_argument("--learning-rate", type=float, default=0.01) + parser.add_argument("--print-precision", type=int, default=5) + parser.add_argument("--numpy-rand-seed", type=int, default=123) + parser.add_argument("--sync-dense-params", type=bool, default=True) + # inference + parser.add_argument("--inference-only", action="store_true", default=False) + # onnx + parser.add_argument("--save-onnx", action="store_true", default=False) + # gpu + parser.add_argument("--use-gpu", action="store_true", default=False) + # distributed run + parser.add_argument("--dist-backend", type=str, default="") + # debugging and profiling + parser.add_argument("--print-freq", type=int, default=1) + parser.add_argument("--test-freq", type=int, default=-1) + parser.add_argument("--test-mini-batch-size", type=int, default=-1) + parser.add_argument("--test-num-workers", type=int, default=-1) + parser.add_argument("--print-time", action="store_true", default=False) + parser.add_argument("--debug-mode", action="store_true", default=False) + parser.add_argument("--enable-profiling", action="store_true", default=False) + parser.add_argument("--plot-compute-graph", action="store_true", default=False) + # store/load model + parser.add_argument("--out-dir", type=str, default=".") + parser.add_argument("--save-model", type=str, default="") + parser.add_argument("--load-model", type=str, default="") + # mlperf logging (disables other output and stops early) + parser.add_argument("--mlperf-logging", 
action="store_true", default=False) + # stop at target accuracy Kaggle 0.789, Terabyte (sub-sampled=0.875) 0.8107 + parser.add_argument("--mlperf-acc-threshold", type=float, default=0.0) + # stop at target AUC Terabyte (no subsampling) 0.8025 + parser.add_argument("--mlperf-auc-threshold", type=float, default=0.0) + parser.add_argument("--mlperf-bin-loader", action='store_true', default=False) + parser.add_argument("--mlperf-bin-shuffle", action='store_true', default=False) + + # LR policy + parser.add_argument("--lr-num-warmup-steps", type=int, default=0) + parser.add_argument("--lr-decay-start-step", type=int, default=0) + parser.add_argument("--lr-num-decay-steps", type=int, default=0) + + args = parser.parse_args() + + print(socket.gethostname()) + + ext_dist.init_distributed(backend=args.dist_backend) + + # print("success size= ", ext_dist.my_size, ext_dist.my_rank) + + ext_dist.barrier() + + if args.mlperf_logging: + print('command line args: ', json.dumps(vars(args))) + + ### some basic setup ### + np.random.seed(args.numpy_rand_seed) + np.set_printoptions(precision=args.print_precision) + torch.set_printoptions(precision=args.print_precision) + torch.manual_seed(args.numpy_rand_seed) + + if (args.test_mini_batch_size < 0): + # if the parameter is not set, use the training batch size + args.test_mini_batch_size = args.mini_batch_size + if (args.test_num_workers < 0): + # if the parameter is not set, use the same parameter for training + args.test_num_workers = args.num_workers + if args.mini_batch_size % ext_dist.my_size !=0 or args.test_mini_batch_size % ext_dist.my_size != 0: + print("Either test minibatch (%d) or train minibatch (%d) does not split across %d ranks" % (args.test_mini_batch_size, args.mini_batch_size, ext_dist.my_size)) + sys.exit(1) + + use_gpu = args.use_gpu and torch.cuda.is_available() + if use_gpu: + torch.cuda.manual_seed_all(args.numpy_rand_seed) + torch.backends.cudnn.deterministic = True + if ext_dist.my_size > 1: + ngpus = 
torch.cuda.device_count() # 1 + if ext_dist.my_local_size > torch.cuda.device_count(): + print("Not sufficient GPUs available... local_size = %d, ngpus = %d" % (ext_dist.my_local_size, ngpus)) + sys.exit(1) + ngpus = 1 + device = torch.device("cuda", ext_dist.my_local_rank) + else: + device = torch.device("cuda", 0) + ngpus = torch.cuda.device_count() # 1 + ngpus=1 + print("Using {} GPU(s)...".format(ngpus)) + else: + device = torch.device("cpu") + print("Using CPU...") + + ### prepare training data ### + ln_bot = np.fromstring(args.arch_mlp_bot, dtype=int, sep="-") + # input data + if (args.data_generation == "dataset"): + + train_data, train_ld, test_data, test_ld = \ + dp.make_criteo_data_and_loaders(args) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + nbatches_test = len(test_ld) + + ln_emb = train_data.counts + # enforce maximum limit on number of vectors per embedding + if args.max_ind_range > 0: + ln_emb = np.array(list(map( + lambda x: x if x < args.max_ind_range else args.max_ind_range, + ln_emb + ))) + m_den = train_data.m_den + ln_bot[0] = m_den + + elif args.data_generation == "synthetic": + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld = dd.data_loader(args, ln_emb, m_den) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + table_feature_map = None # {idx : idx for idx in range(len(ln_emb))} + + else: + # input and target at random + ln_emb = np.fromstring(args.arch_embedding_size, dtype=int, sep="-") + m_den = ln_bot[0] + train_data, train_ld = dd.make_random_data_and_loader(args, ln_emb, m_den) + nbatches = args.num_batches if args.num_batches > 0 else len(train_ld) + + ### parse command line arguments ### + m_spa = args.arch_sparse_feature_size + num_fea = ln_emb.size + 1 # num sparse + num dense features + m_den_out = ln_bot[ln_bot.size - 1] + if args.arch_interaction_op == "dot": + # approach 1: all + 
# num_int = num_fea * num_fea + m_den_out + # approach 2: unique + if (args.arch_project_size > 0): + num_int = num_fea * args.arch_project_size + m_den_out + else: + if args.arch_interaction_itself: + num_int = (num_fea * (num_fea + 1)) // 2 + m_den_out + else: + num_int = (num_fea * (num_fea - 1)) // 2 + m_den_out + elif args.arch_interaction_op == "cat": + num_int = num_fea * m_den_out + else: + sys.exit( + "ERROR: --arch-interaction-op=" + + args.arch_interaction_op + + " is not supported" + ) + arch_mlp_top_adjusted = str(num_int) + "-" + args.arch_mlp_top + ln_top = np.fromstring(arch_mlp_top_adjusted, dtype=int, sep="-") + + # sanity check: feature sizes and mlp dimensions must match + if m_den != ln_bot[0]: + sys.exit( + "ERROR: arch-dense-feature-size " + + str(m_den) + + " does not match first dim of bottom mlp " + + str(ln_bot[0]) + ) + if args.qr_flag: + if args.qr_operation == "concat" and 2 * m_spa != m_den_out: + sys.exit( + "ERROR: 2 arch-sparse-feature-size " + + str(2 * m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + + " (note that the last dim of bottom mlp must be 2x the embedding dim)" + ) + if args.qr_operation != "concat" and m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + else: + if m_spa != m_den_out: + sys.exit( + "ERROR: arch-sparse-feature-size " + + str(m_spa) + + " does not match last dim of bottom mlp " + + str(m_den_out) + ) + if num_int != ln_top[0]: + sys.exit( + "ERROR: # of feature interactions " + + str(num_int) + + " does not match first dimension of top mlp " + + str(ln_top[0]) + ) + + # assign mixed dimensions if applicable + if args.md_flag: + m_spa = md_solver( + torch.tensor(ln_emb), + args.md_temperature, # alpha + d0=m_spa, + round_dim=args.md_round_dims + ).tolist() + + # test prints (model arch) + if args.debug_mode: + print("model arch:") + print( + "mlp top arch " + + str(ln_top.size - 
1) + + " layers, with input to output dimensions:" + ) + print(ln_top) + print("# of interactions") + print(num_int) + print( + "mlp bot arch " + + str(ln_bot.size - 1) + + " layers, with input to output dimensions:" + ) + print(ln_bot) + print("# of features (sparse and dense)") + print(num_fea) + print("dense feature size") + print(m_den) + print("sparse feature size") + print(m_spa) + print( + "# of embeddings (= # of sparse features) " + + str(ln_emb.size) + + ", with dimensions " + + str(m_spa) + + "x:" + ) + print(ln_emb) + + print("data (inputs and targets):") + for j, (X, lS_o, lS_i, T) in enumerate(train_ld): + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + + print("mini-batch: %d" % j) + print(X.detach().cpu().numpy()) + # transform offsets to lengths when printing + print( + [ + np.diff( + S_o.detach().cpu().tolist() + list(lS_i[i].shape) + ).tolist() + for i, S_o in enumerate(lS_o) + ] + ) + print([S_i.detach().cpu().tolist() for S_i in lS_i]) + print(T.detach().cpu().numpy()) + + ndevices = min(ngpus, args.mini_batch_size, num_fea - 1) if use_gpu else -1 + + ### construct the neural network specified above ### + # WARNING: to obtain exactly the same initialization for + # the weights we need to start from the same random seed. 
+ # np.random.seed(args.numpy_rand_seed) + dlrm = DLRM_Net( + m_spa, + ln_emb, + ln_bot, + ln_top, + args.arch_project_size, + arch_interaction_op=args.arch_interaction_op, + arch_interaction_itself=args.arch_interaction_itself, + sigmoid_bot=-1, + sigmoid_top=ln_top.size - 2, + sync_dense_params=args.sync_dense_params, + loss_threshold=args.loss_threshold, + ndevices=ndevices, + qr_flag=args.qr_flag, + qr_operation=args.qr_operation, + qr_collisions=args.qr_collisions, + qr_threshold=args.qr_threshold, + md_flag=args.md_flag, + md_threshold=args.md_threshold, + ) + # test prints + if args.debug_mode: + print("initial parameters (weights and bias):") + for param in dlrm.parameters(): + print(param.detach().cpu().numpy()) + # print(dlrm) + + if use_gpu: + # Custom Model-Data Parallel + # the mlps are replicated and use data parallelism, while + # the embeddings are distributed and use model parallelism + dlrm = dlrm.to(device) # .cuda() + if dlrm.ndevices > 1: + dlrm.emb_l = dlrm.create_emb(m_spa, ln_emb) + + if ext_dist.my_size > 1: + if use_gpu: + device_ids = [ext_dist.my_local_rank] + dlrm.bot_l = DDP(dlrm.bot_l, device_ids=device_ids) + dlrm.top_l = DDP(dlrm.top_l, device_ids=device_ids) + else: + dlrm.bot_l = DDP(dlrm.bot_l) + dlrm.top_l = DDP(dlrm.top_l) + + # specify the loss function + if args.loss_function == "mse": + loss_fn = torch.nn.MSELoss(reduction="mean") + elif args.loss_function == "bce": + loss_fn = torch.nn.BCELoss(reduction="mean") + elif args.loss_function == "wbce": + loss_ws = torch.tensor(np.fromstring(args.loss_weights, dtype=float, sep="-")) + loss_fn = torch.nn.BCELoss(reduction="none") + else: + sys.exit("ERROR: --loss-function=" + args.loss_function + " is not supported") + + if not args.inference_only: + # specify the optimizer algorithm + + if ext_dist.my_size == 1: + optimizer = torch.optim.SGD(dlrm.parameters(), lr=args.learning_rate) + #lr_scheduler = LRPolicyScheduler(optimizer, args.lr_num_warmup_steps, args.lr_decay_start_step, 
+ # args.lr_num_decay_steps) + else: + optimizer = torch.optim.SGD([ + {"params": [p for emb in dlrm.emb_l for p in emb.parameters()], "lr" : args.learning_rate}, + {"params": dlrm.bot_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size}, + {"params": dlrm.top_l.parameters(), "lr" : args.learning_rate * ext_dist.my_size} + ], lr=args.learning_rate) + + ### main loop ### + def time_wrap(use_gpu): + if use_gpu: + torch.cuda.synchronize() + return time.time() + + def dlrm_wrap(X, lS_o, lS_i, use_gpu, device): + if use_gpu: # .cuda() + # lS_i can be either a list of tensors or a stacked tensor. + # Handle each case below: + tm.tmH2D.start() + #lS_i = [S_i.to(device) for S_i in lS_i] if isinstance(lS_i, list) \ + # else lS_i.to(device) + #lS_o = [S_o.to(device) for S_o in lS_o] if isinstance(lS_o, list) \ + # else lS_o.to(device) + #X = X.to(device) + tm.tmH2D.stop() + + return dlrm( + X, + lS_o, + lS_i + ) + else: + return dlrm(X, lS_o, lS_i) + + def loss_fn_wrap(Z, T, use_gpu, device): + if args.loss_function == "mse" or args.loss_function == "bce": + if use_gpu: + # return loss_fn(Z, T.to(device)) + return loss_fn(Z, T) + else: + return loss_fn(Z, T) + elif args.loss_function == "wbce": + if use_gpu: + loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T).to(device) + loss_fn_ = loss_fn(Z, T.to(device)) + else: + loss_ws_ = loss_ws[T.data.view(-1).long()].view_as(T) + loss_fn_ = loss_fn(Z, T.to(device)) + loss_sc_ = loss_ws_ * loss_fn_ + # debug prints + # print(loss_ws_) + # print(loss_fn_) + return loss_sc_.mean() + + # training or inference + best_gA_test = 0 + best_auc_test = 0 + skip_upto_epoch = 0 + skip_upto_batch = 0 + total_time = 0 + total_loss = 0 + total_accu = 0 + total_iter = 0 + total_samp = 0 + k = 0 + + # Load model is specified + if not (args.load_model == ""): + print("Loading saved model {}".format(args.load_model)) + if use_gpu: + if dlrm.ndevices > 1: + # NOTE: when targeting inference on multiple GPUs, + # load the model as is on CPU 
or GPU, with the move + # to multiple GPUs to be done in parallel_forward + ld_model = torch.load(args.load_model) + else: + # NOTE: when targeting inference on single GPU, + # note that the call to .to(device) has already happened + ld_model = torch.load( + args.load_model, + map_location=torch.device('cuda') + # map_location=lambda storage, loc: storage.cuda(0) + ) + else: + # when targeting inference on CPU + ld_model = torch.load(args.load_model, map_location=torch.device('cpu')) + dlrm.load_state_dict(ld_model["state_dict"]) + ld_j = ld_model["iter"] + ld_k = ld_model["epoch"] + ld_nepochs = ld_model["nepochs"] + ld_nbatches = ld_model["nbatches"] + ld_nbatches_test = ld_model["nbatches_test"] + ld_gA = ld_model["train_acc"] + ld_gL = ld_model["train_loss"] + ld_total_loss = ld_model["total_loss"] + ld_total_accu = ld_model["total_accu"] + ld_gA_test = ld_model["test_acc"] + ld_gL_test = ld_model["test_loss"] + if not args.inference_only: + optimizer.load_state_dict(ld_model["opt_state_dict"]) + best_gA_test = ld_gA_test + total_loss = ld_total_loss + total_accu = ld_total_accu + skip_upto_epoch = ld_k # epochs + skip_upto_batch = ld_j # batches + else: + args.print_freq = ld_nbatches + args.test_freq = 0 + + print( + "Saved at: epoch = {:d}/{:d}, batch = {:d}/{:d}, ntbatch = {:d}".format( + ld_k, ld_nepochs, ld_j, ld_nbatches, ld_nbatches_test + ) + ) + print( + "Training state: loss = {:.6f}, accuracy = {:3.3f} %".format( + ld_gL, ld_gA * 100 + ) + ) + print( + "Testing state: loss = {:.6f}, accuracy = {:3.3f} %".format( + ld_gL_test, ld_gA_test * 100 + ) + ) + + ext_dist.barrier() + startTime = time.time() + startTime0 = startTime + skipped = 0 + + #print("Processing data") + #t1 = time.time() + syndatasetlen = min(65536 // args.mini_batch_size, nbatches) + #myobj = list(enumerate(train_ld)) + #t2 = time.time() + #print("Processing data takes {} seconds with len={} {} {} {}".format(t2-t1, len(myobj), nbatches, args.mini_batch_size, syndatasetlen)) + 
print("time/loss/accuracy (if enabled):") + with torch.autograd.profiler.profile(args.enable_profiling, use_gpu, record_shapes=True) as prof: + # with torch.autograd.profiler.emit_nvtx(): + + while k < args.nepochs: + if k < skip_upto_epoch: + continue + + if use_gpu: + tm.tmSync1.start() + torch.cuda.synchronize() + tm.tmSync1.stop() + accum_time_begin = time.time() + + if args.mlperf_logging: + previous_iteration_time = None + + # for j, (X, lS_o, lS_i, T) in enumerate(train_ld): + for j in range(nbatches): + + if (skipped == 2): + ext_dist.barrier() + startTime = time.time() + ext_dist.orig_print("ORIG TIME: ", startTime, accum_time_begin, startTime - accum_time_begin, " for process ", ext_dist.my_rank) + # torch.cuda.profiler.cudart().cudaProfilerStart() + if use_gpu: + torch.cuda.profiler.start() + tm.tmClear() + skipped = skipped + 1 + + tm.tmGetData.start() + if j==0 and use_gpu: + # X, lS_o, lS_i, T = train_data.__getitem__(j%syndatasetlen) + X, lS_o, lS_i, T = next(enumerate(train_ld) + + print("BB0 X size {} lS_i[0] size {}".format(X.size(), lS_i[0].size())) + mybatch_size = X.size()[0] + if ext_dist.my_size > 1: + X = X[ext_dist.get_my_slice(mybatch_size)] + lS_o = lS_o[dlrm.local_emb_slice] + lS_i = lS_i[dlrm.local_emb_slice] + + lS_i = [S_i.to(device) for S_i in lS_i] if isinstance(lS_i, list) \ + else lS_i.to(device) + lS_o = [S_o.to(device) for S_o in lS_o] if isinstance(lS_o, list) \ + else lS_o.to(device) + X = X.to(device) + T = T.to(device) + print("BBB X size {} lS_i[0] size {}".format(X.size(), lS_i[0].size())) + + tm.tmGetData.stop() + + if j == 0 and args.save_onnx: + (X_onnx, lS_o_onnx, lS_i_onnx) = (X, lS_o, lS_i) + + if j < skip_upto_batch: + continue + + if args.mlperf_logging: + current_time = time_wrap(use_gpu) + if previous_iteration_time: + iteration_time = current_time - previous_iteration_time + else: + iteration_time = 0 + previous_iteration_time = current_time + else: + if use_gpu: + tm.tmSync2.start() + torch.cuda.synchronize() + 
tm.tmSync2.stop() + t1 = time.time() + + # early exit if nbatches was set by the user and has been exceeded + if nbatches > 0 and j >= nbatches: + break + ''' + # debug prints + print("input and targets") + print(X.detach().cpu().numpy()) + print([np.diff(S_o.detach().cpu().tolist() + + list(lS_i[i].shape)).tolist() for i, S_o in enumerate(lS_o)]) + print([S_i.detach().cpu().numpy().tolist() for S_i in lS_i]) + print(T.detach().cpu().numpy()) + ''' + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d with size %d" % (j, X.size(0))) + continue + + + # forward pass + tm.tmFwd.start() + Z = dlrm_wrap(X, lS_o, lS_i, use_gpu, device) + tm.tmFwd.stop() + + # loss + tm.tmLoss.start() + E = loss_fn_wrap(Z, T, use_gpu, device) + ''' + # debug prints + print("output and loss") + print(Z.detach().cpu().numpy()) + print(E.detach().cpu().numpy()) + ''' + # compute loss and accuracy + L = E.detach().cpu().numpy() # numpy array + S = Z.detach().cpu().numpy() # numpy array + T0 = T.detach().cpu().numpy() # numpy array + mbs = T0.shape[0] # = args.mini_batch_size except maybe for last + A = np.sum((np.round(S, 0) == T0).astype(np.uint8)) + tm.tmLoss.stop() + + if not args.inference_only: + # scaled error gradient propagation + # (where we do not accumulate gradients across mini-batches) + tm.tmZero.start() + optimizer.zero_grad() + tm.tmZero.stop() + + # backward pass + tm.tmBwd.start() + E.backward() + tm.tmBwd.stop() + + # debug prints (check gradient norm) + # for l in mlp.layers: + # if hasattr(l, 'weight'): + # print(l.weight.grad.norm().item()) + + # optimizer + tm.tmOpt.start() + optimizer.step() + tm.tmOpt.stop() + + ### lr_scheduler.step() + + if args.mlperf_logging: + total_time += iteration_time + else: + if use_gpu: + tm.tmSync3.start() + torch.cuda.synchronize() + tm.tmSync3.stop() + t2 = time.time() + total_time += t2 - t1 + + total_accu += A + total_loss += 
L * mbs + total_iter += 1 + total_samp += mbs + + should_print = ((j + 1) % args.print_freq == 0) or (j + 1 == nbatches) + should_test = ( + (args.test_freq > 0) + and (args.data_generation == "dataset") + and (((j + 1) % args.test_freq == 0) or (j + 1 == nbatches)) + ) + + # print time, loss and accuracy + if should_print or should_test: + gT = 1000.0 * total_time / total_iter if args.print_time else -1 + total_time = 0 + + gA = total_accu / total_samp + total_accu = 0 + + gL = total_loss / total_samp + total_loss = 0 + + str_run_type = "inference" if args.inference_only else "training" + print( + "Finished {} it {}/{} of epoch {}, {:.2f} ms/it, ".format( + str_run_type, j + 1, nbatches, k, gT + ) + + "loss {:.6f}, accuracy {:3.3f} % it {} for task {} ".format(gL, + gA * 100, total_iter, ext_dist.my_rank) + ) + # Uncomment the line below to print out the total time with overhead + if ext_dist.my_rank < 2: + tt1 = time.time() + ext_dist.orig_print("Accumulated time so far: {} for process {} for step {} at {}" \ + .format(tt1 - accum_time_begin, ext_dist.my_rank, skipped, tt1)) + total_iter = 0 + total_samp = 0 + + # testing + if should_test and not args.inference_only: + # don't measure training iter time in a test iteration + if args.mlperf_logging: + previous_iteration_time = None + + test_accu = 0 + test_loss = 0 + test_samp = 0 + + accum_test_time_begin = time_wrap(use_gpu) + if args.mlperf_logging: + scores = [] + targets = [] + + for i, (X_test, lS_o_test, lS_i_test, T_test) in enumerate(test_ld): + # early exit if nbatches was set by the user and was exceeded + if nbatches > 0 and i >= nbatches: + break + + # Skip the batch if batch size not multiple of total ranks + if ext_dist.my_size > 1 and X_test.size(0) % ext_dist.my_size != 0: + print("Warning: Skiping the batch %d with size %d" % (i, X_test.size(0))) + continue + + t1_test = time_wrap(use_gpu) + + # forward pass + Z_test = dlrm_wrap( + X_test, lS_o_test, lS_i_test, use_gpu, device + ) + if 
args.mlperf_logging: + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + scores.append(S_test) + targets.append(T_test) + else: + # loss + E_test = loss_fn_wrap(Z_test, T_test, use_gpu, device) + + # compute loss and accuracy + L_test = E_test.detach().cpu().numpy() # numpy array + S_test = Z_test.detach().cpu().numpy() # numpy array + T_test = T_test.detach().cpu().numpy() # numpy array + mbs_test = T_test.shape[0] # = mini_batch_size except last + A_test = np.sum((np.round(S_test, 0) == T_test).astype(np.uint8)) + test_accu += A_test + test_loss += L_test * mbs_test + test_samp += mbs_test + + t2_test = time_wrap(use_gpu) + + if args.mlperf_logging: + scores = np.concatenate(scores, axis=0) + targets = np.concatenate(targets, axis=0) + + metrics = { + 'loss' : sklearn.metrics.log_loss, + 'recall' : lambda y_true, y_score: + sklearn.metrics.recall_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'precision' : lambda y_true, y_score: + sklearn.metrics.precision_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'f1' : lambda y_true, y_score: + sklearn.metrics.f1_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + 'ap' : sklearn.metrics.average_precision_score, + 'roc_auc' : sklearn.metrics.roc_auc_score, + 'accuracy' : lambda y_true, y_score: + sklearn.metrics.accuracy_score( + y_true=y_true, + y_pred=np.round(y_score) + ), + # 'pre_curve' : sklearn.metrics.precision_recall_curve, + # 'roc_curve' : sklearn.metrics.roc_curve, + } + + # print("Compute time for validation metric : ", end="") + # first_it = True + validation_results = {} + for metric_name, metric_function in metrics.items(): + # if first_it: + # first_it = False + # else: + # print(", ", end="") + # metric_compute_start = time_wrap(False) + validation_results[metric_name] = metric_function( + targets, + scores + ) + # metric_compute_end = time_wrap(False) + # met_time = metric_compute_end - metric_compute_start + # 
print("{} {:.4f}".format(metric_name, 1000 * (met_time)), + # end="") + # print(" ms") + gA_test = validation_results['accuracy'] + gL_test = validation_results['loss'] + else: + gA_test = test_accu / test_samp + gL_test = test_loss / test_samp + + is_best = gA_test > best_gA_test + if is_best: + best_gA_test = gA_test + if not (args.save_model == ""): + print("Saving model to {}".format(args.save_model)) + torch.save( + { + "epoch": k, + "nepochs": args.nepochs, + "nbatches": nbatches, + "nbatches_test": nbatches_test, + "iter": j + 1, + "state_dict": dlrm.state_dict(), + "train_acc": gA, + "train_loss": gL, + "test_acc": gA_test, + "test_loss": gL_test, + "total_loss": total_loss, + "total_accu": total_accu, + "opt_state_dict": optimizer.state_dict(), + }, + args.save_model, + ) + + if args.mlperf_logging: + is_best = validation_results['roc_auc'] > best_auc_test + if is_best: + best_auc_test = validation_results['roc_auc'] + + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, k) + + " loss {:.6f}, recall {:.4f}, precision {:.4f},".format( + validation_results['loss'], + validation_results['recall'], + validation_results['precision'] + ) + + " f1 {:.4f}, ap {:.4f},".format( + validation_results['f1'], + validation_results['ap'], + ) + + " auc {:.4f}, best auc {:.4f},".format( + validation_results['roc_auc'], + best_auc_test + ) + + " accuracy {:3.3f} %, best accuracy {:3.3f} %".format( + validation_results['accuracy'] * 100, + best_gA_test * 100 + ) + ) + else: + print( + "Testing at - {}/{} of epoch {},".format(j + 1, nbatches, 0) + + " loss {:.6f}, accuracy {:3.3f} %, best {:3.3f} %".format( + gL_test, gA_test * 100, best_gA_test * 100 + ) + ) + # Uncomment the line below to print out the total time with overhead + # print("Total test time for this group: {}" \ + # .format(time_wrap(use_gpu) - accum_test_time_begin)) + + if (args.mlperf_logging + and (args.mlperf_acc_threshold > 0) + and (best_gA_test > args.mlperf_acc_threshold)): + 
print("MLPerf testing accuracy threshold " + + str(args.mlperf_acc_threshold) + + " reached, stop training") + break + + if (args.mlperf_logging + and (args.mlperf_auc_threshold > 0) + and (best_auc_test > args.mlperf_auc_threshold)): + print("MLPerf testing auc threshold " + + str(args.mlperf_auc_threshold) + + " reached, stop training") + break + + #if (ext_dist.my_rank == 0 and should_print): + # print("ITER : ", j, " from nvidia-smi") + # os.system("nvidia-smi") + + k += 1 # nepochs + + #if (ext_dist.my_rank == 0): + # # print(torch.cuda.memory_allocated(0)) + # print(torch.cuda.memory_summary(0)) + # # print("from nvidia-smi") + # os.system("nvidia-smi") + + tt2 = time.time() + endTime = tt2 - startTime + ext_dist.barrier() + tt3 = time.time() + finalTime = tt3 - startTime + # torch.cuda.profiler.cudart().cudaProfilerStop() + torch.cuda.profiler.stop() + if (skipped > 2): + skipped -= 2 + ext_dist.orig_print("Process {} Done with total time {:.6f} measure time {:.6f}s {:.6f}s, \ + iter {:.1f}ms {:.1f}ms steps {} {}".format(ext_dist.my_rank, tt3 - startTime0, + finalTime, endTime, finalTime*1000.0/skipped, endTime*1000.0/skipped, skipped, tt2), flush=True) + if (ext_dist.my_rank < 2): + tm.tmSummary(ext_dist.my_rank) + + file_prefix = "%s/dlrm_s_pytorch_r%d" % (args.out_dir, ext_dist.my_rank) + # profiling + if args.enable_profiling: + os.makedirs(args.out_dir, exist_ok=True) + with open("TT"+str(uuid.uuid4().hex), "w") as prof_f: + prof_f.write(prof.key_averages(group_by_input_shape=True).table( + sort_by="self_cpu_time_total", + )) + +# with open("%s.prof" % file_prefix, "w") as prof_f: +# prof_f.write(prof.key_averages().table(sort_by="cpu_time_total")) +# prof.export_chrome_trace("./%s.json" % file_prefix) +# print(prof.key_averages().table(sort_by="cpu_time_total")) + + # plot compute graph + if args.plot_compute_graph: + sys.exit( + "ERROR: Please install pytorchviz package in order to use the" + + " visualization. 
Then, uncomment its import above as well as" + + " three lines below and run the code again." + ) + # os.makedirs(args.out_dir, exist_ok=True) + # V = Z.mean() if args.inference_only else E + # dot = make_dot(V, params=dict(dlrm.named_parameters())) + # dot.render('%s_graph' % file_prefix) # write .pdf file + + # test prints + if not args.inference_only and args.debug_mode: + print("updated parameters (weights and bias):") + for param in dlrm.parameters(): + print(param.detach().cpu().numpy()) + + # export the model in onnx + if args.save_onnx: + + dlrm_pytorch_onnx_file = "dlrm_s_pytorch.onnx" + torch.onnx.export( + dlrm, (X_onnx, lS_o_onnx, lS_i_onnx), dlrm_pytorch_onnx_file, verbose=True, use_external_data_format=True + ) + + # recover the model back + dlrm_pytorch_onnx = onnx.load("%s.onnx" % file_prefix) + # check the onnx model + onnx.checker.check_model(dlrm_pytorch_onnx)