From 97132d6465d28c17d74401ed72d44549522a019e Mon Sep 17 00:00:00 2001 From: Sourav Chakraborty Date: Thu, 31 Mar 2022 10:03:52 -0500 Subject: [PATCH] Allow benchmark to run on more than 4 nodes dist.all_to_all_single(t, t) will fail if the length of t is less than the number of nodes. Increasing it to 1024 allows it to run on up to 1024 nodes. --- extend_distributed.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extend_distributed.py b/extend_distributed.py index 1f2c8a53..c6b2b9c9 100644 --- a/extend_distributed.py +++ b/extend_distributed.py @@ -164,7 +164,7 @@ def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend="") print("Running on %d ranks using %s backend" % (my_size, backend)) if hasattr(dist, "all_to_all_single"): try: - t = torch.zeros([4]) + t = torch.zeros([1024]) if use_gpu: t = t.cuda() dist.all_to_all_single(t, t)