Attempt to extend nccl collective timeout (#858)
Summary:
Pull Request resolved: #858

We have two remaining tests that still fail with the following error message:

```
[Rank 1] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=1, OpType=BROADCAST, NumelIn=2, NumelOut=2, Timeout(ms)=60000) ran for 60033 milliseconds before timing out.
```

Let's attempt to increase the collective timeout for those tests. There's no guarantee this will work, but it's worth trying. Otherwise, we may consider deleting the failing tests to avoid flakiness.
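
For context, the 60000 ms limit in the watchdog message is the `timeout` the test harness passes to `torch.distributed.init_process_group`; extending that value is exactly what the `timeout_s` plumbing below does. A minimal sketch of the mechanism, with an illustrative rendezvous address and port that are not part of this commit:

```
# Minimal sketch (not from this commit): the NCCL watchdog enforces whatever
# timeout the process group was initialized with, so raising it here is the
# lever the tests pull via timeout_s.
from datetime import timedelta

import torch.distributed as dist

dist.init_process_group(
    backend="nccl",
    init_method="tcp://127.0.0.1:29500",  # illustrative rendezvous endpoint
    rank=0,                               # each spawned worker passes its own rank
    world_size=2,
    timeout=timedelta(seconds=180),       # these tests previously used 60 s here
)
```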

Reviewed By: galrotem

Differential Revision: D59342738

fbshipit-source-id: 220f1f359eb0f98e5175e93badc7e998ae00db64
diego-urgell authored and facebook-github-bot committed Jul 3, 2024
1 parent 5dad8d3 commit 58b6ea7
Showing 3 changed files with 17 additions and 5 deletions.
4 changes: 1 addition & 3 deletions tests/utils/test_checkpoint_gpu.py

```
@@ -24,9 +24,7 @@ class TestCheckpointUtilsGPU(unittest.TestCase):
     @skip_if_not_gpu
     def test_get_checkpoint_dirpaths_distributed(self) -> None:
         spawn_multi_process(
-            2,
-            "nccl",
-            self._test_get_checkpoint_dirpaths,
+            2, "nccl", self._test_get_checkpoint_dirpaths, timeout_s=180
         )

     @staticmethod
```
1 change: 1 addition & 0 deletions tests/utils/test_distributed_gpu.py

```
@@ -52,6 +52,7 @@ def test_pg_wrapper_scatter_object_list_nccl(self) -> None:
             2,
             "nccl",
             self._test_pg_wrapper_scatter_object_list,
+            timeout_s=180,
         )

     @classmethod
```
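
Both test-site changes follow the same call pattern. Below is a hedged, self-contained sketch of what such a call looks like outside the test classes; the `_all_reduce_sum` helper and the one-GPU-per-rank assumption are illustrative, not code from this commit:

```
# Illustrative usage of spawn_multi_process with the extended 180 s timeout.
import torch
import torch.distributed as dist

from torchtnt.utils.distributed import spawn_multi_process


def _all_reduce_sum() -> int:
    # Assumes one visible GPU per rank, mapped by rank index.
    device = torch.device(f"cuda:{dist.get_rank()}")
    t = torch.ones(1, device=device)
    dist.all_reduce(t)  # this collective is covered by the 180 s timeout
    return int(t.item())


if __name__ == "__main__":
    # timeout_s is popped from the kwargs before _all_reduce_sum runs,
    # so the method itself never sees it; per-rank return values come back.
    results = spawn_multi_process(2, "nccl", _all_reduce_sum, timeout_s=180)
    print(results)  # expected: [2, 2] (sum of ones across 2 ranks, per rank)
```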
17 changes: 15 additions & 2 deletions torchtnt/utils/distributed.py

```
@@ -518,6 +518,7 @@ class ProcessGroupSetupParams:
     backend: str
     port: str
     world_size: int
+    timeout_s: int


 def spawn_multi_process(
@@ -538,6 +539,11 @@
         method_args: args for the method
         method_kwargs: kwargs for the method

+    Note:
+        The default timeout used for distributed collectives in the process group is 60 seconds.
+        This can be overridden by passing a `timeout_s` key in the `method_kwargs`. It will be
+        extracted before passing to the method call.
+
     Returns:
         A list, l, where l[i] is the return value of method(*method_args, **methods_kwargs) on rank i
     """
@@ -550,7 +556,12 @@
         # https://pytorch.org/docs/stable/multiprocessing.html#torch.multiprocessing.spawn
         _init_pg_and_rank_and_launch_method,
         args=(
-            ProcessGroupSetupParams(backend=backend, port=port, world_size=world_size),
+            ProcessGroupSetupParams(
+                backend=backend,
+                port=port,
+                world_size=world_size,
+                timeout_s=method_kwargs.pop("timeout_s", 60),
+            ),
             mp_output_dict,
             method,
             method_args,
@@ -582,7 +593,9 @@ def _init_pg_and_rank_and_launch_method(
         rank=rank,
         world_size=pg_setup_params.world_size,
         backend=pg_setup_params.backend,
-        timeout=timedelta(seconds=60),  # setting up timeout for distributed collectives
+        timeout=timedelta(  # setting up timeout for distributed collectives
+            seconds=pg_setup_params.timeout_s
+        ),
     )
     try:
         # pyre-ignore: spawn_multi_process uses unsafe types to begin with
```
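
One detail worth calling out from the docstring Note above: `timeout_s` is consumed with `dict.pop`, so it configures the process group and is stripped before the test method is invoked, and 60 remains the default when the key is absent. A plain-Python illustration:

```
# Behavior of method_kwargs.pop("timeout_s", 60) as used above.
method_kwargs = {"timeout_s": 180, "other_kwarg": "value"}

timeout_s = method_kwargs.pop("timeout_s", 60)
print(timeout_s)      # 180 -> forwarded to ProcessGroupSetupParams
print(method_kwargs)  # {'other_kwarg': 'value'} -> what the method receives

# Without the key, the previous 60 s default is preserved.
print({}.pop("timeout_s", 60))  # 60
```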
