NVIDIA · wujingyue · Dec 14, 2024
diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py
@@ -59,7 +59,22 @@ def setup_process_group(mpi_test) -> None:
     [Parallelism.TENSOR_PARALLEL, Parallelism.SEQUENCE_PARALLEL],
     ids=["tp", "sp"],
 )
-def test_transformer_layer(setup_process_group, benchmark, compute_type, parallelism):
+@pytest.mark.parametrize(
+    "overlap",
+    [False, True],
+    ids=["nonoverlap", "overlap"],
+)
+def test_transformer_layer(
+    setup_process_group,
+    monkeypatch,
+    benchmark,
+    compute_type: ComputeType,
+    parallelism: Parallelism,
+    overlap: bool,
+):
+    if overlap and parallelism == Parallelism.TENSOR_PARALLEL:
+        pytest.skip("Tensor parallelism doesn't support overlapping")
+
     # Hyperparameters for GPT-3
     hidden_size = 12288
     num_heads = 96
@@ -82,6 +97,7 @@ def test_transformer_layer(setup_process_group, benchmark, compute_type, paralle
         attn_input_format="bshd",
         set_parallel_mode=True,
         sequence_parallel=(parallelism == Parallelism.SEQUENCE_PARALLEL),
+        ub_tp_comm_overlap=overlap,
         tp_group=dist.group.WORLD,
     )
     transformer_layer.to(dtype).to("cuda")
@@ -97,6 +113,20 @@ def test_transformer_layer(setup_process_group, benchmark, compute_type, paralle
         batch_size, local_sequence_length, hidden_size, dtype=dtype, device="cuda"
     )
 
+    if overlap:
+        # Similar to https://github.com/NVIDIA/TransformerEngine/blob/e7bfc0c547d63332e4f8d65e606dc69f4c22ffbe/examples/pytorch/comm_gemm_overlap/te_layer_with_overlap.py#L27-L29
+        monkeypatch.setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1")
+        if not te.cpp_extensions.device_supports_multicast():
+            monkeypatch.setenv("UB_SKIPMC", "1")
+
+        te.module.base.initialize_ub(
+            # Instructed by https://github.com/NVIDIA/TransformerEngine/blob/e7bfc0c547d63332e4f8d65e606dc69f4c22ffbe/transformer_engine/pytorch/module/base.py#L96-L99
+            [batch_size * sequence_length, hidden_size],
+            size,
+            dtype=dtype,
+            bootstrap_backend="nccl",
+        )
+
     match compute_type:
         case ComputeType.FORWARD:
 
@@ -155,3 +185,6 @@ def benchmark_fn(y, dy, profile):
                 setup=partial(setup_fn, True),
                 rounds=5,
             )
+
+    if overlap:
+        te.module.base.destroy_ub()