From 3d919022c91798a1d47409b63a0babb49dc006f6 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 8 Oct 2024 13:58:31 -0700 Subject: [PATCH 1/7] Repro for #3129. --- tests/python/test_transformer_engine.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index 239ab0a4df2..19e5ca90fa1 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -32,11 +32,8 @@ class ComputeType(Enum): @pytest.mark.mpi @pytest.mark.parametrize( "compute_type", - # TODO(#3119): add the backward test back. - # [ComputeType.FORWARD, ComputeType.BACKWARD], - # ids=["forward", "backward"], - [ComputeType.FORWARD], - ids=["forward"], + [ComputeType.FORWARD, ComputeType.BACKWARD], + ids=["forward", "backward"], ) def test_transformer_layer(mpi_test, benchmark, compute_type): # Hyperparameters for GPT-3 From fc42a5331417331de83d0b2983c95112d5fbdc9c Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 8 Oct 2024 22:23:07 -0700 Subject: [PATCH 2/7] Init and destroy once. --- tests/python/mpi_fixtures.py | 2 +- tests/python/test_transformer_engine.py | 28 +++++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/python/mpi_fixtures.py b/tests/python/mpi_fixtures.py index b9731bdc533..42ba043f618 100644 --- a/tests/python/mpi_fixtures.py +++ b/tests/python/mpi_fixtures.py @@ -34,7 +34,7 @@ def barrier(self): self._communicator.barrier() -@pytest.fixture +@pytest.fixture(scope="session") def mpi_test(): fixture = MpiTest() yield fixture diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index 19e5ca90fa1..adfe1132e52 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -23,6 +23,21 @@ class ComputeType(Enum): BACKWARD = auto() +@pytest.fixture(scope="module") +def process_group(mpi_test) -> None: + os.environ["MASTER_ADDR"] = "localhost" + # The default port as used by https://github.com/pytorch/pytorch/blob/45a8b5682eb69d865cbf68c7f2f689b56b4efd53/torch/csrc/distributed/c10d/TCPStore.hpp#L51. + os.environ["MASTER_PORT"] = "29500" + dist.init_process_group( + backend="nccl", + init_method="env://", + world_size=mpi_test.size, + rank=mpi_test.rank, + ) + yield + dist.destroy_process_group() + + # This benchmark is instrumented with cudaProfilerStart/Stop. Therefore, one # can collect stats of the first few non-warmup benchmark iterations using # @@ -35,7 +50,7 @@ class ComputeType(Enum): [ComputeType.FORWARD, ComputeType.BACKWARD], ids=["forward", "backward"], ) -def test_transformer_layer(mpi_test, benchmark, compute_type): +def test_transformer_layer(mpi_test, process_group, benchmark, compute_type): # Hyperparameters for GPT-3 hidden_size = 12288 num_heads = 96 @@ -48,15 +63,6 @@ def test_transformer_layer(mpi_test, benchmark, compute_type): rank = mpi_test.rank torch.cuda.set_device(rank) - os.environ["MASTER_ADDR"] = "localhost" - # The default port as used by https://github.com/pytorch/pytorch/blob/45a8b5682eb69d865cbf68c7f2f689b56b4efd53/torch/csrc/distributed/c10d/TCPStore.hpp#L51. - os.environ["MASTER_PORT"] = "29500" - dist.init_process_group( - backend="nccl", - init_method="env://", - world_size=size, - rank=rank, - ) tp_group = dist.new_group() transformer_layer = te.TransformerLayer( @@ -129,4 +135,4 @@ def benchmark_fn(y, dy, profile): rounds=5, ) - dist.destroy_process_group() + dist.destroy_process_group(group=tp_group) From 6f9e66e0184fce7488281b699289abc141d335ed Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Tue, 8 Oct 2024 22:30:12 -0700 Subject: [PATCH 3/7] rename --- tests/python/test_transformer_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index adfe1132e52..f089432fa6d 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -24,7 +24,7 @@ class ComputeType(Enum): @pytest.fixture(scope="module") -def process_group(mpi_test) -> None: +def setup_process_group(mpi_test) -> None: os.environ["MASTER_ADDR"] = "localhost" # The default port as used by https://github.com/pytorch/pytorch/blob/45a8b5682eb69d865cbf68c7f2f689b56b4efd53/torch/csrc/distributed/c10d/TCPStore.hpp#L51. os.environ["MASTER_PORT"] = "29500" @@ -50,7 +50,7 @@ def process_group(mpi_test) -> None: [ComputeType.FORWARD, ComputeType.BACKWARD], ids=["forward", "backward"], ) -def test_transformer_layer(mpi_test, process_group, benchmark, compute_type): +def test_transformer_layer(mpi_test, setup_process_group, benchmark, compute_type): # Hyperparameters for GPT-3 hidden_size = 12288 num_heads = 96 From eb58a4027c97a942b7956805078233398a7a6747 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 9 Oct 2024 12:18:10 -0700 Subject: [PATCH 4/7] Use world. --- tests/python/test_transformer_engine.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index f089432fa6d..d4c2c3d267f 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -63,14 +63,13 @@ def test_transformer_layer(mpi_test, setup_process_group, benchmark, compute_typ rank = mpi_test.rank torch.cuda.set_device(rank) - tp_group = dist.new_group() transformer_layer = te.TransformerLayer( hidden_size, ffn_hidden_size, num_heads, set_parallel_mode=True, - tp_group=tp_group, + tp_group=dist.group.WORLD, ) transformer_layer.to(dtype).to("cuda") @@ -134,5 +133,3 @@ def benchmark_fn(y, dy, profile): setup=partial(setup_fn, True), rounds=5, ) - - dist.destroy_process_group(group=tp_group) From 07b089870f9a4b29940dedc2b25f404ff5c12a2f Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 9 Oct 2024 12:24:26 -0700 Subject: [PATCH 5/7] Get rank and size from the default process group. --- tests/python/test_transformer_engine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index d4c2c3d267f..862e4531a88 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -50,7 +50,7 @@ def setup_process_group(mpi_test) -> None: [ComputeType.FORWARD, ComputeType.BACKWARD], ids=["forward", "backward"], ) -def test_transformer_layer(mpi_test, setup_process_group, benchmark, compute_type): +def test_transformer_layer(setup_process_group, benchmark, compute_type): # Hyperparameters for GPT-3 hidden_size = 12288 num_heads = 96 @@ -59,8 +59,8 @@ def test_transformer_layer(mpi_test, setup_process_group, benchmark, compute_typ sequence_length = 2048 dtype = torch.bfloat16 - size = mpi_test.size - rank = mpi_test.rank + size = dist.get_world_size() + rank = dist.get_rank() torch.cuda.set_device(rank) From 77f3e2ee5b941ec58d662b295b07d403f6375952 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 9 Oct 2024 13:33:26 -0700 Subject: [PATCH 6/7] Simplify initialization. --- tests/python/test_transformer_engine.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index 862e4531a88..69237dca2a1 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -25,12 +25,10 @@ class ComputeType(Enum): @pytest.fixture(scope="module") def setup_process_group(mpi_test) -> None: - os.environ["MASTER_ADDR"] = "localhost" # The default port as used by https://github.com/pytorch/pytorch/blob/45a8b5682eb69d865cbf68c7f2f689b56b4efd53/torch/csrc/distributed/c10d/TCPStore.hpp#L51. - os.environ["MASTER_PORT"] = "29500" dist.init_process_group( backend="nccl", - init_method="env://", + init_method="tcp://localhost:29500", world_size=mpi_test.size, rank=mpi_test.rank, ) From b1b87e95c5cf6fe1553bdf8c983a291e599e0653 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Wed, 9 Oct 2024 18:41:40 -0700 Subject: [PATCH 7/7] Fix lint. --- tests/python/test_transformer_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/test_transformer_engine.py b/tests/python/test_transformer_engine.py index 69237dca2a1..05189e56675 100644 --- a/tests/python/test_transformer_engine.py +++ b/tests/python/test_transformer_engine.py @@ -2,7 +2,6 @@ # All rights reserved. # SPDX-License-Identifier: BSD-3-Clause -import os import pytest import torch import torch.distributed as dist