Add nightly model tests against pytorch

and migrate gpt2_cpu_init example
ghstack-source-id: 2e503f1305ef4fcc68c606e09cee7b8e6e0d24b3
Pull Request resolved: #1120
pytorch · May 29, 2024 · ee8a724 · ee8a724
1 parent 395801c
commit ee8a724
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 17 deletions.
diff --git a/.github/workflows/gpu_tests.yaml b/.github/workflows/gpu_tests.yaml
@@ -6,7 +6,7 @@ on:
     - main
   pull_request:
     paths:
-      - '.github/workflows/**.yaml'
+      - '.github/workflows/gpu_tests.yaml'
       - 'pippy/**'
       - 'test/**'
       - 'examples/**'

diff --git a/.github/workflows/model_tests.yaml b/.github/workflows/model_tests.yaml
@@ -0,0 +1,85 @@
+name: Model Tests
+# Run models in `examples` folder
+
+on:
+  # Run when this workflow file or any example is changed
+  pull_request:
+    paths:
+      - '.github/workflows/model_tests.yaml'
+      - 'examples/**'
+  # Nightly run against pytorch nightly build
+  schedule:
+    - cron: "30 11 * * *"   # Every day at 11:30 UTC, i.e. 3:30 am PST / 4:30 am PDT
+
+concurrency:
+  # Cancel CI on previous commit when a new commit is pushed to the same branch
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+defaults:
+  run:
+    shell: bash -l -eo pipefail {0}
+
+jobs:
+  model_tests_4gpu:
+    runs-on: linux.g5.12xlarge.nvidia.gpu
+    strategy:
+      matrix:
+        python-version: ['3.10']
+    steps:
+      - name: Check out repo
+        uses: actions/checkout@v3
+      - name: Setup conda env
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          miniconda-version: "latest"
+          activate-environment: test
+          python-version: ${{ matrix.python-version }}
+      - name: Activate conda env
+        run: conda activate test
+      - name: Install dependencies
+        run: |
+          pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
+      - name: Install Transformers for getting models
+        run: pip install transformers
+      - name: Run GPT2
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py
+      - name: Run BERT
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py
+      - name: Run blenderbot
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_blenderbot.py
+      - name: Run camemBert
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_camemBert.py
+      - name: Run convBert
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_convBert.py
+      - name: Run deberta
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_deberta.py
+      - name: Run debertaV2
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_debertaV2.py
+      - name: Run distilBert
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_distilBert.py
+      - name: Run electra
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_electra.py
+      - name: Run fnet
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_fnet.py
+      - name: Run gptNeo
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gptNeo.py
+      - name: Run layoutLM
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_layoutLM.py
+      - name: Run mbart
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_mbart.py
+      - name: Run megatronBert
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_megatronBert.py
+      - name: Run mobileBert
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_mobileBert.py
+      - name: Run opt
+        run: torchrun --nproc-per-node 2 examples/huggingface/pippy_opt.py
+      - name: Run trOCR
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_trOCR.py
+      - name: Run unet
+        run: torchrun --nproc-per-node 2 examples/huggingface/pippy_unet.py
+      - name: Run xlnet
+        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_xlnet.py
+      - name: Test CPU init + GPU run
+        run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py
diff --git a/.github/workflows/pippy_tests.yaml b/.github/workflows/pippy_tests.yaml
@@ -6,7 +6,7 @@ on:
     - main
   pull_request:
     paths:
-      - '.github/workflows/**.yaml'
+      - '.github/workflows/pippy_tests.yaml'
       - 'pippy/**'
       - 'test/**'
       - 'examples/**'

diff --git a/examples/cpu_init/gpt2_cpu_init.py b/examples/cpu_init/gpt2_cpu_init.py
@@ -8,20 +8,11 @@
 
 import torch
 import torch.distributed as dist
-
-from pippy import pipeline, PipelineStage, SplitPoint, annotate_split_points
-from pippy.PipelineSchedule import ScheduleGPipe
+from torch.distributed.pipelining import pipeline, PipelineStage, ScheduleGPipe, SplitPoint
 
 from transformers import GPT2ForSequenceClassification, GPT2Config
 
 
-def add_split_points(gpt2, nranks):
-    layers_per_rank = gpt2.config.num_hidden_layers // nranks
-    for i in range(1, nranks):
-        annotate_split_points(
-            gpt2, {f"transformer.h.{i * layers_per_rank}": SplitPoint.BEGINNING})
-
-
 def run(args):
     # Model configs
     config = GPT2Config()
@@ -45,20 +36,27 @@ def run(args):
         requires_grad=False,
     )
 
-    # Annotate split points
-    add_split_points(gpt2, args.world_size)
+    # Split spec
+    decoders_per_rank = (gpt2.config.n_layer + args.world_size - 1) // args.world_size
+    print(f"decoders_per_rank = {decoders_per_rank}")
+    split_spec = {
+        f'transformer.h.{i * decoders_per_rank}': SplitPoint.BEGINNING
+        for i in range(1, args.world_size)
+    }
 
     # Create pipeline
-    gpt2_pipe = pipeline(
+    pipe = pipeline(
         gpt2,
         num_chunks=args.chunks,
         example_args=(example_input,),
+        split_spec=split_spec,
     )
-    assert gpt2_pipe.num_stages == args.world_size, f"nstages = {gpt2_pipe.num_stages} nranks = {args.world_size}"
+
+    assert pipe.num_stages == args.world_size, f"nstages = {pipe.num_stages} nranks = {args.world_size}"
 
     # Create schedule runtime
     stage = PipelineStage(
-        gpt2_pipe,
+        pipe,
         args.rank,
         device=args.device,
     )