Skip to content

Commit

Permalink
Add nightly model tests against pytorch
Browse files Browse the repository at this point in the history
and migrate gpt2_cpu_init example

ghstack-source-id: 2e503f1305ef4fcc68c606e09cee7b8e6e0d24b3
Pull Request resolved: #1120
  • Loading branch information
kwen2501 committed May 29, 2024
1 parent 395801c commit ee8a724
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 17 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gpu_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- main
pull_request:
paths:
- '.github/workflows/**.yaml'
- '.github/workflows/gpu_tests.yaml'
- 'pippy/**'
- 'test/**'
- 'examples/**'
Expand Down
85 changes: 85 additions & 0 deletions .github/workflows/model_tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
name: Model Tests
# Runs every example model under `examples/` on a 4-GPU runner.
# Triggered by pull requests that touch the examples (or this workflow),
# and once a day so the examples are exercised against the PyTorch nightly
# wheels installed in the "Install dependencies" step.

on:
  # Run when any example is changed
  pull_request:
    paths:
      - '.github/workflows/model_tests.yaml'
      - 'examples/**'
  # Nightly run against pytorch nightly build
  schedule:
    # Every day at 11:30 UTC, i.e. 3:30 am PST / 4:30 am PDT
    - cron: "30 11 * * *"

concurrency:
  # Cancel CI on previous commit when a new commit is pushed to the same branch.
  # Pull requests are grouped by PR number; other events (e.g. the nightly
  # schedule) fall back to the git ref.
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    # Login shell (-l): setup-miniconda documents that bash must be a login
    # shell for the conda environment to be available in `run` steps.
    # -e / -o pipefail: fail the step on the first failing command or pipe stage.
    shell: bash -l -eo pipefail {0}

jobs:
  model_tests_4gpu:
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so YAML does not read 3.10 as the float 3.1
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v4
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v3
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      - name: Activate conda env
        run: conda activate test
      - name: Install dependencies
        # --pre together with the nightly wheel index lets pip pick
        # pre-release (nightly) builds for the packages in requirements.txt.
        run: |
          pip install --pre -r requirements.txt \
            --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
      - name: Install Transformers for getting models
        run: pip install transformers
      # One step per example so a failure is attributed to a specific model.
      # Most examples use 4 processes; `opt` and `unet` use 2.
      - name: Run GPT2
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py
      - name: Run BERT
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py
      - name: Run blenderbot
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_blenderbot.py
      - name: Run camemBert
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_camemBert.py
      - name: Run convBert
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_convBert.py
      - name: Run deberta
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_deberta.py
      - name: Run debertaV2
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_debertaV2.py
      - name: Run distilBert
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_distilBert.py
      - name: Run electra
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_electra.py
      - name: Run fnet
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_fnet.py
      - name: Run gptNeo
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gptNeo.py
      - name: Run layoutLM
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_layoutLM.py
      - name: Run mbart
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_mbart.py
      - name: Run megatronBert
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_megatronBert.py
      - name: Run mobileBert
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_mobileBert.py
      - name: Run opt
        run: torchrun --nproc-per-node 2 examples/huggingface/pippy_opt.py
      - name: Run trOCR
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_trOCR.py
      - name: Run unet
        run: torchrun --nproc-per-node 2 examples/huggingface/pippy_unet.py
      - name: Run xlnet
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_xlnet.py
      - name: Test CPU init + GPU run
        run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py
2 changes: 1 addition & 1 deletion .github/workflows/pippy_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- main
pull_request:
paths:
- '.github/workflows/**.yaml'
- '.github/workflows/pippy_tests.yaml'
- 'pippy/**'
- 'test/**'
- 'examples/**'
Expand Down
28 changes: 13 additions & 15 deletions examples/cpu_init/gpt2_cpu_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,11 @@

import torch
import torch.distributed as dist

from pippy import pipeline, PipelineStage, SplitPoint, annotate_split_points
from pippy.PipelineSchedule import ScheduleGPipe
from torch.distributed.pipelining import pipeline, PipelineStage, ScheduleGPipe, SplitPoint

from transformers import GPT2ForSequenceClassification, GPT2Config


def add_split_points(gpt2, nranks):
    """Annotate `gpt2` with one pipeline split point per rank after the first.

    The number of hidden layers (``gpt2.config.num_hidden_layers``) is
    floor-divided by ``nranks`` to get a per-rank stride; for every rank
    index 1..nranks-1 the module ``transformer.h.<index * stride>`` is marked
    with ``SplitPoint.BEGINNING`` via ``annotate_split_points``.

    Args:
        gpt2: model exposing ``config.num_hidden_layers``.
        nranks: number of ranks to divide the layers across.
    """
    stride = gpt2.config.num_hidden_layers // nranks
    for stage_idx in range(1, nranks):
        first_layer = stage_idx * stride
        spec = {f"transformer.h.{first_layer}": SplitPoint.BEGINNING}
        annotate_split_points(gpt2, spec)


def run(args):
# Model configs
config = GPT2Config()
Expand All @@ -45,20 +36,27 @@ def run(args):
requires_grad=False,
)

# Annotate split points
add_split_points(gpt2, args.world_size)
# Split spec
decoders_per_rank = (gpt2.config.n_layer + args.world_size - 1) // args.world_size
print(f"decoders_per_rank = {decoders_per_rank}")
split_spec = {
f'transformer.h.{i * decoders_per_rank}': SplitPoint.BEGINNING
for i in range(1, args.world_size)
}

# Create pipeline
gpt2_pipe = pipeline(
pipe = pipeline(
gpt2,
num_chunks=args.chunks,
example_args=(example_input,),
split_spec=split_spec,
)
assert gpt2_pipe.num_stages == args.world_size, f"nstages = {gpt2_pipe.num_stages} nranks = {args.world_size}"

assert pipe.num_stages == args.world_size, f"nstages = {pipe.num_stages} nranks = {args.world_size}"

# Create schedule runtime
stage = PipelineStage(
gpt2_pipe,
pipe,
args.rank,
device=args.device,
)
Expand Down

0 comments on commit ee8a724

Please sign in to comment.