Skip to content

A graph-based pipeline splitting #322

A graph-based pipeline splitting

A graph-based pipeline splitting #322

Workflow file for this run

# GPU test workflow: runs on pushes to main and on pull requests that touch
# any of the paths listed under `pull_request.paths`.
name: GPU Tests

on:
  push:
    branches:
      - main
  pull_request:
    # Entries prefixed with '!' exclude matching paths from the filter.
    paths:
      - '.github/workflows/**.yaml'
      - 'pippy/**'
      - 'test/**'
      - 'examples/**'
      - '!docs/**'
      - '!**.md'
      - 'requirements.txt'

concurrency:
  # Cancel CI on previous commit when a new commit is pushed to the same branch
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

defaults:
  run:
    # Login shell (-l) that aborts on the first failing command (-e) or
    # failing pipeline stage (-o pipefail); applies to every `run` step.
    shell: bash -l -eo pipefail {0}
jobs:
  # Forward-pass tests and examples. Steps launch between 2 and 4 processes
  # per node via torchrun.
  forward_tests_4gpu:
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        # Quoted so the version stays the string "3.10" instead of the float 3.1.
        python-version: ['3.10']
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      # NOTE(review): each `run` step starts its own shell, so this activation
      # does not carry over to later steps; presumably the login shell already
      # activates `test` -- confirm whether this step is still needed.
      - name: Activate conda env
        run: conda activate test
      - name: Install dependencies
        run: |
          pip install numpy expecttest
          pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
      - name: Install pippy
        run: python setup.py install
      - name: Run forward-only integration test
        run: torchrun --nproc-per-node 4 test/test_fwd.py
      - name: Run auto-split test
        run: torchrun --nproc-per-node 4 test/test_autosplit.py
      - name: Test skip connection support
        run: torchrun --nproc-per-node 4 test/test_skip_conn.py
      - name: Test trace on different device
        run: torchrun --nproc-per-node 2 test/test_cpu_init.py
      - name: Test Composability
        run: python test/test_composability.py
      - name: Run example
        run: torchrun --nproc-per-node 3 examples/basic/example.py
      - name: Run example with manual stage
        run: torchrun --nproc-per-node 3 examples/basic/example_manual_stage.py
      - name: Run training example
        run: torchrun --nproc-per-node 3 examples/basic/example_train.py
      - name: Install Transformers for real model tests
        run: pip install transformers
      - name: Run GPT2
        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py
      - name: Test CPU init + GPU run
        run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py
      # - name: Run T5
      #   run: torchrun --nproc-per-node 2 examples/huggingface/pippy_t5.py
      # - name: Run BERT
      #   run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py

  # Forward-backward tests. The matrix runs the whole job once per schedule
  # value, which each test receives through its --schedule flag.
  backward_tests_4gpu:
    runs-on: linux.g5.12xlarge.nvidia.gpu
    strategy:
      matrix:
        python-version: ['3.10']
        schedule: ["gpipe", "1f1b"]
    steps:
      - name: Check out repo
        uses: actions/checkout@v3
      - name: Setup conda env
        uses: conda-incubator/setup-miniconda@v2
        with:
          auto-update-conda: true
          miniconda-version: "latest"
          activate-environment: test
          python-version: ${{ matrix.python-version }}
      # NOTE(review): each `run` step starts its own shell, so this activation
      # does not carry over to later steps; presumably the login shell already
      # activates `test` -- confirm whether this step is still needed.
      - name: Activate conda env
        run: conda activate test
      - name: Install dependencies
        # Literal block scalar (`|`) keeps one pip command per line. A plain
        # multi-line scalar is folded into a single line, which would run
        # `pip install numpy pip install -r requirements.txt ...` as one command.
        run: |
          pip install numpy
          pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
      - name: Install pippy
        run: python setup.py install
      - name: Run forward-backward test
        run: torchrun --nproc-per-node 4 test/test_bwd.py --schedule ${{ matrix.schedule }}
      - name: Run optimizer test
        run: torchrun --nproc-per-node 4 test/test_optim.py --schedule ${{ matrix.schedule }}
      - name: Test gradient equivalence
        run: torchrun --nproc-per-node 4 test/test_grad.py --schedule ${{ matrix.schedule }}
      - name: Test interleaving schedules
        run: torchrun --nproc-per-node 4 test/test_interleave.py --schedule ${{ matrix.schedule }}