A graph-based pipeline splitting #322

Workflow file for this run

.github/workflows/gpu_tests.yaml at 5f62c99

	name: GPU Tests

	# Triggers: pushes to main, and pull requests that touch the listed paths.
	on:
	  push:
	    branches:
	      - main
	  pull_request:
	    # Patterns prefixed with `!` are exclusions applied to the earlier matches.
	    paths:
	      - '.github/workflows/**.yaml'
	      - 'pippy/**'
	      - 'test/**'
	      - 'examples/**'
	      - '!docs/**'
	      - '!**.md'
	      - 'requirements.txt'

	concurrency:
	  # Cancel CI on previous commit when a new commit is pushed to the same branch.
	  # The group key is the PR number when there is one, otherwise the git ref.
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	# Default shell for every `run` step in this workflow:
	#   -l           login shell (the form setup-miniconda documents for conda envs)
	#   -e           stop at the first failing command
	#   -o pipefail  a pipeline fails if any stage of it fails
	defaults:
	  run:
	    shell: bash -l -eo pipefail {0}

	jobs:
	  # Forward-only tests and examples, launched through torchrun.
	  forward_tests_4gpu:
	    runs-on: linux.g5.12xlarge.nvidia.gpu
	    strategy:
	      matrix:
	        python-version: ['3.10']
	    steps:
	      - name: Check out repo
	        uses: actions/checkout@v3
	      - name: Setup conda env
	        uses: conda-incubator/setup-miniconda@v2
	        with:
	          auto-update-conda: true
	          miniconda-version: "latest"
	          activate-environment: test
	          python-version: ${{ matrix.python-version }}
	      # NOTE(review): each `run` step starts its own shell, so this activation
	      # presumably does not carry over to later steps; they likely rely on the
	      # login-shell default env from setup-miniconda. Confirm before removing.
	      - name: Activate conda env
	        run: conda activate test
	      - name: Install dependencies
	        run: |
	          pip install numpy expecttest
	          pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
	      - name: Install pippy
	        run: python setup.py install
	      - name: Run forward-only integration test
	        run: torchrun --nproc-per-node 4 test/test_fwd.py
	      - name: Run auto-split test
	        run: torchrun --nproc-per-node 4 test/test_autosplit.py
	      - name: Test skip connection support
	        run: torchrun --nproc-per-node 4 test/test_skip_conn.py
	      - name: Test trace on different device
	        run: torchrun --nproc-per-node 2 test/test_cpu_init.py
	      - name: Test Composability
	        run: python test/test_composability.py
	      - name: Run example
	        run: torchrun --nproc-per-node 3 examples/basic/example.py
	      - name: Run example with manual stage
	        run: torchrun --nproc-per-node 3 examples/basic/example_manual_stage.py
	      - name: Run training example
	        run: torchrun --nproc-per-node 3 examples/basic/example_train.py
	      - name: Install Transformers for real model tests
	        run: pip install transformers
	      - name: Run GPT2
	        run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py
	      - name: Test CPU init + GPU run
	        run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py
	      # Disabled real-model runs, kept for reference.
	      # - name: Run T5
	      #   run: torchrun --nproc-per-node 2 examples/huggingface/pippy_t5.py
	      # - name: Run BERT
	      #   run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py

	  # Forward+backward tests, run once per pipeline schedule in the matrix.
	  backward_tests_4gpu:
	    runs-on: linux.g5.12xlarge.nvidia.gpu
	    strategy:
	      matrix:
	        python-version: ['3.10']
	        schedule: ["gpipe", "1f1b"]
	    steps:
	      - name: Check out repo
	        uses: actions/checkout@v3
	      - name: Setup conda env
	        uses: conda-incubator/setup-miniconda@v2
	        with:
	          auto-update-conda: true
	          miniconda-version: "latest"
	          activate-environment: test
	          python-version: ${{ matrix.python-version }}
	      # NOTE(review): same caveat as the forward job; this activation presumably
	      # does not persist beyond this step.
	      - name: Activate conda env
	        run: conda activate test
	      # Literal block scalar (`|`) keeps the two pip commands on separate lines.
	      # A plain multi-line scalar would be folded into one command line.
	      # NOTE(review): unlike the forward job, this install omits `--pre` and
	      # `expecttest`; confirm whether that difference is intentional.
	      - name: Install dependencies
	        run: |
	          pip install numpy
	          pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html
	      - name: Install pippy
	        run: python setup.py install
	      - name: Run forward-backward test
	        run: torchrun --nproc-per-node 4 test/test_bwd.py --schedule ${{ matrix.schedule }}
	      - name: Run optimizer test
	        run: torchrun --nproc-per-node 4 test/test_optim.py --schedule ${{ matrix.schedule }}
	      - name: Test gradient equivalence
	        run: torchrun --nproc-per-node 4 test/test_grad.py --schedule ${{ matrix.schedule }}
	      - name: Test interleaving schedules
	        run: torchrun --nproc-per-node 4 test/test_interleave.py --schedule ${{ matrix.schedule }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

A graph-based pipeline splitting #322

Workflow file

A graph-based pipeline splitting #322

Jobs

Run details

Workflow file for this run