demo training/inference script #1034

Workflow file for this run

.github/workflows/pippy_tests.yaml at d6506ed

	name: PiPPy tests

	# Triggers: every push to main, and pull requests that touch the paths below.
	on:
	  push:
	    branches:
	      - main
	  pull_request:
	    # Patterns are evaluated in order; a leading '!' excludes matching paths.
	    paths:
	      - '.github/workflows/pippy**'
	      - 'pippy/**'
	      - 'test/**'
	      - 'examples/**'
	      - '!docs/**'
	      - '!**.md'
	      - 'requirements.txt'

	concurrency:
	  # Cancel CI on previous commit when a new commit is pushed to the same branch.
	  # The group key is the PR number when there is one, otherwise the git ref.
	  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
	  cancel-in-progress: true

	# NOTE(review): every job checks out with actions/checkout@v2, an old major
	# version; consider upgrading once runner compatibility is confirmed.
	jobs:

	  # Unit tests: runs pytest over test/ with coverage for the pippy package,
	  # once per Python version, inside the matching python:<version> container.
	  pytest_tests:
	    runs-on: linux.4xlarge
	    strategy:
	      matrix:
	        python-version: ["3.8", "3.9"]
	    container:
	      image: python:${{ matrix.python-version }}

	    steps:
	      - uses: actions/checkout@v2
	      - name: Install dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install flake8 pytest pytest-cov pytest-xdist numpy
	          if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	      # Installs upstream huggingface/transformers from its main branch.
	      - name: Install Hugging Face transformers (main branch)
	        run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
	      - name: Install pippy
	        run: python setup.py install
	      # hf_test.py and the fx test files/directory are excluded from this run.
	      - name: Test with pytest
	        run: |
	          pytest --cov=pippy --ignore=test/hf_test.py --ignore=test/test_fx.py --ignore=test/test_fx_experimental.py --ignore=test/fx test/

	  # Disabled job, kept for reference: sharded (8 shards) run of test/hf_test.py.
	  # hf_model_tests:
	  #   runs-on: linux.12xlarge
	  #   strategy:
	  #     matrix:
	  #       python-version: ["3.9"]
	  #       shard: ["0", "1", "2", "3", "4", "5", "6", "7"]
	  #   container:
	  #     image: python:${{ matrix.python-version }}

	  #   steps:
	  #     - uses: actions/checkout@v2
	  #     - name: Install dependencies
	  #       run: |
	  #         python -m pip install --upgrade pip
	  #         pip install flake8 pytest pytest-cov pytest-xdist pytest-shard numpy
	  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	  #     - name: Install Hugging Face transformers (main branch)
	  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
	  #     - name: Install pippy
	  #       run: python setup.py install
	  #     # Single thread to avoid OOM
	  #     - name: Test forward only
	  #       run: |
	  #         pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'not HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py
	  #     - name: Test forward and backward
	  #       run: |
	  #         pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py

	  # CPU integration tests: runs the test/local_test_*.py scripts for every
	  # combination of Python version, replicate flag, schedule and checkpoint flag.
	  integration_test_cpu:
	    runs-on: linux.4xlarge
	    strategy:
	      matrix:
	        python-version: ["3.8", "3.9"]
	        replicate: ["0", "1"]
	        schedule: ["FillDrain", "1F1B"]
	        checkpoint: ["0", "1"]
	    env:
	      OMP_NUM_THREADS: "1"
	    container:
	      image: python:${{ matrix.python-version }}

	    steps:
	      - uses: actions/checkout@v2
	      - name: Install dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
	          if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	      # Installs upstream huggingface/transformers from its main branch.
	      - name: Install Hugging Face transformers (main branch)
	        run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
	      - name: Install pippy
	        run: python setup.py install
	      - name: Run forward-only integration test
	        run: python test/local_test_forward.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      - name: Run forward-only-auto-parallel integration test
	        run: python test/local_test_forward_auto_parallel.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      - name: Run forward-loss-backward integration test
	        run: python test/local_test_forward_backward.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      # Invoked without --checkpoint.
	      - name: Run null_coalesce_accumulate integration test
	        run: python test/local_test_null_coalesce_accumulate.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
	      - name: Run PP + DDP test
	        run: python test/local_test_ddp.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      # - name: Run HF BERT forward-only integration test
	      #   run: python test/local_test_forward_hf_bert.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      - name: Run HF GPT2 forward-only integration test
	        run: python test/local_test_forward_hf_gpt2.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}
	      # Invoked without --checkpoint.
	      - name: Run visualizer test
	        run: python test/local_test_visualizer.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
	      # Invoked without --checkpoint.
	      - name: Run auto-split test
	        run: python test/local_test_autosplit.py --replicate ${{ matrix.replicate }} -s ${{ matrix.schedule }}
	      # Invoked without --replicate.
	      - name: Run compile test
	        run: python test/local_test_compile.py -s ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }}

	  # Disabled job, kept for reference: minGPT tracing plus the HF GPT2/BERT/T5
	  # examples and two translation fine-tuning runs.
	  # hf_examples_set1:
	  #   runs-on: linux.12xlarge
	  #   strategy:
	  #     matrix:
	  #       python-version: ["3.9"]
	  #       schedule: ["FillDrain", "1F1B"]
	  #   env:
	  #     OMP_NUM_THREADS: "1"
	  #   container:
	  #     image: python:${{ matrix.python-version }}

	  #   steps:
	  #     - uses: actions/checkout@v2
	  #     - name: Install dependencies
	  #       run: |
	  #         python -m pip install --upgrade pip
	  #         pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
	  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	  #     - name: Install Hugging Face transformers (main branch)
	  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
	  #     - name: Install pippy
	  #       run: python setup.py install
	  #     - name: Test min-GPT
	  #       run: |
	  #         git config --global --add safe.directory /__w/tau/tau
	  #         git submodule update --init test/minGPT
	  #         python test/min_gpt_tracing.py
	  #     - name: Run GPT2 example
	  #       run: python examples/hf/gpt2/pippy_gpt2.py -s ${{ matrix.schedule }}
	  #     - name: Run BERT example
	  #       run: python examples/hf/bert/pippy_bert.py -s ${{ matrix.schedule }}
	  #     - name: Run T5 example
	  #       run: python examples/hf/t5/pippy_t5.py -s ${{ matrix.schedule }}
	  #     - name: "HF Translation: fine-tune T5 model translation English to Romanian"
	  #       run: >
	  #         python examples/hf/translation/run_translation.py --model_name_or_path t5-small --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=1 --pp_group_size=8
	  #     - name: "HF Translation: fine-tune BART model translation English to Romanian"
	  #       run: >
	  #         python examples/hf/translation/run_translation.py --model_name_or_path facebook/bart-base --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=2 --pp_group_size=8

	  # Disabled job, kept for reference: HF language-modeling (CLM, MLM) and
	  # GLUE text-classification fine-tuning examples.
	  # hf_examples_set2:
	  #   runs-on: linux.12xlarge
	  #   strategy:
	  #     matrix:
	  #       python-version: ["3.9"]
	  #       schedule: ["FillDrain", "1F1B"]
	  #   env:
	  #     OMP_NUM_THREADS: "1"
	  #   container:
	  #     image: python:${{ matrix.python-version }}

	  #   steps:
	  #     - uses: actions/checkout@v2
	  #     - name: Install dependencies
	  #       run: |
	  #         python -m pip install --upgrade pip
	  #         pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu
	  #         if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	  #     - name: Install Hugging Face transformers (main branch)
	  #       run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses
	  #     - name: Install pippy
	  #       run: python setup.py install
	  #     - name: "HF Causal Language Modeling: fine-tune GPT-2 on WikiText-2"
	  #       run: python examples/hf/language-modeling/run_clm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-clm --max_steps=3 --overwrite_output_dir
	  #     - name: "HF Masked Language Modeling: fine-tune RoBERTa on WikiText-2"
	  #       run: python examples/hf/language-modeling/run_mlm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path roberta-base --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-mlm --max_steps=3 --overwrite_output_dir
	  #     - name: "HF Text classification: fine-tune BERT on the GLUE benchmark"
	  #       run: python examples/hf/text-classification/run_glue.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path bert-base-cased --task_name mrpc --do_train --do_eval --max_seq_length 128 --per_device_train_batch_size 32 --learning_rate 2e-5 --num_train_epochs 3 --output_dir /tmp/mrpc/ --max_steps=3 --overwrite_output_dir

	  # GPU integration tests: starts a detached container from DOCKER_IMAGE with
	  # the checkout mounted at PIPPY_ROOT, then runs
	  # .github/workflows/pippy_gpu_tests.sh inside it. REPLICATE, SCHEDULE and
	  # OMP_NUM_THREADS are forwarded into the container via `docker run -e`.
	  integration_test_gpu:
	    runs-on: linux.16xlarge.nvidia.gpu
	    strategy:
	      matrix:
	        python-version: ["3.8"]
	        replicate: ["0", "1"]
	        schedule: ["FillDrain", "1F1B"]
	    env:
	      DOCKER_IMAGE: qts8n/cuda-python:devel
	      PIPPY_ROOT: /PiPPy
	      OMP_NUM_THREADS: "1"
	      REPLICATE: ${{ matrix.replicate }}
	      SCHEDULE: ${{ matrix.schedule }}

	    steps:
	      # Best-effort cleanup of a previous run's workspace; failures are ignored.
	      # NOTE(review): the path is hard-coded to this runner's layout — confirm it
	      # still matches the runner's work directory.
	      - name: Clean working directory
	        shell: bash
	        run: |
	          sudo rm -rf /home/ec2-user/actions-runner/_work/PiPPy/PiPPy/* || true
	      - uses: actions/checkout@v2
	      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
	        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
	      - name: Pull Docker image
	        run: |
	          # Try the command up to three times, sleeping 1s then 2s between tries.
	          retry () {
	            "$@" || (sleep 1 && "$@") || (sleep 2 && "$@")
	          }
	          retry docker pull "${DOCKER_IMAGE}"
	      - name: Test docker run
	        run: |
	          set -x
	          # shellcheck disable=SC2086,SC2090
	          container_name=$(docker run \
	            --gpus all \
	            --shm-size=1g --ulimit memlock=-1 \
	            -e OMP_NUM_THREADS \
	            -e REPLICATE \
	            -e SCHEDULE \
	            --tty \
	            --detach \
	            -v "$(pwd):${PIPPY_ROOT}" \
	            -w "${PIPPY_ROOT}" \
	            "${DOCKER_IMAGE}"
	          )
	          # Run GPU tests and return error signal from docker
	          docker exec -t -w "${PIPPY_ROOT}" "${container_name}" bash -c "bash .github/workflows/pippy_gpu_tests.sh; exit \$?"
	      - name: Chown workspace
	        if: always()
	        run: |
	          # Ensure the working directory gets chowned back to the current user
	          docker run --rm -v "$(pwd):${PIPPY_ROOT}" -w "${PIPPY_ROOT}" "${DOCKER_IMAGE}" chown -R "$(id -u):$(id -g)" .
	      - name: Kill containers, clean up images
	        if: always()
	        run: |
	          # ignore expansion of "docker ps -q" since it could be empty
	          # shellcheck disable=SC2046
	          docker stop $(docker ps -q) || true
	          # Prune all of the docker images
	          docker system prune -af

	  # Programming-model examples: runs the TorchDynamo and GSPMD example scripts
	  # on Python 3.9. Unlike the other jobs, requirements are installed with
	  # `--pre`.
	  programming_model_tests:
	    runs-on: linux.4xlarge
	    strategy:
	      matrix:
	        python-version: ["3.9"]
	    container:
	      image: python:${{ matrix.python-version }}

	    steps:
	      - uses: actions/checkout@v2
	      - name: Install dependencies
	        run: |
	          python -m pip install --upgrade pip
	          pip install numpy datasets evaluate scikit-learn sacrebleu
	          if [ -f requirements.txt ]; then pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi
	      - name: Install pippy
	        run: python setup.py install
	      - name: Test PiPPy + Dynamo example
	        run: python examples/TorchDynamo/pippy_dynamo.py
	      - name: Run PiPPy in GSPMD style
	        run: python examples/gspmd/pippy_gspmd.py

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

demo training/inference script #1034

Workflow file

demo training/inference script #1034

Jobs

Run details

Workflow file for this run