From 0bb9927369451f18f011225299b5a2a527e4f006 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 14 Jun 2024 16:39:42 -0700 Subject: [PATCH] Add migration notice [ghstack-poisoned] --- .github/pull_request_template.md | 34 --- .github/workflows/ISSUE_TEMPLATE/bug.yaml | 68 ------ .github/workflows/code-quality.yml.bak | 30 --- .github/workflows/docker/Dockerfile | 46 ---- .github/workflows/gpu_tests.yaml.bak | 108 ---------- .github/workflows/model_tests.yaml | 2 +- .github/workflows/pippy_tests.yaml.bak | 193 ----------------- README.md | 2 +- test/local_test_c10d_ddp.py | 228 -------------------- test/local_test_checkpoint.py | 210 ------------------ test/local_test_null_coalesce_accumulate.py | 128 ----------- test/pippy_wrapper.sh | 12 -- test/run_pipeline_scheduler.sh | 3 - 13 files changed, 2 insertions(+), 1062 deletions(-) delete mode 100644 .github/pull_request_template.md delete mode 100644 .github/workflows/ISSUE_TEMPLATE/bug.yaml delete mode 100644 .github/workflows/code-quality.yml.bak delete mode 100644 .github/workflows/docker/Dockerfile delete mode 100644 .github/workflows/gpu_tests.yaml.bak delete mode 100644 .github/workflows/pippy_tests.yaml.bak delete mode 100644 test/local_test_c10d_ddp.py delete mode 100644 test/local_test_checkpoint.py delete mode 100644 test/local_test_null_coalesce_accumulate.py delete mode 100755 test/pippy_wrapper.sh delete mode 100644 test/run_pipeline_scheduler.sh diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index f1b05c503..000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,34 +0,0 @@ -## Description - -Please read our [CONTRIBUTING.md](https://github.com/pytorch/PiPPy/blob/main/CONTRIBUTING.md) prior to creating your first pull request. - -Please include a summary of the feature or issue being fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. - -Fixes #(issue) - -## Type of change - -Please delete options that are not relevant. - -- [ ] Bug fix (non-breaking change which fixes an issue) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] New feature (non-breaking change which adds functionality) -- [ ] This change requires a documentation update - -## Feature/Issue validation/testing - -Please describe the Unit or Integration tests that you ran to verify your changes and relevant result summary. Provide instructions so it can be reproduced. -Please also list any relevant details for your test configuration. - -- [ ] Test A -Logs for Test A - -- [ ] Test B -Logs for Test B - - -## Checklist: - -- [ ] Have you added tests that prove your fix is effective or that this feature works? -- [ ] Has code been commented, particularly in hard-to-understand areas? -- [ ] Have you made corresponding changes to the documentation? diff --git a/.github/workflows/ISSUE_TEMPLATE/bug.yaml b/.github/workflows/ISSUE_TEMPLATE/bug.yaml deleted file mode 100644 index 9cd2b41e2..000000000 --- a/.github/workflows/ISSUE_TEMPLATE/bug.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: 🐛 Bug Report -description: Create a bug report to help us reproduce and fix the bug - -body: - - type: markdown - attributes: - value: > - #### Before submitting a bug, please make sure the issue hasn't been already reported/addressed by searching through [the - existing and past issues](https://github.com/pytorch/PiPPy/issues), the [README](https://github.com/pytorch/PiPPy/blob/main/README.md). 
- - - type: textarea - attributes: - label: 🐛 Describe the bug - description: | - Please provide a clear and concise description of what the bug is. - - If relevant, please provide a minimal example so that we can reproduce the error by running the code. - - placeholder: | - A clear and concise description of what the bug is. - validations: - required: true - - - type: textarea - attributes: - label: Error logs - description: | - Paste the error logs that indicate the problem - - placeholder: | - Error... - - validations: - required: true - - - type: textarea - attributes: - label: Enviroment - description: | - Please provide infromation about your running enviroment - placeholder: | - PyTorch env info: python -m "torch.utils.collect_env" - validations: - required: true - - - - type: textarea - attributes: - label: Settings - description: | - Please provide your running settings - placeholder: | - Number of Nodes : - Number of GPU per Node: - scheduler : - validations: - required: true - - - type: textarea - attributes: - label: Possible Solution - description: | - Possible fix for them the problem - - - type: markdown - attributes: - value: > - Thanks for contributing 🎉! diff --git a/.github/workflows/code-quality.yml.bak b/.github/workflows/code-quality.yml.bak deleted file mode 100644 index 1f52b1cb6..000000000 --- a/.github/workflows/code-quality.yml.bak +++ /dev/null @@ -1,30 +0,0 @@ -name: Code Quality Checks - -on: - push: - branches: - - main - pull_request: - paths-ignore: - - 'docs/**' - - '**.md' - -jobs: - build: - runs-on: ubuntu-latest - name: Lints - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: "3.10.6" - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -r docs/requirements.txt - pip install types-docutils types-setuptools tqdm types-tabulate - if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - pip install "black<23" pylint==v3.0.0a5 mypy==v0.981 flake8==3.8.2 pyre-check==0.9.15 ufmt==2.1.0 - - name: Static Analysis Checks - if: always() - run: ./check.sh diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile deleted file mode 100644 index a507a8d83..000000000 --- a/.github/workflows/docker/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -# Using cuda 11.3 -FROM nvidia/cuda:11.3.1-devel-ubuntu18.04 - -# nvidia cuda 11.3 paths -ENV LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} -ENV LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda-11.3/lib64 - -# ensure local python is preferred over distribution python -ENV PATH /usr/local/bin:$PATH - -ENV LANG C.UTF-8 - -# Ignore `tzdata` asking questions -ENV DEBIAN_FRONTEND=noninteractive - -RUN echo "US/Pacific" > /etc/timezone \ - && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \ - && apt update && apt upgrade -y \ - && apt-get -y install build-essential checkinstall wget git \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev \ - libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev - -# Set Python Version -ENV PYTHON_VERSION 3.9.12 -ENV PYTHON_COMMAND 3.9 - -# Install Python from source. 
-RUN cd /opt \ - && wget https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tgz \ - && tar xzf Python-$PYTHON_VERSION.tgz \ - && cd Python-$PYTHON_VERSION \ - && ./configure --enable-optimizations \ - && make altinstall \ - && ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python \ - && ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python3 \ - && ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip \ - && ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip3 \ - && cd / - -# Install python libraries needed for CI test. -RUN pip3 install --upgrade pip \ - && pip3 config set global.progress_bar off \ - && pip3 install flake8 pytest pytest-cov pytest-shard numpy expecttest hypothesis pyyaml - -LABEL versin="1.0.2" -LABEL description="Build docker image for ubuntu Linux OS with cuda 11.3 and Python." \ No newline at end of file diff --git a/.github/workflows/gpu_tests.yaml.bak b/.github/workflows/gpu_tests.yaml.bak deleted file mode 100644 index 76a8b1cbd..000000000 --- a/.github/workflows/gpu_tests.yaml.bak +++ /dev/null @@ -1,108 +0,0 @@ -name: GPU Tests - -on: - push: - branches: - - main - pull_request: - paths: - - '.github/workflows/gpu_tests.yaml' - - 'pippy/**' - - 'test/**' - - 'examples/**' - - '!docs/**' - - '!**.md' - - 'requirements.txt' - -concurrency: - # Cancel CI on previous commit when a new commit is pushed to the same branch - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -jobs: - forward_tests_4gpu: - runs-on: linux.g5.12xlarge.nvidia.gpu - strategy: - matrix: - python-version: ['3.10'] - steps: - - name: Check out repo - uses: actions/checkout@v3 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: ${{ matrix.python-version }} - - name: Activate conda env - run: conda activate test - - name: Install dependencies - run: | - pip install numpy expecttest - pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html - - name: Install pippy - run: python setup.py install - - name: Run forward-only integration test - run: torchrun --nproc-per-node 4 test/test_fwd.py - - name: Run auto-split test - run: torchrun --nproc-per-node 4 test/test_autosplit.py - - name: Test skip connection support - run: torchrun --nproc-per-node 4 test/test_skip_conn.py - - name: Test trace on different device - run: torchrun --nproc-per-node 2 test/test_cpu_init.py - - name: Test Composability - run: python test/test_composability.py - - name: Run example - run: torchrun --nproc-per-node 3 examples/basic/example.py - - name: Run example with manual stage - run: torchrun --nproc-per-node 3 examples/basic/example_manual_stage.py - - name: Run training example - run: torchrun --nproc-per-node 3 examples/basic/example_train.py - - name: Install Transformers for real model tests - run: pip install transformers - - name: Run GPT2 - run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py - - name: Test CPU init + GPU run - run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py - # - name: Run T5 - # run: torchrun --nproc-per-node 2 examples/huggingface/pippy_t5.py - # - name: Run BERT - # run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py - - backward_tests_4gpu: - runs-on: linux.g5.12xlarge.nvidia.gpu - strategy: - 
matrix: - python-version: ['3.10'] - schedule: ["gpipe", "1f1b"] - steps: - - name: Check out repo - uses: actions/checkout@v3 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: ${{ matrix.python-version }} - - name: Activate conda env - run: conda activate test - - name: Install dependencies - run: - pip install numpy - pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html - - name: Install pippy - run: python setup.py install - - name: Run forward-backward test - run: torchrun --nproc-per-node 4 test/test_bwd.py --schedule ${{ matrix.schedule }} - - name: Run optimizer test - run: torchrun --nproc-per-node 4 test/test_optim.py --schedule ${{ matrix.schedule }} - - name: Test gradient equivalence - run: torchrun --nproc-per-node 4 test/test_grad.py --schedule ${{ matrix.schedule }} - - name: Test interleaving schedules - run: torchrun --nproc-per-node 4 test/test_interleave.py --schedule ${{ matrix.schedule }} diff --git a/.github/workflows/model_tests.yaml b/.github/workflows/model_tests.yaml index f9b8d2105..541eba686 100644 --- a/.github/workflows/model_tests.yaml +++ b/.github/workflows/model_tests.yaml @@ -40,7 +40,7 @@ jobs: run: conda activate test - name: Install dependencies run: | - pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 - name: Install Transformers for getting models run: pip install transformers # - name: Install Diffusers for getting models diff --git a/.github/workflows/pippy_tests.yaml.bak b/.github/workflows/pippy_tests.yaml.bak deleted file mode 100644 index 635ca171f..000000000 --- a/.github/workflows/pippy_tests.yaml.bak +++ /dev/null @@ -1,193 +0,0 @@ -name: CPU tests - -on: - push: - branches: - - main - pull_request: - paths: - - '.github/workflows/pippy_tests.yaml' - - 'pippy/**' - - 'test/**' - - 'examples/**' - - '!docs/**' - - '!**.md' - - 'requirements.txt' - -concurrency: - # Cancel CI on previous commit when a new commit is pushed to the same branch - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - - # pytest_tests: - # runs-on: linux.4xlarge - # strategy: - # matrix: - # python-version: ["3.8", "3.9"] - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov pytest-xdist numpy - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pippy - # run: "python setup.py install" - # - name: Test with pytest - # run: | - # pytest --cov=pippy test/ - - # hf_model_tests: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # shard: ["0", "1", "2", "3", "4", "5", "6", "7"] - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov pytest-xdist pytest-shard numpy - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links 
https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # # Single thread to avoid OOM - # - name: Test forward only - # run: | - # pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'not HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py - # - name: Test forward and backward - # run: | - # pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py - - Unit_tests: - runs-on: linux.4xlarge - strategy: - matrix: - python-version: ["3.9"] - env: - OMP_NUM_THREADS: "1" - container: - image: python:${{ matrix.python-version }} - - steps: - - uses: actions/checkout@v2 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu expecttest - if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - - name: Install pippy - run: "python setup.py install" - # Util tests - - name: Test stage backward helper function - run: python test/test_stage_backward.py - - name: Test microbatch function - run: python test/test_microbatch.py - # IR tests - - name: Test forward pipe generation - run: python test/test_pipe.py - - name: Test backward pipe generation - run: python test/test_pipe_bwd.py - - name: Test unflatten - run: python test/test_unflatten.py - - name: Test Transformer - run: python test/test_transformer.py - - name: Test args chunking spec - run: python test/test_chunkspec.py - # Runtime tests - - name: Test pipeline schedule - run: python test/test_pipeline_schedule.py - # - name: Run null_coalesce_accumulate integration test - # run: python test/local_test_null_coalesce_accumulate.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} - # - name: Run PP + DDP test - # run: python test/local_test_ddp.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - #- name: Run HF BERT forward-only integration test - # run: python test/local_test_forward_hf_bert.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - # - name: Run HF GPT2 forward-only integration test - # run: python test/local_test_forward_hf_gpt2.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - # - name: Run auto-split test - # run: python test/local_test_autosplit.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} - - # hf_examples_set1: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # schedule: ["FillDrain", "1F1B"] - # env: - # OMP_NUM_THREADS: "1" - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main 
sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # - name: Test min-GPT - # run: | - # git config --global --add safe.directory /__w/tau/tau - # git submodule update --init test/minGPT - # python test/min_gpt_tracing.py - # - name: Run GPT2 example - # run: python examples/hf/gpt2/pippy_gpt2.py --schedule ${{ matrix.schedule }} - # - name: Run BERT example - # run: python examples/hf/bert/pippy_bert.py --schedule ${{ matrix.schedule }} - # - name: Run T5 example - # run: python examples/hf/t5/pippy_t5.py --schedule ${{ matrix.schedule }} - # - name: "HF Translation: fine-tune T5 model translation English to Romanian" - # run: > - # python examples/hf/translation/run_translation.py --model_name_or_path t5-small --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=1 --pp_group_size=8 - # - name: "HF Translation: fine-tune BART model translation English to Romanian" - # run: > - # python examples/hf/translation/run_translation.py --model_name_or_path facebook/bart-base --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=2 --pp_group_size=8 - - # hf_examples_set2: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # schedule: ["FillDrain", "1F1B"] - # env: - # OMP_NUM_THREADS: "1" - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # - name: "HF Causal Language Modeling: fine-tune GPT-2 on WikiText-2" - # run: python examples/hf/language-modeling/run_clm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-clm --max_steps=3 --overwrite_output_dir - # - name: "HF Masked Language Modeling: fine-tune RoBERTa on WikiText-2" - # run: python examples/hf/language-modeling/run_mlm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path roberta-base --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-mlm --max_steps=3 --overwrite_output_dir - # - name: "HF Text classification: fine-tune BERT on the GLUE benchmark" - # run: python examples/hf/text-classification/run_glue.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path bert-base-cased --task_name mrpc --do_train --do_eval --max_seq_length 128 
--per_device_train_batch_size 32 --learning_rate 2e-5 --num_train_epochs 3 --output_dir /tmp/mrpc/ --max_steps=3 --overwrite_output_dir - - # TODO: - # Update GPU test to use template in: - # https://github.com/pytorch/test-infra/wiki/Writing-generic-CI-jobs diff --git a/README.md b/README.md index 889f4a528..5c6592b69 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PiPPy: Pipeline Parallelism for PyTorch > [!NOTE] -> PiPPy is currently in alpha state and under extensive development. While the 0.2.0 release is available on [PyPI](https://pypi.org/project/torchpippy/), the GitHub version may have updated or new APIs. For up-to-date examples that work with the GitHub version, please refer to programs run in our [CI](.github/workflows/gpu_tests.yaml). +> PiPPy has been migrated into [PyTorch](https://github.com/pytorch/pytorch) as a subpackage: [`torch.distributed.pipelining`](https://github.com/pytorch/pytorch/tree/main/torch/distributed/pipelining). You can find the detailed documentation [here](https://pytorch.org/docs/main/distributed.pipelining.html). The current repo mainly serves as a land of [examples](examples/). The PiPPy library code will be removed. Please use the APIs in `torch.distributed.pipelining` instead. Thank you! [**Why PiPPy?**](#why-pippy) | [**Install guide**](#install) diff --git a/test/local_test_c10d_ddp.py b/test/local_test_c10d_ddp.py deleted file mode 100644 index ccc17ecc9..000000000 --- a/test/local_test_c10d_ddp.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates -import argparse -import os -import unittest - -import pippy - -import torch -import torch.distributed as dist -from pippy._IR import pipe_split -from torch.nn.parallel import DistributedDataParallel - - -d_hid = 512 -chunk_size = 256 - - -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.lin0 = torch.nn.Linear(d_hid, d_hid) - self.lin1 = torch.nn.Linear(d_hid, d_hid) - self.loss_fn = torch.nn.MSELoss(reduction="sum") - - def forward(self, x, target): - x = torch.mm(x, self.mm_param0) - skip_connection = x - x = torch.relu(x) - pipe_split() - x = torch.mm(x, self.mm_param1) - x = self.lin0(x) - pipe_split() - x = torch.relu(x) - x = x + skip_connection - x = torch.mm(x, self.mm_param2) - pipe_split() - x = self.lin1(x) - x = torch.relu(x) - loss = self.loss_fn(x, target) - return {"loss": loss} - - -def create_model() -> torch.nn.Module: - # Fix a seed such that models are created the same - torch.manual_seed(42) - ec = ExampleCode() - return ec - - -# Get process group for ranks in a pipeline -def get_pp_subgroup(args): - my_pp_rank = args.rank // args.dp_group_size - my_dp_rank = args.rank % args.dp_group_size - for dp_rank in range(0, args.dp_group_size): - pp_group_ranks = list( - range(dp_rank, args.world_size, args.dp_group_size) - ) - pp_group = dist.new_group(ranks=pp_group_ranks) - if dp_rank == my_dp_rank: - my_pp_group = pp_group - print(f"Rank {args.rank} done getting pp group") - return my_pp_group, my_pp_rank - - -# Get DP process group for ranks with the same stage -def get_dp_subgroup(args): - my_pp_rank = args.rank // args.dp_group_size - my_dp_rank = args.rank % args.dp_group_size - for pp_rank in range(0, args.pp_group_size): - dp_group_ranks = list( - range( - pp_rank * args.dp_group_size, (pp_rank + 1) * 
args.dp_group_size - ) - ) - dp_group = dist.new_group(ranks=dp_group_ranks) - if pp_rank == my_pp_rank: - my_dp_group = dp_group - print(f"Rank {args.rank} done getting dp group") - return my_dp_group, my_dp_rank - - -# Main program -def run_worker(args): - ec_with_loss = create_model() - ec_with_loss.to(args.device) - - input = torch.randn(args.chunks * chunk_size, d_hid, device=args.device) - target = torch.randn(args.chunks * chunk_size, d_hid, device=args.device) - - # Get DP and PP sub process groups - dp_group, dp_rank = get_dp_subgroup(args) - pp_group, pp_rank = get_pp_subgroup(args) - - stage = pippy.compile_stage( - ec_with_loss, - pp_rank, - args.pp_group_size, - args.chunks, - args.device, - pp_group, - [input, target], - ) - - # Form a map from original qualname to param for equivalence check later - pipe_params = {} - for qualname, param in stage.submod.named_parameters(): - origin_name = stage.submod.remap_qualname(qualname) - pipe_params[origin_name] = param - - # Wrap stage module with DDP - stage.submod = DistributedDataParallel( - stage.submod, - process_group=dp_group, - ) - - # Run - if pp_rank == 0: - stage(input) - elif pp_rank == args.pp_group_size - 1: - pipe_out = stage(target) - else: - stage() - - # Form a map from original qualname to gradient for equivalence check later - pipe_grads = {} - for origin_name, pipe_param in pipe_params.items(): - pipe_grads[origin_name] = pipe_param.grad - - # DDP reference model - ref_mod = create_model() - ref_mod.to(args.device) - ddp_ref_mod = DistributedDataParallel( - ref_mod, - process_group=dp_group, - ) - - # DDP forward and backward - ddp_out = ddp_ref_mod(input, target) - ddp_out["loss"].backward() - - # Compare pipeline output and DDP output - if pp_rank == args.pp_group_size - 1: - torch.testing.assert_close(pipe_out, ddp_out) - print("Output equivalence test passed") - - # Compare pipeline gradient and DDP gradient - for origin_name, pipe_grad in pipe_grads.items(): - ddp_param = ddp_ref_mod.module.get_parameter(origin_name) - if dp_rank == 0: - print(f"Checking gradient of {origin_name}") - # Since we use synthetic input and output, the gradients generated are - # large. 
Hence we need to manually set relative tolerance - torch.testing.assert_close( - pipe_grad, - ddp_param.grad, - rtol=7e-2, - atol=1e-5, - ) - - print("Gradient equivalence test passed") - - -def main(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 8)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--chunks", - type=int, - default=4, - ) - args = parser.parse_args(args) - - # pp group size must match with pipe_split's in model - args.pp_group_size = 4 - # world size must be multiple of pp group size - assert args.world_size % args.pp_group_size == 0 - args.dp_group_size = args.world_size // args.pp_group_size - if args.rank == 0: - print( - f"PP group size = {args.pp_group_size}, DP group size = {args.dp_group_size}" - ) - - if args.cuda: - dev_id = args.rank % torch.cuda.device_count() - args.device = torch.device(f"cuda:{dev_id}") - else: - args.device = torch.device("cpu") - - # Init process group - backend = "nccl" if args.cuda else "gloo" - dist.init_process_group( - backend=backend, - rank=args.rank, - world_size=args.world_size, - ) - - run_worker(args) - - -if __name__ == "__main__": - main() - - -class LocalTestC10dDDPTest(unittest.TestCase): - def test_c10d_ddp(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/local_test_checkpoint.py b/test/local_test_checkpoint.py deleted file mode 100644 index f8b893226..000000000 --- a/test/local_test_checkpoint.py +++ /dev/null @@ -1,210 +0,0 @@ -import argparse -import json -import os -import shutil -import unittest -from copy import deepcopy -from typing import List - -import torch - -import torch.distributed as dist -import torch.optim as optim - -from pippy._IR import pipe_split, TrivialLossWrapper -from pippy.compile import compile_stage -from pippy.utilities.hf_checkpoint import load_checkpoint, save_checkpoint - - -DEFAULT_FILENAME = "pytorch_model.bin.index.json" -CKPT_DIR = "test_ckpts" -WEIGHT_MAP = set( - [ - "module.mm_param0", - "module.mm_param1", - "module.mm_param2", - "module.lin0.weight", - "module.lin0.bias", - "module.lin1.weight", - "module.lin1.bias", - ] -) -D_HID = 512 -CHUNK_SIZE = 256 - - -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.mm_param1 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.mm_param2 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.lin0 = torch.nn.Linear(D_HID, D_HID) - self.lin1 = torch.nn.Linear(D_HID, D_HID) - - def forward(self, x): - x = torch.mm(x, self.mm_param0) - skip_connection = x - x = torch.relu(x) - pipe_split() - x = torch.mm(x, self.mm_param1) - x = self.lin0(x) - pipe_split() - x = torch.relu(x) - x = x + skip_connection - x = torch.mm(x, self.mm_param2) - pipe_split() - x = self.lin1(x) - x = torch.relu(x) - return x - - -def run_worker(args: List[str | int]) -> None: - ec = ExampleCode() - loss_fn = torch.nn.MSELoss(reduction="sum") - ec_with_loss = TrivialLossWrapper(ec, loss_fn) - ec_with_loss.to(args.device) - - ec_x = torch.randn(args.chunks * 
CHUNK_SIZE, D_HID, device=args.device) - target = torch.randn(args.chunks * CHUNK_SIZE, D_HID, device=args.device) - - stage = compile_stage( - ec_with_loss, - args.rank, - args.world_size, - args.chunks, - args.device, - None, - [ec_x, target], - ) - - # Create an optimizer for stage submodule's parameters - optimizer = optim.SGD(stage.submod.parameters(), lr=1e-3, momentum=0.9) - - # first run - # Zero gradients - optimizer.zero_grad() - - # Run - if args.rank == 0: - stage(ec_x) - elif args.rank == args.world_size - 1: - stage(target) - else: - stage() - - # Take an optimization step - optimizer.step() - ref_state_dict = deepcopy(stage.submod.state_dict()) - ref_optim_state_dict = deepcopy(optimizer.state_dict()) - - save_checkpoint(stage, CKPT_DIR, optimizer) - - # save index file in rank 0 - if args.rank == 0: - filepath = os.path.join(CKPT_DIR, DEFAULT_FILENAME) - with open(filepath) as f: - content = f.read() - data = json.loads(content) - - # check file written on disk to given location - assert os.path.exists(filepath) - - # check total_size is correct - size_calc = sum(param.numel() for param in ec.parameters()) * 4 - assert size_calc == data["metadata"]["total_size"] - - # check all params present - assert len(data["weight_map"]) == 7 - for param in WEIGHT_MAP: - assert param in data["weight_map"] - - # second run - # Zero gradients - optimizer.zero_grad() - - # Run - if args.rank == 0: - stage(ec_x) - elif args.rank == args.world_size - 1: - stage(target) - else: - stage() - - # Take an optimization step - optimizer.step() - - # new api - # after index file has been written, load_checkpoint will read it - if os.path.exists(os.path.join(CKPT_DIR, DEFAULT_FILENAME)): - mod, optimizer = load_checkpoint( - stage.submod, - os.path.join(CKPT_DIR, DEFAULT_FILENAME), - optim=optimizer, - device=args.device, - ) - - torch.testing.assert_close(mod.state_dict(), ref_state_dict) - torch.testing.assert_close(optimizer.state_dict(), ref_optim_state_dict) - - dist.barrier() - print(f"Rank {args.rank} completes") - - # remove test ckpt directory in last rank - if args.rank == args.world_size - 1: - shutil.rmtree(CKPT_DIR) - - -def main(args: List[str | int] = None) -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 4)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--chunks", - type=int, - default=4, - ) - args = parser.parse_args(args) - - if args.cuda: - dev_id = args.rank % torch.cuda.device_count() - args.device = torch.device(f"cuda:{dev_id}") - else: - args.device = torch.device("cpu") - - # init process group - backend = "nccl" if args.cuda else "gloo" - dist.init_process_group( - backend=backend, - rank=args.rank, - world_size=args.world_size, - ) - - run_worker(args) - - -if __name__ == "__main__": - main() - - -class LocalCheckpointTest(unittest.TestCase): - def test_index_file(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/local_test_null_coalesce_accumulate.py b/test/local_test_null_coalesce_accumulate.py deleted file mode 100644 index 227e906cb..000000000 --- 
a/test/local_test_null_coalesce_accumulate.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates -import argparse -import os -import unittest - -import pippy.fx - -import torch -from pippy import run_pippy -from pippy._IR import ( - _null_coalesce_accumulate, - pipe_split, - pipeline, - TrivialLossWrapper, -) -from pippy.PipelineDriver import ( - PipelineDriver1F1B, - PipelineDriverBase, - PipelineDriverFillDrain, -) - -PROFILING_ENABLED = True -CHECK_NUMERIC_EQUIVALENCE = True - -schedules = { - "FillDrain": PipelineDriverFillDrain, - "1F1B": PipelineDriver1F1B, -} - -pippy.fx.Tracer.proxy_buffer_attributes = True - - -def run_master(_, args): - all_ranks = list(range(1, args.world_size)) # exclude master rank = 0 - chunks = len(all_ranks) - bs = 4 * chunks - hid_dim = 50 - - class Code(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(hid_dim, hid_dim) - - def forward(self, x): - x = self.linear(x) - pipe_split() - y = torch.relu(x) - pipe_split() - z = torch.sigmoid(x) - pipe_split() - return y + z - - c = Code() - c.train() - mse_loss = torch.nn.MSELoss() - wrapper = TrivialLossWrapper(c, mse_loss) - accum_pipe = pipeline(wrapper) - assert 4 == len(list(accum_pipe.split_gm.children())) - assert any( - n.target == _null_coalesce_accumulate - for n in accum_pipe.split_gm.graph.nodes - ) - - input = torch.randn(bs, hid_dim) - target = torch.randn(bs, hid_dim) - accum_pipe(input, target) - - pipe_driver: PipelineDriverBase = schedules[args.schedule]( - accum_pipe, - chunks, - args.world_size - 1, - all_ranks=all_ranks, - _debug_mask_minibatches=True, - _record_mem_dumps=bool(args.record_mem_dumps), - checkpoint=bool(args.checkpoint), - ) - - pipe_driver(input, target) - - -def main(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 5)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "-s", - "--schedule", - type=str, - default=list(schedules.keys())[0], - choices=schedules.keys(), - ) - parser.add_argument( - "--replicate", type=int, default=int(os.getenv("REPLICATE", "0")) - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--record_mem_dumps", type=int, default=0, choices=[0, 1] - ) - parser.add_argument("--checkpoint", type=int, default=0, choices=[0, 1]) - args = parser.parse_args(args) - - run_pippy(run_master, args) - - -if __name__ == "__main__": - main() - - -class LocalTestNullCoalesceAccumulateTest(unittest.TestCase): - def test_null_coalesce_accumulate(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/pippy_wrapper.sh b/test/pippy_wrapper.sh deleted file mode 100755 index 05b1de4a2..000000000 --- a/test/pippy_wrapper.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. 
and affiliates
-
-export MASTER_PORT=29500
-export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
-export LOCAL_RANK=${SLURM_LOCALID}
-# Optional: depending on whether the application wants each process to see only 1 GPU or all GPUs
-#export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
-export WORLD_SIZE=${SLURM_NTASKS}
-export RANK=${SLURM_PROCID}
-
-python -u "$@" 2>&1
diff --git a/test/run_pipeline_scheduler.sh b/test/run_pipeline_scheduler.sh
deleted file mode 100644
index 24fee401a..000000000
--- a/test/run_pipeline_scheduler.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# To run samples:
-# launcher for testing pipeline schedules
-torchrun --nnodes=1 --nproc_per_node 8 --rdzv_endpoint="localhost:59124" test_pipeline_schedule.py
\ No newline at end of file
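
The migration notice added to the README above points users at `torch.distributed.pipelining`. As a rough sketch of what the replacement API looks like (assuming PyTorch 2.4+ or a recent nightly; the toy model, split point, and sizes below are illustrative and not taken from this patch), a two-stage pipeline can be built and run roughly as follows:

```python
# Rough sketch of the torch.distributed.pipelining API referenced by the
# migration notice. Assumes PyTorch 2.4+ / a recent nightly; the toy model,
# split point, and sizes are illustrative only.
import os

import torch
import torch.distributed as dist
from torch.distributed.pipelining import ScheduleGPipe, SplitPoint, pipeline


class Toy(torch.nn.Module):
    def __init__(self, d_hid: int = 512):
        super().__init__()
        self.lin0 = torch.nn.Linear(d_hid, d_hid)
        self.lin1 = torch.nn.Linear(d_hid, d_hid)

    def forward(self, x):
        return self.lin1(torch.relu(self.lin0(x)))


rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
dist.init_process_group(rank=rank, world_size=world_size)

n_microbatches = 4
full_batch = torch.randn(32, 512)
example_microbatch = full_batch.chunk(n_microbatches)[0]

# Trace the model and split it into two stages before lin1
# (this replaces pippy.pipeline / pipe_split from this repo).
pipe = pipeline(
    Toy(),
    mb_args=(example_microbatch,),
    split_spec={"lin1": SplitPoint.BEGINNING},
)

# Materialize the stage owned by this rank and run a GPipe schedule.
stage = pipe.build_stage(rank, device=torch.device("cpu"))
schedule = ScheduleGPipe(stage, n_microbatches=n_microbatches)

if rank == 0:
    schedule.step(full_batch)  # first stage feeds the full batch
else:
    out = schedule.step()      # last stage returns the assembled output
```

Launched with, for example, `torchrun --nproc-per-node 2 sketch.py`, mirroring how the removed GPU workflow invoked `examples/basic/example.py`.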