From 0bb9927369451f18f011225299b5a2a527e4f006 Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Fri, 14 Jun 2024 16:39:42 -0700 Subject: [PATCH] Add migration notice [ghstack-poisoned] --- .github/pull_request_template.md | 34 --- .github/workflows/ISSUE_TEMPLATE/bug.yaml | 68 ------ .github/workflows/code-quality.yml.bak | 30 --- .github/workflows/docker/Dockerfile | 46 ---- .github/workflows/gpu_tests.yaml.bak | 108 ---------- .github/workflows/model_tests.yaml | 2 +- .github/workflows/pippy_tests.yaml.bak | 193 ----------------- README.md | 2 +- test/local_test_c10d_ddp.py | 228 -------------------- test/local_test_checkpoint.py | 210 ------------------ test/local_test_null_coalesce_accumulate.py | 128 ----------- test/pippy_wrapper.sh | 12 -- test/run_pipeline_scheduler.sh | 3 - 13 files changed, 2 insertions(+), 1062 deletions(-) delete mode 100644 .github/pull_request_template.md delete mode 100644 .github/workflows/ISSUE_TEMPLATE/bug.yaml delete mode 100644 .github/workflows/code-quality.yml.bak delete mode 100644 .github/workflows/docker/Dockerfile delete mode 100644 .github/workflows/gpu_tests.yaml.bak delete mode 100644 .github/workflows/pippy_tests.yaml.bak delete mode 100644 test/local_test_c10d_ddp.py delete mode 100644 test/local_test_checkpoint.py delete mode 100644 test/local_test_null_coalesce_accumulate.py delete mode 100755 test/pippy_wrapper.sh delete mode 100644 test/run_pipeline_scheduler.sh diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md deleted file mode 100644 index f1b05c503..000000000 --- a/.github/pull_request_template.md +++ /dev/null @@ -1,34 +0,0 @@ -## Description - -Please read our [CONTRIBUTING.md](https://github.com/pytorch/PiPPy/blob/main/CONTRIBUTING.md) prior to creating your first pull request. - -Please include a summary of the feature or issue being fixed. Please also include relevant motivation and context. List any dependencies that are required for this change. - -Fixes #(issue) - -## Type of change - -Please delete options that are not relevant. - -- [ ] Bug fix (non-breaking change which fixes an issue) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] New feature (non-breaking change which adds functionality) -- [ ] This change requires a documentation update - -## Feature/Issue validation/testing - -Please describe the Unit or Integration tests that you ran to verify your changes and relevant result summary. Provide instructions so it can be reproduced. -Please also list any relevant details for your test configuration. - -- [ ] Test A -Logs for Test A - -- [ ] Test B -Logs for Test B - - -## Checklist: - -- [ ] Have you added tests that prove your fix is effective or that this feature works? -- [ ] Has code been commented, particularly in hard-to-understand areas? -- [ ] Have you made corresponding changes to the documentation? diff --git a/.github/workflows/ISSUE_TEMPLATE/bug.yaml b/.github/workflows/ISSUE_TEMPLATE/bug.yaml deleted file mode 100644 index 9cd2b41e2..000000000 --- a/.github/workflows/ISSUE_TEMPLATE/bug.yaml +++ /dev/null @@ -1,68 +0,0 @@ -name: 🐛 Bug Report -description: Create a bug report to help us reproduce and fix the bug - -body: - - type: markdown - attributes: - value: > - #### Before submitting a bug, please make sure the issue hasn't been already reported/addressed by searching through [the - existing and past issues](https://github.com/pytorch/PiPPy/issues), the [README](https://github.com/pytorch/PiPPy/blob/main/README.md). 
- - - type: textarea - attributes: - label: 🐛 Describe the bug - description: | - Please provide a clear and concise description of what the bug is. - - If relevant, please provide a minimal example so that we can reproduce the error by running the code. - - placeholder: | - A clear and concise description of what the bug is. - validations: - required: true - - - type: textarea - attributes: - label: Error logs - description: | - Paste the error logs that indicate the problem - - placeholder: | - Error... - - validations: - required: true - - - type: textarea - attributes: - label: Enviroment - description: | - Please provide infromation about your running enviroment - placeholder: | - PyTorch env info: python -m "torch.utils.collect_env" - validations: - required: true - - - - type: textarea - attributes: - label: Settings - description: | - Please provide your running settings - placeholder: | - Number of Nodes : - Number of GPU per Node: - scheduler : - validations: - required: true - - - type: textarea - attributes: - label: Possible Solution - description: | - Possible fix for them the problem - - - type: markdown - attributes: - value: > - Thanks for contributing 🎉! diff --git a/.github/workflows/code-quality.yml.bak b/.github/workflows/code-quality.yml.bak deleted file mode 100644 index 1f52b1cb6..000000000 --- a/.github/workflows/code-quality.yml.bak +++ /dev/null @@ -1,30 +0,0 @@ -name: Code Quality Checks - -on: - push: - branches: - - main - pull_request: - paths-ignore: - - 'docs/**' - - '**.md' - -jobs: - build: - runs-on: ubuntu-latest - name: Lints - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: "3.10.6" - - name: Install dependencies - run: | - pip install --upgrade pip - pip install -r docs/requirements.txt - pip install types-docutils types-setuptools tqdm types-tabulate - if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - pip install "black<23" pylint==v3.0.0a5 mypy==v0.981 flake8==3.8.2 pyre-check==0.9.15 ufmt==2.1.0 - - name: Static Analysis Checks - if: always() - run: ./check.sh diff --git a/.github/workflows/docker/Dockerfile b/.github/workflows/docker/Dockerfile deleted file mode 100644 index a507a8d83..000000000 --- a/.github/workflows/docker/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -# Using cuda 11.3 -FROM nvidia/cuda:11.3.1-devel-ubuntu18.04 - -# nvidia cuda 11.3 paths -ENV LD_LIBRARY_PATH=/usr/local/cuda-11.3/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} -ENV LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda-11.3/lib64 - -# ensure local python is preferred over distribution python -ENV PATH /usr/local/bin:$PATH - -ENV LANG C.UTF-8 - -# Ignore `tzdata` asking questions -ENV DEBIAN_FRONTEND=noninteractive - -RUN echo "US/Pacific" > /etc/timezone \ - && ln -fs /usr/share/zoneinfo/America/Los_Angeles /etc/localtime \ - && apt update && apt upgrade -y \ - && apt-get -y install build-essential checkinstall wget git \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev \ - libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev libffi-dev zlib1g-dev - -# Set Python Version -ENV PYTHON_VERSION 3.9.12 -ENV PYTHON_COMMAND 3.9 - -# Install Python from source. 
-RUN cd /opt \ - && wget https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tgz \ - && tar xzf Python-$PYTHON_VERSION.tgz \ - && cd Python-$PYTHON_VERSION \ - && ./configure --enable-optimizations \ - && make altinstall \ - && ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python \ - && ln -fs /usr/local/bin/python$PYTHON_COMMAND /usr/bin/python3 \ - && ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip \ - && ln -fs /usr/local/bin/pip$PYTHON_COMMAND /usr/bin/pip3 \ - && cd / - -# Install python libraries needed for CI test. -RUN pip3 install --upgrade pip \ - && pip3 config set global.progress_bar off \ - && pip3 install flake8 pytest pytest-cov pytest-shard numpy expecttest hypothesis pyyaml - -LABEL versin="1.0.2" -LABEL description="Build docker image for ubuntu Linux OS with cuda 11.3 and Python." \ No newline at end of file diff --git a/.github/workflows/gpu_tests.yaml.bak b/.github/workflows/gpu_tests.yaml.bak deleted file mode 100644 index 76a8b1cbd..000000000 --- a/.github/workflows/gpu_tests.yaml.bak +++ /dev/null @@ -1,108 +0,0 @@ -name: GPU Tests - -on: - push: - branches: - - main - pull_request: - paths: - - '.github/workflows/gpu_tests.yaml' - - 'pippy/**' - - 'test/**' - - 'examples/**' - - '!docs/**' - - '!**.md' - - 'requirements.txt' - -concurrency: - # Cancel CI on previous commit when a new commit is pushed to the same branch - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -defaults: - run: - shell: bash -l -eo pipefail {0} - -jobs: - forward_tests_4gpu: - runs-on: linux.g5.12xlarge.nvidia.gpu - strategy: - matrix: - python-version: ['3.10'] - steps: - - name: Check out repo - uses: actions/checkout@v3 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: ${{ matrix.python-version }} - - name: Activate conda env - run: conda activate test - - name: Install dependencies - run: | - pip install numpy expecttest - pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html - - name: Install pippy - run: python setup.py install - - name: Run forward-only integration test - run: torchrun --nproc-per-node 4 test/test_fwd.py - - name: Run auto-split test - run: torchrun --nproc-per-node 4 test/test_autosplit.py - - name: Test skip connection support - run: torchrun --nproc-per-node 4 test/test_skip_conn.py - - name: Test trace on different device - run: torchrun --nproc-per-node 2 test/test_cpu_init.py - - name: Test Composability - run: python test/test_composability.py - - name: Run example - run: torchrun --nproc-per-node 3 examples/basic/example.py - - name: Run example with manual stage - run: torchrun --nproc-per-node 3 examples/basic/example_manual_stage.py - - name: Run training example - run: torchrun --nproc-per-node 3 examples/basic/example_train.py - - name: Install Transformers for real model tests - run: pip install transformers - - name: Run GPT2 - run: torchrun --nproc-per-node 4 examples/huggingface/pippy_gpt2.py - - name: Test CPU init + GPU run - run: torchrun --nproc-per-node 4 examples/cpu_init/gpt2_cpu_init.py - # - name: Run T5 - # run: torchrun --nproc-per-node 2 examples/huggingface/pippy_t5.py - # - name: Run BERT - # run: torchrun --nproc-per-node 4 examples/huggingface/pippy_bert.py - - backward_tests_4gpu: - runs-on: linux.g5.12xlarge.nvidia.gpu - strategy: - 
matrix: - python-version: ['3.10'] - schedule: ["gpipe", "1f1b"] - steps: - - name: Check out repo - uses: actions/checkout@v3 - - name: Setup conda env - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniconda-version: "latest" - activate-environment: test - python-version: ${{ matrix.python-version }} - - name: Activate conda env - run: conda activate test - - name: Install dependencies - run: - pip install numpy - pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html - - name: Install pippy - run: python setup.py install - - name: Run forward-backward test - run: torchrun --nproc-per-node 4 test/test_bwd.py --schedule ${{ matrix.schedule }} - - name: Run optimizer test - run: torchrun --nproc-per-node 4 test/test_optim.py --schedule ${{ matrix.schedule }} - - name: Test gradient equivalence - run: torchrun --nproc-per-node 4 test/test_grad.py --schedule ${{ matrix.schedule }} - - name: Test interleaving schedules - run: torchrun --nproc-per-node 4 test/test_interleave.py --schedule ${{ matrix.schedule }} diff --git a/.github/workflows/model_tests.yaml b/.github/workflows/model_tests.yaml index f9b8d2105..541eba686 100644 --- a/.github/workflows/model_tests.yaml +++ b/.github/workflows/model_tests.yaml @@ -40,7 +40,7 @@ jobs: run: conda activate test - name: Install dependencies run: | - pip install --pre -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cu121/torch_nightly.html + pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 - name: Install Transformers for getting models run: pip install transformers # - name: Install Diffusers for getting models diff --git a/.github/workflows/pippy_tests.yaml.bak b/.github/workflows/pippy_tests.yaml.bak deleted file mode 100644 index 635ca171f..000000000 --- a/.github/workflows/pippy_tests.yaml.bak +++ /dev/null @@ -1,193 +0,0 @@ -name: CPU tests - -on: - push: - branches: - - main - pull_request: - paths: - - '.github/workflows/pippy_tests.yaml' - - 'pippy/**' - - 'test/**' - - 'examples/**' - - '!docs/**' - - '!**.md' - - 'requirements.txt' - -concurrency: - # Cancel CI on previous commit when a new commit is pushed to the same branch - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -jobs: - - # pytest_tests: - # runs-on: linux.4xlarge - # strategy: - # matrix: - # python-version: ["3.8", "3.9"] - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov pytest-xdist numpy - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pippy - # run: "python setup.py install" - # - name: Test with pytest - # run: | - # pytest --cov=pippy test/ - - # hf_model_tests: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # shard: ["0", "1", "2", "3", "4", "5", "6", "7"] - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov pytest-xdist pytest-shard numpy - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links 
https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # # Single thread to avoid OOM - # - name: Test forward only - # run: | - # pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'not HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py - # - name: Test forward and backward - # run: | - # pytest --shard-id=${{ matrix.shard }} --num-shards=8 -k 'HFModelsForwardBackwardTest' -sv --cov=pippy test/hf_test.py - - Unit_tests: - runs-on: linux.4xlarge - strategy: - matrix: - python-version: ["3.9"] - env: - OMP_NUM_THREADS: "1" - container: - image: python:${{ matrix.python-version }} - - steps: - - uses: actions/checkout@v2 - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu expecttest - if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - - name: Install pippy - run: "python setup.py install" - # Util tests - - name: Test stage backward helper function - run: python test/test_stage_backward.py - - name: Test microbatch function - run: python test/test_microbatch.py - # IR tests - - name: Test forward pipe generation - run: python test/test_pipe.py - - name: Test backward pipe generation - run: python test/test_pipe_bwd.py - - name: Test unflatten - run: python test/test_unflatten.py - - name: Test Transformer - run: python test/test_transformer.py - - name: Test args chunking spec - run: python test/test_chunkspec.py - # Runtime tests - - name: Test pipeline schedule - run: python test/test_pipeline_schedule.py - # - name: Run null_coalesce_accumulate integration test - # run: python test/local_test_null_coalesce_accumulate.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} - # - name: Run PP + DDP test - # run: python test/local_test_ddp.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - #- name: Run HF BERT forward-only integration test - # run: python test/local_test_forward_hf_bert.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - # - name: Run HF GPT2 forward-only integration test - # run: python test/local_test_forward_hf_gpt2.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} --checkpoint ${{ matrix.checkpoint }} - # - name: Run auto-split test - # run: python test/local_test_autosplit.py --replicate ${{ matrix.replicate }} --schedule ${{ matrix.schedule }} - - # hf_examples_set1: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # schedule: ["FillDrain", "1F1B"] - # env: - # OMP_NUM_THREADS: "1" - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main 
sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # - name: Test min-GPT - # run: | - # git config --global --add safe.directory /__w/tau/tau - # git submodule update --init test/minGPT - # python test/min_gpt_tracing.py - # - name: Run GPT2 example - # run: python examples/hf/gpt2/pippy_gpt2.py --schedule ${{ matrix.schedule }} - # - name: Run BERT example - # run: python examples/hf/bert/pippy_bert.py --schedule ${{ matrix.schedule }} - # - name: Run T5 example - # run: python examples/hf/t5/pippy_t5.py --schedule ${{ matrix.schedule }} - # - name: "HF Translation: fine-tune T5 model translation English to Romanian" - # run: > - # python examples/hf/translation/run_translation.py --model_name_or_path t5-small --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=1 --pp_group_size=8 - # - name: "HF Translation: fine-tune BART model translation English to Romanian" - # run: > - # python examples/hf/translation/run_translation.py --model_name_or_path facebook/bart-base --do_train --source_lang en --target_lang ro --source_prefix "translate English to Romanian: " --dataset_name wmt16 --dataset_config_name ro-en --output_dir /tmp/tst-translation --per_device_train_batch_size=8 --per_device_eval_batch_size=8 --overwrite_output_dir --predict_with_generate --max_steps=10 --dp_group_size=2 --pp_group_size=8 - - # hf_examples_set2: - # runs-on: linux.12xlarge - # strategy: - # matrix: - # python-version: ["3.9"] - # schedule: ["FillDrain", "1F1B"] - # env: - # OMP_NUM_THREADS: "1" - # container: - # image: python:${{ matrix.python-version }} - - # steps: - # - uses: actions/checkout@v2 - # - name: Install dependencies - # run: | - # python -m pip install --upgrade pip - # pip install flake8 pytest pytest-cov numpy datasets evaluate scikit-learn sacrebleu - # if [ -f requirements.txt ]; then pip install -r requirements.txt --find-links https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html; fi - # - name: Install pavel's huggingface fork - # run: pip install git+https://github.com/huggingface/transformers.git@main sentencepiece six sacremoses - # - name: Install pippy - # run: "python setup.py install" - # - name: "HF Causal Language Modeling: fine-tune GPT-2 on WikiText-2" - # run: python examples/hf/language-modeling/run_clm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-clm --max_steps=3 --overwrite_output_dir - # - name: "HF Masked Language Modeling: fine-tune RoBERTa on WikiText-2" - # run: python examples/hf/language-modeling/run_mlm.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path roberta-base --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --do_train --do_eval --output_dir /tmp/test-mlm --max_steps=3 --overwrite_output_dir - # - name: "HF Text classification: fine-tune BERT on the GLUE benchmark" - # run: python examples/hf/text-classification/run_glue.py --dp_group_size=2 --pp_group_size=8 --model_name_or_path bert-base-cased --task_name mrpc --do_train --do_eval --max_seq_length 128 
--per_device_train_batch_size 32 --learning_rate 2e-5 --num_train_epochs 3 --output_dir /tmp/mrpc/ --max_steps=3 --overwrite_output_dir - - # TODO: - # Update GPU test to use template in: - # https://github.com/pytorch/test-infra/wiki/Writing-generic-CI-jobs diff --git a/README.md b/README.md index 889f4a528..5c6592b69 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # PiPPy: Pipeline Parallelism for PyTorch > [!NOTE] -> PiPPy is currently in alpha state and under extensive development. While the 0.2.0 release is available on [PyPI](https://pypi.org/project/torchpippy/), the GitHub version may have updated or new APIs. For up-to-date examples that work with the GitHub version, please refer to programs run in our [CI](.github/workflows/gpu_tests.yaml). +> PiPPy has been migrated into [PyTorch](https://github.com/pytorch/pytorch) as a subpackage: [`torch.distributed.pipelining`](https://github.com/pytorch/pytorch/tree/main/torch/distributed/pipelining). You can find the detailed documentation [here](https://pytorch.org/docs/main/distributed.pipelining.html). The current repo mainly serves as a land of [examples](examples/). The PiPPy library code will be removed. Please use the APIs in `torch.distributed.pipelining` instead. Thank you! [**Why PiPPy?**](#why-pippy) | [**Install guide**](#install) diff --git a/test/local_test_c10d_ddp.py b/test/local_test_c10d_ddp.py deleted file mode 100644 index ccc17ecc9..000000000 --- a/test/local_test_c10d_ddp.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates -import argparse -import os -import unittest - -import pippy - -import torch -import torch.distributed as dist -from pippy._IR import pipe_split -from torch.nn.parallel import DistributedDataParallel - - -d_hid = 512 -chunk_size = 256 - - -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param1 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.mm_param2 = torch.nn.Parameter(torch.randn(d_hid, d_hid)) - self.lin0 = torch.nn.Linear(d_hid, d_hid) - self.lin1 = torch.nn.Linear(d_hid, d_hid) - self.loss_fn = torch.nn.MSELoss(reduction="sum") - - def forward(self, x, target): - x = torch.mm(x, self.mm_param0) - skip_connection = x - x = torch.relu(x) - pipe_split() - x = torch.mm(x, self.mm_param1) - x = self.lin0(x) - pipe_split() - x = torch.relu(x) - x = x + skip_connection - x = torch.mm(x, self.mm_param2) - pipe_split() - x = self.lin1(x) - x = torch.relu(x) - loss = self.loss_fn(x, target) - return {"loss": loss} - - -def create_model() -> torch.nn.Module: - # Fix a seed such that models are created the same - torch.manual_seed(42) - ec = ExampleCode() - return ec - - -# Get process group for ranks in a pipeline -def get_pp_subgroup(args): - my_pp_rank = args.rank // args.dp_group_size - my_dp_rank = args.rank % args.dp_group_size - for dp_rank in range(0, args.dp_group_size): - pp_group_ranks = list( - range(dp_rank, args.world_size, args.dp_group_size) - ) - pp_group = dist.new_group(ranks=pp_group_ranks) - if dp_rank == my_dp_rank: - my_pp_group = pp_group - print(f"Rank {args.rank} done getting pp group") - return my_pp_group, my_pp_rank - - -# Get DP process group for ranks with the same stage -def get_dp_subgroup(args): - my_pp_rank = args.rank // args.dp_group_size - my_dp_rank = args.rank % args.dp_group_size - for pp_rank in range(0, args.pp_group_size): - dp_group_ranks = list( - range( - pp_rank * args.dp_group_size, (pp_rank + 1) * 
args.dp_group_size - ) - ) - dp_group = dist.new_group(ranks=dp_group_ranks) - if pp_rank == my_pp_rank: - my_dp_group = dp_group - print(f"Rank {args.rank} done getting dp group") - return my_dp_group, my_dp_rank - - -# Main program -def run_worker(args): - ec_with_loss = create_model() - ec_with_loss.to(args.device) - - input = torch.randn(args.chunks * chunk_size, d_hid, device=args.device) - target = torch.randn(args.chunks * chunk_size, d_hid, device=args.device) - - # Get DP and PP sub process groups - dp_group, dp_rank = get_dp_subgroup(args) - pp_group, pp_rank = get_pp_subgroup(args) - - stage = pippy.compile_stage( - ec_with_loss, - pp_rank, - args.pp_group_size, - args.chunks, - args.device, - pp_group, - [input, target], - ) - - # Form a map from original qualname to param for equivalence check later - pipe_params = {} - for qualname, param in stage.submod.named_parameters(): - origin_name = stage.submod.remap_qualname(qualname) - pipe_params[origin_name] = param - - # Wrap stage module with DDP - stage.submod = DistributedDataParallel( - stage.submod, - process_group=dp_group, - ) - - # Run - if pp_rank == 0: - stage(input) - elif pp_rank == args.pp_group_size - 1: - pipe_out = stage(target) - else: - stage() - - # Form a map from original qualname to gradient for equivalence check later - pipe_grads = {} - for origin_name, pipe_param in pipe_params.items(): - pipe_grads[origin_name] = pipe_param.grad - - # DDP reference model - ref_mod = create_model() - ref_mod.to(args.device) - ddp_ref_mod = DistributedDataParallel( - ref_mod, - process_group=dp_group, - ) - - # DDP forward and backward - ddp_out = ddp_ref_mod(input, target) - ddp_out["loss"].backward() - - # Compare pipeline output and DDP output - if pp_rank == args.pp_group_size - 1: - torch.testing.assert_close(pipe_out, ddp_out) - print("Output equivalence test passed") - - # Compare pipeline gradient and DDP gradient - for origin_name, pipe_grad in pipe_grads.items(): - ddp_param = ddp_ref_mod.module.get_parameter(origin_name) - if dp_rank == 0: - print(f"Checking gradient of {origin_name}") - # Since we use synthetic input and output, the gradients generated are - # large. 
Hence we need to manually set relative tolerance - torch.testing.assert_close( - pipe_grad, - ddp_param.grad, - rtol=7e-2, - atol=1e-5, - ) - - print("Gradient equivalence test passed") - - -def main(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 8)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--chunks", - type=int, - default=4, - ) - args = parser.parse_args(args) - - # pp group size must match with pipe_split's in model - args.pp_group_size = 4 - # world size must be multiple of pp group size - assert args.world_size % args.pp_group_size == 0 - args.dp_group_size = args.world_size // args.pp_group_size - if args.rank == 0: - print( - f"PP group size = {args.pp_group_size}, DP group size = {args.dp_group_size}" - ) - - if args.cuda: - dev_id = args.rank % torch.cuda.device_count() - args.device = torch.device(f"cuda:{dev_id}") - else: - args.device = torch.device("cpu") - - # Init process group - backend = "nccl" if args.cuda else "gloo" - dist.init_process_group( - backend=backend, - rank=args.rank, - world_size=args.world_size, - ) - - run_worker(args) - - -if __name__ == "__main__": - main() - - -class LocalTestC10dDDPTest(unittest.TestCase): - def test_c10d_ddp(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/local_test_checkpoint.py b/test/local_test_checkpoint.py deleted file mode 100644 index f8b893226..000000000 --- a/test/local_test_checkpoint.py +++ /dev/null @@ -1,210 +0,0 @@ -import argparse -import json -import os -import shutil -import unittest -from copy import deepcopy -from typing import List - -import torch - -import torch.distributed as dist -import torch.optim as optim - -from pippy._IR import pipe_split, TrivialLossWrapper -from pippy.compile import compile_stage -from pippy.utilities.hf_checkpoint import load_checkpoint, save_checkpoint - - -DEFAULT_FILENAME = "pytorch_model.bin.index.json" -CKPT_DIR = "test_ckpts" -WEIGHT_MAP = set( - [ - "module.mm_param0", - "module.mm_param1", - "module.mm_param2", - "module.lin0.weight", - "module.lin0.bias", - "module.lin1.weight", - "module.lin1.bias", - ] -) -D_HID = 512 -CHUNK_SIZE = 256 - - -class ExampleCode(torch.nn.Module): - def __init__(self): - super().__init__() - self.mm_param0 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.mm_param1 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.mm_param2 = torch.nn.Parameter(torch.randn(D_HID, D_HID)) - self.lin0 = torch.nn.Linear(D_HID, D_HID) - self.lin1 = torch.nn.Linear(D_HID, D_HID) - - def forward(self, x): - x = torch.mm(x, self.mm_param0) - skip_connection = x - x = torch.relu(x) - pipe_split() - x = torch.mm(x, self.mm_param1) - x = self.lin0(x) - pipe_split() - x = torch.relu(x) - x = x + skip_connection - x = torch.mm(x, self.mm_param2) - pipe_split() - x = self.lin1(x) - x = torch.relu(x) - return x - - -def run_worker(args: List[str | int]) -> None: - ec = ExampleCode() - loss_fn = torch.nn.MSELoss(reduction="sum") - ec_with_loss = TrivialLossWrapper(ec, loss_fn) - ec_with_loss.to(args.device) - - ec_x = torch.randn(args.chunks * 
CHUNK_SIZE, D_HID, device=args.device) - target = torch.randn(args.chunks * CHUNK_SIZE, D_HID, device=args.device) - - stage = compile_stage( - ec_with_loss, - args.rank, - args.world_size, - args.chunks, - args.device, - None, - [ec_x, target], - ) - - # Create an optimizer for stage submodule's parameters - optimizer = optim.SGD(stage.submod.parameters(), lr=1e-3, momentum=0.9) - - # first run - # Zero gradients - optimizer.zero_grad() - - # Run - if args.rank == 0: - stage(ec_x) - elif args.rank == args.world_size - 1: - stage(target) - else: - stage() - - # Take an optimization step - optimizer.step() - ref_state_dict = deepcopy(stage.submod.state_dict()) - ref_optim_state_dict = deepcopy(optimizer.state_dict()) - - save_checkpoint(stage, CKPT_DIR, optimizer) - - # save index file in rank 0 - if args.rank == 0: - filepath = os.path.join(CKPT_DIR, DEFAULT_FILENAME) - with open(filepath) as f: - content = f.read() - data = json.loads(content) - - # check file written on disk to given location - assert os.path.exists(filepath) - - # check total_size is correct - size_calc = sum(param.numel() for param in ec.parameters()) * 4 - assert size_calc == data["metadata"]["total_size"] - - # check all params present - assert len(data["weight_map"]) == 7 - for param in WEIGHT_MAP: - assert param in data["weight_map"] - - # second run - # Zero gradients - optimizer.zero_grad() - - # Run - if args.rank == 0: - stage(ec_x) - elif args.rank == args.world_size - 1: - stage(target) - else: - stage() - - # Take an optimization step - optimizer.step() - - # new api - # after index file has been written, load_checkpoint will read it - if os.path.exists(os.path.join(CKPT_DIR, DEFAULT_FILENAME)): - mod, optimizer = load_checkpoint( - stage.submod, - os.path.join(CKPT_DIR, DEFAULT_FILENAME), - optim=optimizer, - device=args.device, - ) - - torch.testing.assert_close(mod.state_dict(), ref_state_dict) - torch.testing.assert_close(optimizer.state_dict(), ref_optim_state_dict) - - dist.barrier() - print(f"Rank {args.rank} completes") - - # remove test ckpt directory in last rank - if args.rank == args.world_size - 1: - shutil.rmtree(CKPT_DIR) - - -def main(args: List[str | int] = None) -> None: - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 4)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--chunks", - type=int, - default=4, - ) - args = parser.parse_args(args) - - if args.cuda: - dev_id = args.rank % torch.cuda.device_count() - args.device = torch.device(f"cuda:{dev_id}") - else: - args.device = torch.device("cpu") - - # init process group - backend = "nccl" if args.cuda else "gloo" - dist.init_process_group( - backend=backend, - rank=args.rank, - world_size=args.world_size, - ) - - run_worker(args) - - -if __name__ == "__main__": - main() - - -class LocalCheckpointTest(unittest.TestCase): - def test_index_file(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/local_test_null_coalesce_accumulate.py b/test/local_test_null_coalesce_accumulate.py deleted file mode 100644 index 227e906cb..000000000 --- 
a/test/local_test_null_coalesce_accumulate.py +++ /dev/null @@ -1,128 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates -import argparse -import os -import unittest - -import pippy.fx - -import torch -from pippy import run_pippy -from pippy._IR import ( - _null_coalesce_accumulate, - pipe_split, - pipeline, - TrivialLossWrapper, -) -from pippy.PipelineDriver import ( - PipelineDriver1F1B, - PipelineDriverBase, - PipelineDriverFillDrain, -) - -PROFILING_ENABLED = True -CHECK_NUMERIC_EQUIVALENCE = True - -schedules = { - "FillDrain": PipelineDriverFillDrain, - "1F1B": PipelineDriver1F1B, -} - -pippy.fx.Tracer.proxy_buffer_attributes = True - - -def run_master(_, args): - all_ranks = list(range(1, args.world_size)) # exclude master rank = 0 - chunks = len(all_ranks) - bs = 4 * chunks - hid_dim = 50 - - class Code(torch.nn.Module): - def __init__(self): - super().__init__() - self.linear = torch.nn.Linear(hid_dim, hid_dim) - - def forward(self, x): - x = self.linear(x) - pipe_split() - y = torch.relu(x) - pipe_split() - z = torch.sigmoid(x) - pipe_split() - return y + z - - c = Code() - c.train() - mse_loss = torch.nn.MSELoss() - wrapper = TrivialLossWrapper(c, mse_loss) - accum_pipe = pipeline(wrapper) - assert 4 == len(list(accum_pipe.split_gm.children())) - assert any( - n.target == _null_coalesce_accumulate - for n in accum_pipe.split_gm.graph.nodes - ) - - input = torch.randn(bs, hid_dim) - target = torch.randn(bs, hid_dim) - accum_pipe(input, target) - - pipe_driver: PipelineDriverBase = schedules[args.schedule]( - accum_pipe, - chunks, - args.world_size - 1, - all_ranks=all_ranks, - _debug_mask_minibatches=True, - _record_mem_dumps=bool(args.record_mem_dumps), - checkpoint=bool(args.checkpoint), - ) - - pipe_driver(input, target) - - -def main(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "--world_size", type=int, default=int(os.getenv("WORLD_SIZE", 5)) - ) - parser.add_argument("--rank", type=int, default=int(os.getenv("RANK", -1))) - parser.add_argument( - "--master_addr", type=str, default=os.getenv("MASTER_ADDR", "localhost") - ) - parser.add_argument( - "--master_port", type=str, default=os.getenv("MASTER_PORT", "29500") - ) - parser.add_argument( - "-s", - "--schedule", - type=str, - default=list(schedules.keys())[0], - choices=schedules.keys(), - ) - parser.add_argument( - "--replicate", type=int, default=int(os.getenv("REPLICATE", "0")) - ) - parser.add_argument( - "--cuda", type=int, default=int(torch.cuda.is_available()) - ) - parser.add_argument( - "--record_mem_dumps", type=int, default=0, choices=[0, 1] - ) - parser.add_argument("--checkpoint", type=int, default=0, choices=[0, 1]) - args = parser.parse_args(args) - - run_pippy(run_master, args) - - -if __name__ == "__main__": - main() - - -class LocalTestNullCoalesceAccumulateTest(unittest.TestCase): - def test_null_coalesce_accumulate(self): - import random - - port = random.randint(29500, 30000) - args = [ - "--master_port", - str(port), - ] - main(args) diff --git a/test/pippy_wrapper.sh b/test/pippy_wrapper.sh deleted file mode 100755 index 05b1de4a2..000000000 --- a/test/pippy_wrapper.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -# Copyright (c) Meta Platforms, Inc. 
and affiliates
-
-export MASTER_PORT=29500
-export MASTER_ADDR=$(scontrol show hostname ${SLURM_NODELIST} | head -n 1)
-export LOCAL_RANK=${SLURM_LOCALID}
-# Optional: depending on whether the application wants each process to see only 1 GPU or all GPUs
-#export CUDA_VISIBLE_DEVICES=${SLURM_LOCALID}
-export WORLD_SIZE=${SLURM_NTASKS}
-export RANK=${SLURM_PROCID}
-
-python -u "$@" 2>&1
diff --git a/test/run_pipeline_scheduler.sh b/test/run_pipeline_scheduler.sh
deleted file mode 100644
index 24fee401a..000000000
--- a/test/run_pipeline_scheduler.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-# To run samples:
-# launcher for testing pipeline schedules
-torchrun --nnodes=1 --nproc_per_node 8 --rdzv_endpoint="localhost:59124" test_pipeline_schedule.py
\ No newline at end of file
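
The migration notice added to the README above points users at `torch.distributed.pipelining`. As a rough sketch of what the replacement API looks like (assuming PyTorch 2.4+ or a recent nightly; the toy model, split point, and sizes below are illustrative and not taken from this patch), a two-stage pipeline can be built and run roughly as follows:

```python
# Rough sketch of the torch.distributed.pipelining API referenced by the
# migration notice. Assumes PyTorch 2.4+ / a recent nightly; the toy model,
# split point, and sizes are illustrative only.
import os

import torch
import torch.distributed as dist
from torch.distributed.pipelining import ScheduleGPipe, SplitPoint, pipeline


class Toy(torch.nn.Module):
    def __init__(self, d_hid: int = 512):
        super().__init__()
        self.lin0 = torch.nn.Linear(d_hid, d_hid)
        self.lin1 = torch.nn.Linear(d_hid, d_hid)

    def forward(self, x):
        return self.lin1(torch.relu(self.lin0(x)))


rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
dist.init_process_group(rank=rank, world_size=world_size)

n_microbatches = 4
full_batch = torch.randn(32, 512)
example_microbatch = full_batch.chunk(n_microbatches)[0]

# Trace the model and split it into two stages before lin1
# (this replaces pippy.pipeline / pipe_split from this repo).
pipe = pipeline(
    Toy(),
    mb_args=(example_microbatch,),
    split_spec={"lin1": SplitPoint.BEGINNING},
)

# Materialize the stage owned by this rank and run a GPipe schedule.
stage = pipe.build_stage(rank, device=torch.device("cpu"))
schedule = ScheduleGPipe(stage, n_microbatches=n_microbatches)

if rank == 0:
    schedule.step(full_batch)  # first stage feeds the full batch
else:
    out = schedule.step()      # last stage returns the assembled output
```

Launched with, for example, `torchrun --nproc-per-node 2 sketch.py`, mirroring how the removed GPU workflow invoked `examples/basic/example.py`.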