Merge branch 'microsoft-master' into main

VectorInstitute · Nov 26, 2023 · 9920f1b · 9920f1b
2 parents b926043 + 6f42f87
commit 9920f1b
Show file tree

Hide file tree

Showing 812 changed files with 59,808 additions and 5,741 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+ignore = E,F403,F405,F541,F841,W
+select = E9,F,W6
+per-file-ignores =
+    __init__.py:F401
diff --git a/.github/ISSUE_TEMPLATE/ci_failure_report.md b/.github/ISSUE_TEMPLATE/ci_failure_report.md
@@ -0,0 +1,10 @@
+---
+name: CI failure report
+about: Report a DeepSpeed CI failure
+title: "{{ env.GITHUB_WORKFLOW }} CI test failure"
+labels: ci-failure
+assignees: ''
+
+---
+
+The Nightly CI for {{ env.GITHUB_SERVER_URL }}/{{ env.GITHUB_REPOSITORY }}/actions/runs/{{ env.GITHUB_RUN_ID }} failed.
diff --git a/.github/workflows/amd-mi100.yml b/.github/workflows/amd-mi100.yml
@@ -1,16 +1,6 @@
 name: amd-mi100
 
 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
-  pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   schedule:
     - cron: "0 0 * * *"
 
@@ -33,7 +23,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
+          pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -60,7 +50,7 @@ jobs:
       # Runs a set of commands using the runners shell
       - name: Unit tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
+          pytest $PYTEST_OPTS -n 4 --verbose unit/
+          pytest $PYTEST_OPTS -m 'sequential' unit/
diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml
@@ -1,23 +1,18 @@
 name: amd-mi200
 
 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
-  pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   schedule:
     - cron: "0 0 * * *"
+  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
+permissions:
+  contents: read
+  issues: write
+
 jobs:
   amd-tests:
     # The type of runner that the job will run on
@@ -33,7 +28,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -46,14 +41,18 @@ jobs:
           git rev-parse --short HEAD
           pip install .
 
-      - name: Install apex
+      - name: Install (ROCm) apex
         run: |
           git clone https://github.com/ROCmSoftwarePlatform/apex.git
           cd apex
-          python setup.py install --cpp_ext --cuda_ext
-          # Previous install without cloning source doesn't work due to latest pip removing `--install-option`
-          # pip install -v --install-option="--cpp_ext" --install-option="--cuda_ext" 'git+https://github.com/ROCmSoftwarePlatform/apex.git'
-
+          git checkout torch_2.1_higher
+          CURRENT_VER=$(git rev-parse HEAD)
+          INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
+          if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
+            pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
+            git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
+          fi
+          echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV
       # Runs a set of commands using the runners shell
       - name: Install deepspeed
         run: |
@@ -68,7 +67,16 @@ jobs:
       # Runs a set of commands using the runners shell
       - name: Unit tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
-          TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
+          pytest $PYTEST_OPTS -n 4 --verbose unit/
+          pytest $PYTEST_OPTS -m 'sequential' unit/
+
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true
diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml
@@ -1,14 +1,7 @@
 name: cpu-inference
 
 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-  pull_request:
-    paths-ignore:
-      - 'docs/**'
+  workflow_dispatch:
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
@@ -19,7 +12,7 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
 
       - id: setup-venv
         uses: ./.github/workflows/setup-venv
@@ -77,7 +70,6 @@ jobs:
         run: |
           source oneCCL/build/_install/env/setvars.sh
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
           cd tests
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference' unit/inference/test_inference_config.py
-          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -k TestDistAllReduce unit/comm/test_dist.py
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
+          TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml
@@ -1,12 +1,11 @@
 name: Formatting
 
 on:
-  push:
-    branches:
-      - 'staging**'
   pull_request:
     branches:
       '**'
+  merge_group:
+    branches: [ master ]
   schedule:
     - cron: "0 0 * * *"
 
@@ -28,12 +27,13 @@ jobs:
           which python
           python --version
 
-      - name: Install deepspeed
+      - name: Install dependencies
         run: |
-          pip install .[dev,autotuning]
-          ds_report
+          # Previously we ran `pip install .[dev]`, but that causes out-of-space
+          # errors starting with the torch 2.1.0 release.
+          grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
 
       - name: Formatting checks
         run: |
-           pip show pre-commit clang-format
-           pre-commit run --all-files
+          pip show pre-commit clang-format
+          pre-commit run --all-files
diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
@@ -0,0 +1,63 @@
+name: nv-a6000
+
+on:
+  pull_request:
+    paths:
+      - "deepspeed/inference/v2/**"
+      - "tests/unit/inference/v2/**"
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, a6000]
+    container:
+      image: nvcr.io/nvidia/pytorch:23.03-py3
+      ports:
+        - 80
+      options: --gpus all --shm-size "8G"
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Check container state
+        run: |
+          ldd --version
+          nvcc --version
+          nvidia-smi
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+      - name: Install transformers
+        run: |
+          git clone --depth=1 https://github.com/huggingface/transformers
+          cd transformers
+          git rev-parse --short HEAD
+          python -m pip install .
+      - name: Install deepspeed
+        run: |
+          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
+          python -m pip install .[dev,1bit,autotuning]
+          ds_report
+      - name: Python environment
+        run: |
+          python -m pip list
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"
+      - name: MII unit tests
+        run: |
+          git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
+          cd DeepSpeed-MII
+          pip install .[dev]
+          cd tests
+          python -m pytest --color=yes --durations=0 --verbose -rF ./
diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml
@@ -1,16 +1,14 @@
 name: nv-accelerate-v100
 
 on:
-  push:
-    branches:
-      - 'staging**'
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
   pull_request:
     paths-ignore:
       - 'docs/**'
       - 'blogs/**'
+      - 'deepspeed/inference/v2/**'
+      - "tests/unit/inference/v2/**"
+  merge_group:
+    branches: [ master ]
   schedule:
     - cron: "0 0 * * *"
 
@@ -30,7 +28,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
@@ -45,7 +43,7 @@ jobs:
 
       - name: HF Accelerate tests
         run: |
-          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           git clone https://github.com/huggingface/accelerate
           cd accelerate
           git rev-parse --short HEAD
@@ -56,4 +54,4 @@ jobs:
           # tmp fix: force newer datasets version
           #pip install "datasets>=2.0.0"
           pip list
-          HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
+          pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml
@@ -0,0 +1,69 @@
+name: nv-ds-chat
+
+on:
+  schedule:
+    - cron: "0 0 * * *"
+  workflow_dispatch:
+    inputs:
+      dse_branch:
+        description: 'DeepSpeedExamples Branch'
+        required: false
+        default: 'master'
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, cu116, v100]
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - id: setup-venv
+        uses: ./.github/workflows/setup-venv
+
+      - name: Install pytorch
+        run: |
+          pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+      - name: Install deepspeed
+        run: |
+          pip install .[dev]
+          ds_report
+
+      - name: Install deepspeed-chat
+        run: |
+          BRANCH="master"
+          if [[ ! -z "${{ github.event.inputs.dse_branch }}" ]]; then
+              BRANCH="${{ github.event.inputs.dse_branch }}"
+          fi
+          echo "DeepSpeedExamples Branch: $BRANCH"
+          git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git
+          cd DeepSpeedExamples/applications/DeepSpeed-Chat
+          pip install -r requirements.txt
+          pip install -e .
+
+      - name: Python environment
+        run: |
+          pip list
+
+      - name: DS-Chat unit tests
+        run: |
+          cd DeepSpeedExamples/applications/DeepSpeed-Chat
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          pytest $PYTEST_OPTS ./
+
+      - name: Open GitHub issue if nightly CI fails
+        if: ${{ failure() && (github.event_name == 'schedule') }}
+        uses: JasonEtco/create-an-issue@v2
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
+          update_existing: true