Skip to content

Commit

Permalink
Merge branch 'microsoft-master' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
Quentin-Anthony committed Nov 26, 2023
2 parents b926043 + 6f42f87 commit 9920f1b
Show file tree
Hide file tree
Showing 812 changed files with 59,808 additions and 5,741 deletions.
5 changes: 5 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[flake8]
# Codes to suppress: the whole E and W families plus a handful of F codes.
ignore = E,F403,F405,F541,F841,W
# Codes explicitly enabled for reporting.
select = E9,F,W6
# Per-file exceptions, one "<file pattern>:<codes>" entry per indented line.
per-file-ignores =
    __init__.py:F401
10 changes: 10 additions & 0 deletions .github/ISSUE_TEMPLATE/ci_failure_report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
name: CI failure report
about: Report a DeepSpeed CI failure
title: "{{ env.GITHUB_WORKFLOW }} CI test failure"
labels: ci-failure
assignees: ''

---

The Nightly CI for {{ env.GITHUB_SERVER_URL }}/{{ env.GITHUB_REPOSITORY }}/actions/runs/{{ env.GITHUB_RUN_ID }} failed.
18 changes: 4 additions & 14 deletions .github/workflows/amd-mi100.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,6 @@
name: amd-mi100

on:
push:
branches:
- 'staging**'
paths-ignore:
- 'docs/**'
- 'blogs/**'
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
schedule:
- cron: "0 0 * * *"

Expand All @@ -33,7 +23,7 @@ jobs:

- name: Install pytorch
run: |
pip install torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -60,7 +50,7 @@ jobs:
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
pytest $PYTEST_OPTS -n 4 --verbose unit/
pytest $PYTEST_OPTS -m 'sequential' unit/
46 changes: 27 additions & 19 deletions .github/workflows/amd-mi200.yml
Original file line number Diff line number Diff line change
@@ -1,23 +1,18 @@
name: amd-mi200

on:
push:
branches:
- 'staging**'
paths-ignore:
- 'docs/**'
- 'blogs/**'
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
schedule:
- cron: "0 0 * * *"
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

permissions:
contents: read
issues: write

jobs:
amd-tests:
# The type of runner that the job will run on
Expand All @@ -33,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.4.2
pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -46,14 +41,18 @@ jobs:
git rev-parse --short HEAD
pip install .
- name: Install apex
- name: Install (ROCm) apex
run: |
git clone https://github.com/ROCmSoftwarePlatform/apex.git
cd apex
python setup.py install --cpp_ext --cuda_ext
# Previous install without cloning source doesn't work due to latest pip removing `--install-option`
# pip install -v --install-option="--cpp_ext" --install-option="--cuda_ext" 'git+https://github.com/ROCmSoftwarePlatform/apex.git'
git checkout torch_2.1_higher
CURRENT_VER=$(git rev-parse HEAD)
INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version)
if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings="--global-option=--cpp_ext" --config-settings="--global-option=--cuda_ext" --target=/blob/amd-apex/ --upgrade .
git rev-parse HEAD > /blob/amd-apex/.venv_installed_version
fi
echo PYTHONPATH=$PYTHONPATH:/blob/amd-apex/ >> $GITHUB_ENV
# Runs a set of commands using the runners shell
- name: Install deepspeed
run: |
Expand All @@ -68,7 +67,16 @@ jobs:
# Runs a set of commands using the runners shell
- name: Unit tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
cd tests
TORCH_EXTENSIONS_DIR=./torch-extensions pytest -n 4 --verbose unit/
TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'sequential' unit/
pytest $PYTEST_OPTS -n 4 --verbose unit/
pytest $PYTEST_OPTS -m 'sequential' unit/
- name: Open GitHub issue if nightly CI fails
if: ${{ failure() && (github.event_name == 'schedule') }}
uses: JasonEtco/create-an-issue@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
update_existing: true
16 changes: 4 additions & 12 deletions .github/workflows/cpu-inference.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,7 @@
name: cpu-inference

on:
push:
branches:
- 'staging**'
paths-ignore:
- 'docs/**'
pull_request:
paths-ignore:
- 'docs/**'
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
Expand All @@ -19,7 +12,7 @@ jobs:
runs-on: ubuntu-20.04

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3

- id: setup-venv
uses: ./.github/workflows/setup-venv
Expand Down Expand Up @@ -77,7 +70,6 @@ jobs:
run: |
source oneCCL/build/_install/env/setvars.sh
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
cd tests
TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference' unit/inference/test_inference_config.py
TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -k TestDistAllReduce unit/comm/test_dist.py
TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/
TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/
16 changes: 8 additions & 8 deletions .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
name: Formatting

on:
push:
branches:
- 'staging**'
pull_request:
branches:
'**'
merge_group:
branches: [ master ]
schedule:
- cron: "0 0 * * *"

Expand All @@ -28,12 +27,13 @@ jobs:
which python
python --version
- name: Install deepspeed
- name: Install dependencies
run: |
pip install .[dev,autotuning]
ds_report
# Previously we would do pip install .[dev] but this has been causing
# out-of-space errors starting with the torch 2.1.0 release
grep -E "clang-format|pre-commit" requirements/requirements-dev.txt | xargs pip install
- name: Formatting checks
run: |
pip show pre-commit clang-format
pre-commit run --all-files
pip show pre-commit clang-format
pre-commit run --all-files
63 changes: 63 additions & 0 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Inference v2 unit tests and DeepSpeed-MII tests on a self-hosted runner
# labelled nvidia/a6000, executed inside an NVIDIA PyTorch container.
name: nv-a6000

# Runs for pull requests touching the inference v2 sources or their tests,
# and on manual dispatch.
on:
  pull_request:
    paths:
      - 'deepspeed/inference/v2/**'
      - 'tests/unit/inference/v2/**'
  workflow_dispatch:

# One active run per workflow + ref; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on:
      - self-hosted
      - nvidia
      - a6000
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: '--gpus all --shm-size "8G"'

    steps:
      - uses: actions/checkout@v3

      # Print toolchain, GPU and torch details so failures are easier to triage.
      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      # Installs transformers from the tip of its default branch (shallow clone).
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .

      - name: Install deepspeed
        run: |
          python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja
          python -m pip install .[dev,1bit,autotuning]
          ds_report

      - name: Python environment
        run: |
          python -m pip list

      # Two pytest passes, one per marker: inference_v2, then inference_v2_ops.
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12"
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12"

      # Runs the DeepSpeed-MII test suite against the DeepSpeed build installed above.
      - name: MII unit tests
        run: |
          git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git
          cd DeepSpeed-MII
          pip install .[dev]
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF ./
16 changes: 7 additions & 9 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
name: nv-accelerate-v100

on:
push:
branches:
- 'staging**'
paths-ignore:
- 'docs/**'
- 'blogs/**'
pull_request:
paths-ignore:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
merge_group:
branches: [ master ]
schedule:
- cron: "0 0 * * *"

Expand All @@ -30,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
pip install --no-cache-dir torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand All @@ -45,7 +43,7 @@ jobs:
- name: HF Accelerate tests
run: |
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
git clone https://github.com/huggingface/accelerate
cd accelerate
git rev-parse --short HEAD
Expand All @@ -56,4 +54,4 @@ jobs:
# tmp fix: force newer datasets version
#pip install "datasets>=2.0.0"
pip list
HF_DATASETS_CACHE=/blob/datasets_cache/ TRANSFORMERS_CACHE=/blob/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --verbose tests/deepspeed
pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed
69 changes: 69 additions & 0 deletions .github/workflows/nv-ds-chat.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Nightly (and manually dispatchable) DeepSpeed-Chat unit tests on a
# self-hosted V100 runner. A failed scheduled run opens or updates a GitHub issue.
name: nv-ds-chat

on:
  schedule:
    - cron: "0 0 * * *"
  workflow_dispatch:
    inputs:
      dse_branch:
        description: 'DeepSpeedExamples Branch'
        required: false
        default: 'master'
        type: string

# One active run per workflow + ref; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# Least-privilege token: read access for checkout, issue write access for the
# failure-report step at the end of the job.
permissions:
  contents: read
  issues: write

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, cu116, v100]

    steps:
      - uses: actions/checkout@v3

      - id: setup-venv
        uses: ./.github/workflows/setup-venv

      - name: Install pytorch
        run: |
          pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"

      - name: Install deepspeed
        run: |
          pip install .[dev]
          ds_report

      - name: Install deepspeed-chat
        env:
          # Hand the dispatch input to the script through the environment rather
          # than expanding it into the script text, so its value cannot be
          # interpreted as shell code. Empty on scheduled runs.
          DSE_BRANCH: ${{ github.event.inputs.dse_branch }}
        run: |
          # Fall back to master when no branch was supplied.
          BRANCH="${DSE_BRANCH:-master}"
          echo "DeepSpeedExamples Branch: $BRANCH"
          git clone -b "$BRANCH" https://github.com/microsoft/DeepSpeedExamples.git
          cd DeepSpeedExamples/applications/DeepSpeed-Chat
          pip install -r requirements.txt
          pip install -e .

      - name: Python environment
        run: |
          pip list

      - name: DS-Chat unit tests
        run: |
          cd DeepSpeedExamples/applications/DeepSpeed-Chat
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          pytest $PYTEST_OPTS ./

      # Only scheduled (nightly) failures file an issue; manual runs do not.
      - name: Open GitHub issue if nightly CI fails
        if: ${{ failure() && (github.event_name == 'schedule') }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: .github/ISSUE_TEMPLATE/ci_failure_report.md
          update_existing: true
Loading

0 comments on commit 9920f1b

Please sign in to comment.