Skip to content

Commit

Permalink
Add backup workflows (#189)
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Oct 7, 2023
1 parent b3d0fdb commit 497a9e0
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 6 deletions.
75 changes: 75 additions & 0 deletions .github/workflows/integration-test-backup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Backup copy of the integration (performance) test workflow.
# It only runs when triggered manually (workflow_dispatch). For every CUDA base
# image in the matrix it builds the project with a freshly downloaded CMake,
# runs the mscclpp-test perf binaries with 8 MPI ranks, and finally compares
# the collected output.jsonl against test/deploy/perf_ndmv4.jsonl.
name: IntegrationTest

on: workflow_dispatch

jobs:
  IntegrationTest:
    runs-on: self-hosted
    strategy:
      matrix:
        # Quoted so the tags can never be re-typed by a YAML parser.
        container-image:
          - "ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8"
          - "ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1"

    container:
      image: ${{ matrix.container-image }}
      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

    steps:
      - name: Checkout
        # v2 targets the retired Node 12 action runtime; v4 is the maintained line.
        uses: actions/checkout@v4

      # CMake 3.26.4 is unpacked under /tmp and invoked by absolute path in the
      # Build step.
      - name: Install CMake
        run: |
          curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp

      - name: Build
        run: |
          mkdir build && cd build
          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j

      # Enables persistence mode, then sets each GPU's application clocks to the
      # maximum memory/SM clocks that nvidia-smi reports for that GPU. The sed
      # call strips the space from the "mem, sm" CSV pair so it can be passed
      # to -ac.
      - name: Lock GPU clock frequency
        run: |
          sudo nvidia-smi -pm 1
          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
          done

      # Every perf run below passes -o output.jsonl; the last step reads that
      # file.
      - name: Run mscclpp AllGather test
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl

      - name: Run mscclpp SendRecv test
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl

      - name: Run mscclpp AllReduce test
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
          # NOTE(review): a stray positional "2" used to follow "-i 3145728" here
          # (apparently left over from the "-f 2" form above); it was dropped.
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl

      - name: Run mscclpp AllToAll test
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
          mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl

      # Compares the results gathered above with the checked-in baseline file.
      - name: Check collective primitives performance
        run: |
          set -e
          python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
8 changes: 2 additions & 6 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,8 @@ jobs:
- name: Install Python dependencies
run: python3.8 -m pip install black

- name: Run linters
uses: wearerequired/lint-action@v2
with:
black: true
black_auto_fix: false
black_args: "--config pyproject.toml --check"
- name: Run black
run: python3.8 -m black --check --config pyproject.toml .

spelling:
runs-on: ubuntu-20.04
Expand Down
63 changes: 63 additions & 0 deletions .github/workflows/ut-backup.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Backup copy of the unit-test workflow.
# It only runs when triggered manually (workflow_dispatch). For every CUDA base
# image in the matrix it builds the project, then runs the single-process unit
# tests, the MPI unit tests (2, 4 and 8 ranks) and the Python tests (8 ranks).
name: UnitTest

on: workflow_dispatch

jobs:
  UnitTest:
    runs-on: self-hosted
    timeout-minutes: 30
    strategy:
      matrix:
        # Quoted so the tags can never be re-typed by a YAML parser.
        container-image:
          - "ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8"
          - "ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1"

    container:
      image: ${{ matrix.container-image }}
      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

    steps:
      - name: Checkout
        # v2 targets the retired Node 12 action runtime; v4 is the maintained line.
        uses: actions/checkout@v4

      # Downloads CMake 3.26.4 into /tmp (-C- lets curl resume a partial
      # download) and uses it by absolute path to configure a Release build.
      - name: Build
        run: |
          curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
          tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
          mkdir build && cd build
          MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
          make -j
        working-directory: ${{ github.workspace }}

      # Enables persistence mode, then sets each GPU's application clocks to the
      # maximum memory/SM clocks that nvidia-smi reports for that GPU. The sed
      # call strips the space from the "mem, sm" CSV pair so it can be passed
      # to -ac.
      - name: LockGPUClock
        run: |
          sudo nvidia-smi -pm 1
          for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
            sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
          done
        working-directory: ${{ github.workspace }}

      - name: UnitTests
        run: |
          ./build/test/unit_tests
        working-directory: ${{ github.workspace }}

      - name: MpUnitTests
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          mpirun -tag-output -np 2 ./build/test/mp_unit_tests
          mpirun -tag-output -np 4 ./build/test/mp_unit_tests
          mpirun -tag-output -np 8 ./build/test/mp_unit_tests
        working-directory: ${{ github.workspace }}

      - name: PyTests
        # The image name reaches the script through the environment instead of
        # being spliced into the script text by the expression engine.
        env:
          CONTAINER_IMAGE: ${{ matrix.container-image }}
        run: |
          set -e
          export PATH=/usr/local/mpi/bin:$PATH
          cd build && make pylib-copy
          # POSIX `case` instead of the bash-only `[[ ... ]]` test: if the step
          # is executed by plain `sh`, `[[` is not found and the cu12
          # requirements would be installed for the cuda11 image as well.
          case "$CONTAINER_IMAGE" in
            *cuda11*)
              pip3 install -r ../python/test/requirements_cu11.txt
              ;;
            *)
              pip3 install -r ../python/test/requirements_cu12.txt
              ;;
          esac
          mpirun -tag-output -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
        working-directory: ${{ github.workspace }}

0 comments on commit 497a9e0

Please sign in to comment.