From dc304cb93eb05636c0401192da86ca28e9d76807 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Sat, 7 Oct 2023 15:06:53 +0800
Subject: [PATCH] Add a backup integration test

---
 .github/workflows/integration-test-backup.yml | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 .github/workflows/integration-test-backup.yml

diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
new file mode 100644
index 000000000..ffc5fe3af
--- /dev/null
+++ b/.github/workflows/integration-test-backup.yml
@@ -0,0 +1,83 @@
+# Manually triggered backup of the integration test workflow: builds mscclpp
+# in each CUDA base image on a self-hosted runner, runs the mscclpp-test perf
+# binaries over MPI, and checks the collected results against a baseline file.
+name: IntegrationTest
+
+on: workflow_dispatch
+
+jobs:
+  IntegrationTest:
+    runs-on: self-hosted
+    strategy:
+      matrix:
+        container-image: [ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8, ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1]
+
+    container:
+      image: ${{ matrix.container-image }}
+      options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+
+    # CMake is unpacked under /tmp and invoked by absolute path in the Build step.
+    - name: Install CMake
+      run: |
+        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
+        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
+
+    - name: Build
+      run: |
+        mkdir build && cd build
+        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+        make -j
+
+    # Enable persistence mode, then pin every GPU's application clocks to the max
+    # memory/SM clocks it reports (sed drops the first space of the CSV output).
+    - name: Lock GPU clock frequency
+      run: |
+        sudo nvidia-smi -pm 1
+        for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
+          sudo nvidia-smi -ac $(nvidia-smi --query-gpu=clocks.max.memory,clocks.max.sm --format=csv,noheader,nounits -i $i | sed 's/\ //') -i $i
+        done
+
+    # Every perf step below passes -o output.jsonl; that file is the input of the
+    # final check step.
+    - name: Run mscclpp AllGather test
+      run: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+
+    - name: Run mscclpp SendRecv test
+      run: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+
+    - name: Run mscclpp AllReduce test
+      run: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 -k 5 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl
+
+    - name: Run mscclpp AllToAll test
+      run: |
+        set -e
+        export PATH=/usr/local/mpi/bin:$PATH
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+        mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+
+    - name: Check collective primitives performance
+      run: |
+        set -e
+        python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl