Skip to content

Commit

Permalink
update
Browse files (browse the repository at this point in the history)
  • Loading branch information
Binyang2014 committed Dec 10, 2024
1 parent 935999a commit 8e70707
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 23 deletions.
36 changes: 21 additions & 15 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,13 @@ jobs:
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -97,9 +98,10 @@ jobs:
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -115,15 +117,16 @@ jobs:
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
set -e; \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -138,10 +141,11 @@ jobs:
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
set -e; \
cd /root/mscclpp; \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \
mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -158,6 +162,7 @@ jobs:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand All @@ -175,8 +180,9 @@ jobs:
-O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
set -e; \
export PATH=/usr/local/mpi/bin:$PATH; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
python3 -m pip install . \
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: AzureCLI@2
Expand Down
14 changes: 7 additions & 7 deletions .azure-pipelines/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,9 +102,9 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH; \
cd /root/mscclpp; \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
mpirun -tag-output -np 2 ./build/test/mp_unit_tests; \
mpirun -tag-output -np 4 ./build/test/mp_unit_tests; \
mpirun -tag-output -np 8 ./build/test/mp_unit_tests"'
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests; \
mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
Expand All @@ -122,7 +122,7 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
cd /root/mscclpp; \
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"'
workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: AzureCLI@2
Expand Down Expand Up @@ -208,7 +208,7 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH \
export NPKIT_DUMP_DIR=./npkit_dump \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
mpirun -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce"\
mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce"\
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json \
Expand All @@ -232,14 +232,14 @@ jobs:
export PATH=/usr/local/mpi/bin:$PATH \
export NPKIT_DUMP_DIR=./npkit_dump \
export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \
grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json \
grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json \
grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json \
rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output \
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' \
mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' \
python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \
grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \
grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json \
Expand Down
2 changes: 1 addition & 1 deletion test/deploy/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker pull ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --name=mscclpp-test \
-w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \
--entrypoint /bin/bash ${CONTAINERIMAGE}"
parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \
"sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"
Expand Down

0 comments on commit 8e70707

Please sign in to comment.