diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index 02047e58..403ed362 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -76,12 +76,13 @@ jobs: parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ set -e; \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: Bash@3 @@ -97,9 +98,10 @@ jobs: parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ set -e; \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: Bash@3 @@ -115,15 +117,16 @@ jobs: parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ set -e; \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 4 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 12M -e 48M -i 3145728 2 -k 5 -o output.jsonl \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 24K -e 768K -i 24576 -k 6 -w 100 -n 100 -o output.jsonl"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: Bash@3 @@ -138,10 +141,11 @@ jobs: parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \ -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ set -e; \ cd /root/mscclpp; \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ - mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl; \ + mpirun --allow-run-as-root -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: Bash@3 @@ -158,6 +162,7 @@ jobs: -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ set -e; \ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl"' workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -175,8 +180,9 @@ jobs: -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\ set -e; \ export PATH=/usr/local/mpi/bin:$PATH; \ + export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ python3 -m pip install . \ - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: AzureCLI@2 diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index c6358322..764f82e8 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -102,9 +102,9 @@ jobs: export PATH=/usr/local/mpi/bin:$PATH; \ cd /root/mscclpp; \ export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ - mpirun -tag-output -np 2 ./build/test/mp_unit_tests; \ - mpirun -tag-output -np 4 ./build/test/mp_unit_tests; \ - mpirun -tag-output -np 8 ./build/test/mp_unit_tests"' + mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 4 ./build/test/mp_unit_tests; \ + mpirun --allow-run-as-root -tag-output -np 8 ./build/test/mp_unit_tests"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: Bash@3 @@ -122,7 +122,7 @@ jobs: export PATH=/usr/local/mpi/bin:$PATH \ export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ cd /root/mscclpp; \ - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x"' workingDirectory: '$(System.DefaultWorkingDirectory)' - task: AzureCLI@2 @@ -208,7 +208,7 @@ jobs: export PATH=/usr/local/mpi/bin:$PATH \ export NPKIT_DUMP_DIR=./npkit_dump \ export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ - mpirun -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce"\ + mpirun --allow-run-as-root -tag-output -np 2 ./build/test/mp_unit_tests --gtest_filter="ExecutorTest.TwoNodesAllreduce"\ python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \ grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \ grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json \ @@ -232,14 +232,14 @@ jobs: export PATH=/usr/local/mpi/bin:$PATH \ export NPKIT_DUMP_DIR=./npkit_dump \ export LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH; \ - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' \ + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce.json' \ python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \ grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \ grep -q NPKIT_EVENT_EXECUTOR_SIGNAL_ENTRY ./npkit_output/npkit_event_trace.json \ grep -q NPKIT_EVENT_EXECUTOR_WAIT_ENTRY ./npkit_output/npkit_event_trace.json \ grep -q NPKIT_EVENT_EXECUTOR_READ_REDUCE_COPY_SEND_ENTRY ./npkit_output/npkit_event_trace.json \ rm -rf ./npkit_dump && mkdir ./npkit_dump && rm -rf ./npkit_output && mkdir ./npkit_output \ - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' \ + mpirun --allow-run-as-root -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ./python/test/test_mscclpp.py -x -k 'test_executor[allreduce_packet.json' \ python3 ./tools/npkit/npkit_trace_generator.py --npkit_dump_dir=./npkit_dump --npkit_event_header_path=./include/mscclpp/npkit/npkit_event.hpp --output_dir=./npkit_output \ grep -q NPKIT_EVENT_EXECUTOR_INIT_ENTRY ./npkit_output/npkit_event_trace.json \ grep -q NPKIT_EVENT_EXECUTOR_COPY_PACKET_ENTRY ./npkit_output/npkit_event_trace.json \ diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh index f63b3d45..bd549c4d 100644 --- a/test/deploy/deploy.sh +++ b/test/deploy/deploy.sh @@ -39,7 +39,7 @@ parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker pull ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker run --rm -itd --privileged --net=host --ipc=host --gpus=all \ - -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --name=mscclpp-test \ + -w /root -v ${DST_DIR}:/root/mscclpp -v /opt/microsoft:/opt/microsoft --ulimit memlock=-1:-1 --name=mscclpp-test \ --entrypoint /bin/bash ${CONTAINERIMAGE}" parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \ "sudo docker exec -t --user root mscclpp-test bash '/root/mscclpp/test/deploy/setup.sh'"