Skip to content

Commit

Permalink
Merge branch 'main' into binyli/memory_reduce
Browse files Browse the repository at this point in the history
  • Loading branch information
Binyang2014 authored Dec 9, 2024
2 parents d9ec000 + 7a3dcb0 commit 09b65f8
Show file tree
Hide file tree
Showing 5 changed files with 183 additions and 2 deletions.
168 changes: 168 additions & 0 deletions .azure-pipelines/nccl-api-test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# Azure Pipelines definition for the "MSCCLPP over NCCL" test.
#
# The job builds MSCCL++ inside a CUDA dev container, starts the CI VM scale
# set, deploys the test environment to the hosts in test/deploy/hostfile_ci,
# and runs nccl-tests' all_reduce_perf with libmscclpp_nccl.so preloaded.
# The scale set is deallocated at the end even if an earlier step failed.
trigger:
  - main

pr:
  branches:
    include:
      - main
  drafts: false

jobs:
  - job: NcclTest
    displayName: Run MSCCLPP over NCCL Test
    strategy:
      # Each matrix entry defines the `containerImage` variable read below.
      matrix:
        cuda11:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
        cuda12:
          containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.4
    pool:
      name: msccl-ci
    container:
      # Runtime expression: resolved per matrix entry when the job starts.
      image: $[ variables['containerImage'] ]

    steps:
      # Two checkouts: later steps expect the sources under
      # $(System.DefaultWorkingDirectory)/mscclpp and .../msccl-users.
      - checkout: self
      - checkout: git://One/msccl-users

      # Configure and compile MSCCL++ in the job container.
      # `set -e` stops at the first failing command instead of letting a later
      # command decide the step result; `mkdir -p` tolerates a reused workspace.
      - task: Bash@3
        name: Build
        displayName: Build
        inputs:
          targetType: inline
          script: |
            set -e
            mkdir -p build && cd build
            cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
            make -j
          workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

      # The step name `SshKeyFile` is what makes the downloaded key path
      # available to the scripts below as ${SSHKEYFILE_SECUREFILEPATH}.
      - task: DownloadSecureFile@1
        name: SshKeyFile
        displayName: Download key file
        inputs:
          secureFile: mscclpp.pem

      # Install pssh (parallel-ssh / parallel-scp) and the Azure CLI.
      # `pipefail` makes a failed download fail the `curl | bash` pipeline.
      - task: Bash@3
        name: InstallPackages
        displayName: Install Packages
        inputs:
          targetType: inline
          script: |
            set -eo pipefail
            sudo apt-get update -y
            sudo apt-get install pssh -y
            curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

      - task: AzureCLI@2
        name: StartVMSS
        displayName: Start VMSS
        inputs:
          azureSubscription: mscclpp-ci
          scriptType: bash
          scriptLocation: inlineScript
          inlineScript: |
            az vmss start --name mscclpp-ci --resource-group mscclpp

      # deploy.sh receives the test name as $1; "nccltest-single-node" makes it
      # use test/deploy/hostfile_ci and the mscclpp/ checkout subdirectory.
      - task: Bash@3
        name: DeployTestEnv
        displayName: Deploy Test Env
        inputs:
          targetType: filePath
          filePath: mscclpp/test/deploy/deploy.sh
          arguments: "nccltest-single-node"
          workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'

      # Copy the msccl-users checkout to /tmp/mscclpp/msccl-users on every host.
      - task: Bash@3
        name: CopyMscclUsers
        displayName: Copy msccl-users
        inputs:
          targetType: inline
          script: |
            set -e
            HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
            ROOT_DIR=$(System.DefaultWorkingDirectory)/msccl-users
            SSH_OPTION="StrictHostKeyChecking=no"
            KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
            DST_DIR="/tmp/mscclpp/msccl-users"
            parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Clone and pip-install msccl-tools inside the `mscclpp-test` container
      # on each host.
      - task: Bash@3
        name: InstallMscclTools
        displayName: Install msccl-tools
        inputs:
          targetType: inline
          script: |
            set -e
            HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
            SSH_OPTION="StrictHostKeyChecking=no"
            KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
            parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
              -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
              cd /root/mscclpp; \
              git clone https://github.com/Azure/msccl-tools.git; \
              cd /root/mscclpp/msccl-tools; pip3 install ."'
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Run the plan generator from msccl-users; the allreduce step below reads
      # plans from /root/mscclpp/msccl-users/execution-files.
      - task: Bash@3
        name: GenerateExecutionFile
        displayName: Generate execution file
        inputs:
          targetType: inline
          script: |
            set -e
            HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
            SSH_OPTION="StrictHostKeyChecking=no"
            KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
            parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
              -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
              cd /root/mscclpp/msccl-users; \
              mkdir -p execution-files; \
              bash algos/mscclpp_a100/generate_execution_plan.sh"'
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Clone nccl-tests into the container user's home directory and build it
      # with MPI support.
      - task: Bash@3
        name: InstallNcclTests
        displayName: Install NCCL Tests
        inputs:
          targetType: inline
          script: |
            set -e
            HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
            SSH_OPTION="StrictHostKeyChecking=no"
            KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
            parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
              -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
              cd; git clone https://github.com/NVIDIA/nccl-tests.git; \
              cd nccl-tests; \
              MPI=1 MPI_HOME=/usr/local/mpi make -j"'
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # Launch all_reduce_perf on 8 ranks. LD_PRELOAD substitutes the MSCCL++
      # NCCL library, and MSCCLPP_EXECUTION_PLAN_DIR points at the generated
      # execution files.
      - task: Bash@3
        name: RunNcclAllreduceTest
        displayName: Run NCCL Allreduce Test
        inputs:
          targetType: inline
          script: |
            set -e
            HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
            SSH_OPTION="StrictHostKeyChecking=no"
            KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
            parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
              -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c "\
              cd /root/mscclpp; \
              mpirun -np 8 --bind-to numa --allow-run-as-root -x LD_PRELOAD=/root/mscclpp/build/apps/nccl/libmscclpp_nccl.so -x NCCL_DEBUG=WARN -x MSCCLPP_EXECUTION_PLAN_DIR=/root/mscclpp/msccl-users/execution-files /root/nccl-tests/build/all_reduce_perf -b 1K -e 1G -f 2 -d half -G 20 -w 10 -n 20"'
          workingDirectory: '$(System.DefaultWorkingDirectory)'

      # `condition: always()` runs this step even when an earlier step failed,
      # so the scale set is not left running.
      - task: AzureCLI@2
        name: StopVMSS
        displayName: Deallocate VMSS
        condition: always()
        inputs:
          azureSubscription: mscclpp-ci
          scriptType: bash
          scriptLocation: inlineScript
          inlineScript: |
            az vmss deallocate --name mscclpp-ci --resource-group mscclpp
1 change: 1 addition & 0 deletions docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
# Derive the requirements-file suffix by stripping a trailing ".<digits>" from
# TARGET (e.g. "cuda12.1" -> "cuda12"), upgrade pip, then install the matching
# python/requirements_<suffix>.txt. --no-cache-dir skips pip's download cache.
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Set PATH
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "mscclpp"
version = "0.5.2"

[tool.scikit-build]
cmake.minimum-version = "3.25.0"
cmake.version = ">=3.25.0"
build-dir = "build/{wheel_tag}"
wheel.packages = ["python/mscclpp", "python/mscclpp_benchmark"]
wheel.install-dir = "mscclpp"
Expand Down
13 changes: 12 additions & 1 deletion test/deploy/deploy.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
set -e

# Get the test name from the first positional argument ($1)
TEST_NAME=$1

KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
ROOT_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/"
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
ROOT_DIR="${ROOT_DIR}/mscclpp"
SYSTEM_DEFAULTWORKINGDIRECTORY="${SYSTEM_DEFAULTWORKINGDIRECTORY}/mscclpp"
fi
DST_DIR="/tmp/mscclpp"
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
if [ "${TEST_NAME}" == "nccltest-single-node" ]; then
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile_ci"
else
HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
fi
SSH_OPTION="StrictHostKeyChecking=no"

chmod 400 ${KeyFilePath}
Expand Down
1 change: 1 addition & 0 deletions test/deploy/hostfile_ci
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[email protected]

0 comments on commit 09b65f8

Please sign in to comment.