Skip to content

Commit

Permalink
Fix multi-node ci pipeline (#272)
Browse files Browse the repository at this point in the history
Add `__launch_bounds__` to fix perf regression issue in CI pipeline
  • Loading branch information
Binyang2014 authored Mar 12, 2024
1 parent cdaf3ae commit 4734d87
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions test/mscclpp-test/allreduce_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -949,13 +949,15 @@ __global__ void __launch_bounds__(1024)
}
}

// Allreduce kernel (variant 4): reduce-scatter across ranks, then all-gather
// of the reduced shards. Intended for multi-node runs (takes worldSize and
// nRanksPerNode separately).
//
// __launch_bounds__(1024) caps the threads-per-block the compiler assumes,
// bounding register allocation — added to fix a perf regression seen in the
// CI pipeline (see commit message).
//
// NOTE(review): reduceScatterSm / allGatherSm / deviceSyncer are defined
// elsewhere in this file; grid/block layout requirements come from them.
__global__ void __launch_bounds__(1024)
    allreduce4(int* buff, int* scratch, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
  reduceScatterSm(buff, scratch, rank, nRanksPerNode, worldSize, nelems);
  // Grid-wide barrier: every block must complete the reduce-scatter phase
  // before any block starts gathering, otherwise stale shards are read.
  deviceSyncer.sync(gridDim.x);
  // Each rank now owns nelems / worldSize reduced elements; gather them all.
  allGatherSm(rank, worldSize, nRanksPerNode, nelems / worldSize);
}

__global__ void allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
__global__ void __launch_bounds__(1024)
allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
#if defined(__HIP_PLATFORM_AMD__)
localReduceScatterSm3(buff, rank, nRanksPerNode, nelems / worldSize, nelems / worldSize, gridDim.x);
deviceSyncer.sync(gridDim.x);
Expand All @@ -967,8 +969,8 @@ __global__ void allreduce5(int* buff, int rank, int nRanksPerNode, int worldSize
#endif
}

__global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize,
size_t nelems) {
__global__ void __launch_bounds__(1024)
allreduce6(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
// This version of allreduce only works for single nodes
const int nPeers = nRanksPerNode - 1;
const size_t nPkts = nelems / 2;
Expand Down Expand Up @@ -1033,8 +1035,8 @@ __global__ void allreduce6(int* buff, int* scratch, void* resultBuff, int rank,
}
}

__global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize,
size_t nelems) {
__global__ void __launch_bounds__(1024)
allreduce7(int* buff, int* scratch, void* resultBuff, int rank, int nRanksPerNode, int worldSize, size_t nelems) {
// This version of allreduce only works for single nodes
const int nPeers = nRanksPerNode - 1;
const size_t nPkts = nelems;
Expand Down Expand Up @@ -1163,13 +1165,12 @@ void AllReduceTestColl::runColl(const TestArgs& args, cudaStream_t stream) {
else if (kernelNum == 5)
allreduce5<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, rank, args.nRanksPerNode, worldSize,
paramCount_);
else if (kernelNum == 6) {
else if (kernelNum == 6)
allreduce6<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
args.nRanksPerNode, worldSize, paramCount_);
} else if (kernelNum == 7) {
else if (kernelNum == 7)
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>((int*)inputBuff, (int*)tmpBuff, resultBuff, rank,
args.nRanksPerNode, worldSize, paramCount_);
}
}

void AllReduceTestColl::initData(const TestArgs& args, std::vector<void*> sendBuff, void* expectedBuff) {
Expand Down

0 comments on commit 4734d87

Please sign in to comment.