add __launch_bounds__ for mscclpp_test
Binyang2014 committed Mar 13, 2024
1 parent 4734d87 commit 23c6996
Showing 5 changed files with 14 additions and 12 deletions.
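For context, __launch_bounds__(1024) is a CUDA qualifier promising the compiler that a kernel will never be launched with more than 1024 threads per block, so it can budget registers against that bound rather than guard against larger configurations. The minimal sketch below is hypothetical (it is not part of this commit or the mscclpp tests) and only illustrates how the qualifier is written and how the launch configuration must stay within it:

#include <cuda_runtime.h>

// Hypothetical toy kernel: __launch_bounds__(1024) tells the compiler that no
// launch will exceed 1024 threads per block, so it can allocate registers for
// that bound.
__global__ void __launch_bounds__(1024) fill(int* out, int value, size_t n) {
  size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

int main() {
  const size_t n = 1 << 20;
  int* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(int));
  // The launch configuration must respect the declared bound: 1024 threads per block here.
  fill<<<static_cast<unsigned>((n + 1023) / 1024), 1024>>>(d_out, 7, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}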
10 changes: 5 additions & 5 deletions test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[512];
__constant__ DeviceHandle<mscclpp::SmChannel> constSmOutOfPlaceChans[16];
__device__ uint64_t globalFlag;

-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
}

-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

Expand All @@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
nelemsPerGPU * sizeof(int));
}

-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
}
}

-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@ }
}
}

-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
// this allgather is a pipelined and hierarchical one and only works for two nodes
// it is implemented as follows:
// Step 1: each node does a local allgather and concurrently,
8 changes: 5 additions & 3 deletions test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
nBlocksForLocalAllGather);
}

-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
int peerId = blockIdx.x / BLOCKS_PER_PEER;
int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
}
}

-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
int numPeersPerNode = nRanksPerNode - 1;
size_t nPkts = nelems / 2; // 2 elems per packet, assume nelems is even
size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
4 changes: 2 additions & 2 deletions test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
}
}

-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
}
}

-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
localAlltoall(rank, nRanksPerNode, nElements);
}

2 changes: 1 addition & 1 deletion test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
str(key),
value["time"],
baseline[key]["time"],
-time_threshold,
+threshold,
)
res = False
return res
2 changes: 1 addition & 1 deletion test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode

__device__ mscclpp::DeviceSyncer deviceSyncer;

-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
size_t startIndex = blockIdx.x * dataPerBlock;
size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
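One practical consequence, shown in another hypothetical sketch (again not part of this commit): the effective per-kernel thread limit can be queried with cudaFuncGetAttributes, and a launch configuration exceeding that limit fails with an invalid-configuration error instead of running.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for one of the test kernels above.
__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
  (void)dataSize;
  (void)dataPerBlock;  // body omitted in this sketch
}

int main() {
  cudaFuncAttributes attr;
  // attr.maxThreadsPerBlock is capped at 1024 by the __launch_bounds__ qualifier
  // (it can be lower still if register usage demands it).
  cudaFuncGetAttributes(&attr, kernel);
  printf("maxThreadsPerBlock = %d\n", attr.maxThreadsPerBlock);
  return 0;
}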
