add __launch_bounds__ for mscclpp_test #273

Merged 2 commits on Mar 25, 2024.
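For context: `__launch_bounds__(1024)` promises the compiler that the annotated kernel will never be launched with more than 1024 threads per block. The compiler can then budget registers so that a full 1024-thread block is always able to launch, and a launch exceeding the bound fails with a configuration error. A minimal sketch of the qualifier (the `scale` kernel below is illustrative only, not code from this PR):

```cuda
#include <cstdio>

// Stand-in kernel, not from this PR: __launch_bounds__(1024) promises the
// compiler that no launch will exceed 1024 threads per block, so it can cap
// per-thread register use such that a full 1024-thread block always fits.
__global__ void __launch_bounds__(1024) scale(float* data, float factor, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  size_t n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  unsigned int blocks = (unsigned int)((n + 1023) / 1024);
  // 1024 threads per block matches the bound; launching with more would fail.
  scale<<<blocks, 1024>>>(d, 2.0f, n);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}
```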
Changes from all commits:
6 changes: 3 additions & 3 deletions test/deploy/perf_ndmv4.jsonl
@@ -11,9 +11,9 @@
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"}
 {"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
 {"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
 {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}
10 changes: 5 additions & 5 deletions test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[512];
 __constant__ DeviceHandle<mscclpp::SmChannel> constSmOutOfPlaceChans[16];
 __device__ uint64_t globalFlag;
 
-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
 
   // Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
   constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }
 
-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
 
@@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
                    nelemsPerGPU * sizeof(int));
 }
 
-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
 
@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   }
 }
 
-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
   int warpId = threadIdx.x / WARP_SIZE;
 
   // Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@
   }
 }
 
-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   // this allgather is a pipelined and hierarchical one and only works for two nodes
   // it is implemented as follows:
   // Step 1: each node does a local allgather and concurrently,
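One way to verify the annotation took effect is to query the compiled kernel's attributes; with `__launch_bounds__(1024)` the reported `maxThreadsPerBlock` should be 1024 (assuming the device's own per-block limit is at least that). A hedged host-side sketch with a stand-in kernel (`boundedKernel` is illustrative, not from the tests):

```cuda
#include <cstdio>

__global__ void __launch_bounds__(1024) boundedKernel(int* out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] = threadIdx.x;
}

int main() {
  cudaFuncAttributes attr;
  // The compiler records the bound in the function attributes; with
  // __launch_bounds__(1024) this should report maxThreadsPerBlock == 1024
  // rather than a smaller, register-limited value.
  cudaFuncGetAttributes(&attr, reinterpret_cast<const void*>(boundedKernel));
  printf("maxThreadsPerBlock=%d, numRegs=%d\n", attr.maxThreadsPerBlock, attr.numRegs);
  return 0;
}
```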
8 changes: 5 additions & 3 deletions test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
                 nBlocksForLocalAllGather);
 }
 
-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
   int peerId = blockIdx.x / BLOCKS_PER_PEER;
   int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
   int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
   }
 }
 
-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
   int numPeersPerNode = nRanksPerNode - 1;
   size_t nPkts = nelems / 2; // 2 elems per packet, assume nelems is even
   size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
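Aside: `__launch_bounds__` also takes an optional second argument, the minimum number of resident blocks per multiprocessor, which further constrains register allocation. This PR uses only the single-argument form; a hypothetical two-argument variant would look like:

```cuda
// Hypothetical (not used in this PR): allow up to 1024 threads per block AND
// keep register pressure low enough for at least 2 resident blocks per SM.
__global__ void __launch_bounds__(1024, 2) allreduceVariant(int* buff, size_t nelems) {
  // ... kernel body ...
}
```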
4 changes: 2 additions & 2 deletions test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   }
 }
 
-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
   if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
   }
 }
 
-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
   localAlltoall(rank, nRanksPerNode, nElements);
 }
2 changes: 1 addition & 1 deletion test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
                 str(key),
                 value["time"],
                 baseline[key]["time"],
-                time_threshold,
+                threshold,
             )
             res = False
     return res
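The `check_perf_result.py` change makes the error log report `threshold`, presumably the value actually used in the comparison (which may be derived from the raw `time_threshold` argument depending on the entry's target), so the logged threshold matches the one the result was judged against.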
2 changes: 1 addition & 1 deletion test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode
 
 __device__ mscclpp::DeviceSyncer deviceSyncer;
 
-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
   size_t startIndex = blockIdx.x * dataPerBlock;
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
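Beyond guaranteeing launchability, the bound feeds into occupancy: a tighter register budget can allow more blocks to be resident per SM. A hedged sketch using the occupancy API with a stand-in kernel (`standIn` is illustrative, not from the tests):

```cuda
#include <cstdio>

// Stand-in kernel with the same bound as the test kernels in this PR.
__global__ void __launch_bounds__(1024) standIn(float* out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] *= 2.0f;
}

int main() {
  int blocksPerSm = 0;
  // Ask the runtime how many 1024-thread blocks of this kernel can be
  // resident on one SM given its register/shared-memory footprint.
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, standIn, 1024, 0);
  printf("resident blocks per SM at 1024 threads/block: %d\n", blocksPerSm);
  return 0;
}
```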