diff --git a/test/deploy/perf_ndmv4.jsonl b/test/deploy/perf_ndmv4.jsonl
index 8d76e9059..757998851 100644
--- a/test/deploy/perf_ndmv4.jsonl
+++ b/test/deploy/perf_ndmv4.jsonl
@@ -11,9 +11,9 @@
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"}
 {"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
 {"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
 {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}
diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu
index 5c101bbd2..714b2858d 100644
--- a/test/mscclpp-test/allgather_test.cu
+++ b/test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle constSmChans[512];
 __constant__ DeviceHandle constSmOutOfPlaceChans[16];
 __device__ uint64_t globalFlag;

-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;

   // Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
   constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }

-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
                    nelemsPerGPU * sizeof(int));
 }

-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   }
 }

-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
   int warpId = threadIdx.x / WARP_SIZE;

   // Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@ __global__ void allgather3() {
   }
 }

-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   // this allgather is a pipelined and hierarchical one and only works for two nodes
   // it is implemented as follows:
   // Step 1: each node does a local allgather and concurrently,
diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu
index 9eb2596e9..84eb694b1 100644
--- a/test/mscclpp-test/allreduce_test.cu
+++ b/test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
                 nBlocksForLocalAllGather);
 }

-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
   int peerId = blockIdx.x / BLOCKS_PER_PEER;
   int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
   int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
   }
 }

-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
   int numPeersPerNode = nRanksPerNode - 1;
   size_t nPkts = nelems / 2;  // 2 elems per packet, assume nelems is even
   size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu
index 8bb0b5a6d..a1881af91 100644
--- a/test/mscclpp-test/alltoall_test.cu
+++ b/test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   }
 }

-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   DeviceHandle proxyChan = constProxyChans[blockIdx.x];
   if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
   }
 }

-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
   localAlltoall(rank, nRanksPerNode, nElements);
 }

diff --git a/test/mscclpp-test/check_perf_result.py b/test/mscclpp-test/check_perf_result.py
index 1430526ec..22e946794 100644
--- a/test/mscclpp-test/check_perf_result.py
+++ b/test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
                 str(key),
                 value["time"],
                 baseline[key]["time"],
-                time_threshold,
+                threshold,
             )
             res = False
     return res
diff --git a/test/mscclpp-test/sendrecv_test.cu b/test/mscclpp-test/sendrecv_test.cu
index 1170dc4ad..b0f830a1a 100644
--- a/test/mscclpp-test/sendrecv_test.cu
+++ b/test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode

 __device__ mscclpp::DeviceSyncer deviceSyncer;

-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
   size_t startIndex = blockIdx.x * dataPerBlock;
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
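
Note on the recurring kernel change above: __launch_bounds__(1024) tells the compiler that the annotated kernel is never launched with more than 1024 threads per block, so it can choose a per-thread register budget that keeps the kernel launchable at that block size. The following is a minimal standalone sketch of the pattern; the fill kernel and its launch configuration are illustrative only and are not part of this diff or of the mscclpp tests.

// Minimal illustration of __launch_bounds__, separate from the mscclpp test kernels above.
// The bound promises the compiler that blockDim.x * blockDim.y * blockDim.z <= 1024 at launch.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void __launch_bounds__(1024) fill(int* out, int value, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

int main() {
  const size_t n = 1 << 20;
  int* d = nullptr;
  cudaMalloc((void**)&d, n * sizeof(int));
  // Launch with exactly 1024 threads per block, matching the declared bound.
  fill<<<(unsigned)((n + 1023) / 1024), 1024>>>(d, 7, n);
  cudaDeviceSynchronize();
  cudaFree(d);
  printf("done\n");
  return 0;
}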