Add __launch_bounds__ for mscclpp-test (#273)
Binyang2014 authored Mar 25, 2024
1 parent 4734d87 commit bc465ae
Showing 6 changed files with 17 additions and 15 deletions.
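
For context: __launch_bounds__(1024) promises the compiler that the annotated kernel is never launched with more than 1024 threads per block, letting it budget registers so such launches cannot fail at runtime with "too many resources requested for launch". A minimal sketch of the attribute (the scale kernel below is illustrative, not part of this commit):

#include <cuda_runtime.h>

// Cap the block size at 1024 threads; the compiler limits per-thread
// register usage so a 1024-thread launch always fits on the SM.
__global__ void __launch_bounds__(1024) scale(float* data, float factor, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  float* d = nullptr;
  size_t n = 1 << 20;
  cudaMalloc(&d, n * sizeof(float));
  scale<<<(n + 1023) / 1024, 1024>>>(d, 2.0f, n);  // block size must not exceed the bound
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}
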
6 changes: 3 additions & 3 deletions test/deploy/perf_ndmv4.jsonl
@@ -11,9 +11,9 @@
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
{"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"}
{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"}
{"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
{"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
{"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}
10 changes: 5 additions & 5 deletions test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[512];
__constant__ DeviceHandle<mscclpp::SmChannel> constSmOutOfPlaceChans[16];
__device__ uint64_t globalFlag;

-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
}

-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
nelemsPerGPU * sizeof(int));
}

-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
}
}

-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@
}
}

-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
// this allgather is a pipelined and hierarchical one and only works for two nodes
// it is implemented as follows:
// Step 1: each node does a local allgather and concurrently,
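
One practical effect of the added bounds: the host can confirm each kernel remains launchable at the intended 1024-thread block size. A sketch of such a check (the probe kernel is a stand-in for the test kernels, whose channel-handle parameters are omitted here):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void __launch_bounds__(1024) probe() {}

int main() {
  cudaFuncAttributes attr;
  cudaFuncGetAttributes(&attr, probe);
  // With the bound in place, maxThreadsPerBlock is at least 1024; without
  // it, high register pressure can push the limit below the launch size.
  printf("maxThreadsPerBlock=%d, numRegs=%d\n", attr.maxThreadsPerBlock, attr.numRegs);
  return 0;
}
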
8 changes: 5 additions & 3 deletions test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
nBlocksForLocalAllGather);
}

-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
int peerId = blockIdx.x / BLOCKS_PER_PEER;
int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
}
}

-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
int numPeersPerNode = nRanksPerNode - 1;
size_t nPkts = nelems / 2; // 2 elems per packet, assume nelems is even
size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
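
When the signature is long, the attribute moves onto its own line, as in the allreduce0 and allreduce2 hunks above. For reference, __launch_bounds__ also accepts an optional second argument, a minimum number of resident blocks per SM; this commit uses only the single-argument form, so the variant below is purely illustrative:

// Illustrative only: besides capping the block size at 1024 threads, ask
// the compiler to keep at least 2 blocks resident per SM.
__global__ void __launch_bounds__(1024, 2) addOne(float* out, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) out[i] += 1.0f;
}
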
4 changes: 2 additions & 2 deletions test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
}
}

-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
}
}

-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
localAlltoall(rank, nRanksPerNode, nElements);
}

2 changes: 1 addition & 1 deletion test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
str(key),
value["time"],
baseline[key]["time"],
-time_threshold,
+threshold,
)
res = False
return res
2 changes: 1 addition & 1 deletion test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode

__device__ mscclpp::DeviceSyncer deviceSyncer;

-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
size_t startIndex = blockIdx.x * dataPerBlock;
size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
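
The hunk above also shows the kernel's partitioning scheme: each block owns a dataPerBlock-sized slice, with the final block clamped by min(). A standalone sketch of the same pattern (copyKernel is illustrative, not the test's kernel):

__global__ void __launch_bounds__(1024) copyKernel(const char* src, char* dst, size_t dataSize,
                                                   size_t dataPerBlock) {
  size_t startIndex = blockIdx.x * dataPerBlock;
  size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);  // clamp the tail block
  for (size_t i = threadIdx.x; i < blockDataSize; i += blockDim.x) {
    dst[startIndex + i] = src[startIndex + i];
  }
}
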
