add __launch_bounds__ for mscclpp_test
Binyang2014 committed Mar 13, 2024
1 parent 4734d87 commit 23c6996
Showing 5 changed files with 14 additions and 12 deletions.
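For context, __launch_bounds__(1024) is a CUDA qualifier promising the compiler that a kernel will never be launched with more than 1024 threads per block, so it can budget registers against that bound rather than guard against larger configurations. The minimal sketch below is hypothetical (it is not part of this commit or the mscclpp tests) and only illustrates how the qualifier is written and how the launch configuration must stay within it:

#include <cuda_runtime.h>

// Hypothetical toy kernel: __launch_bounds__(1024) tells the compiler that no
// launch will exceed 1024 threads per block, so it can allocate registers for
// that bound.
__global__ void __launch_bounds__(1024) fill(int* out, int value, size_t n) {
  size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) out[i] = value;
}

int main() {
  const size_t n = 1 << 20;
  int* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(int));
  // The launch configuration must respect the declared bound: 1024 threads per block here.
  fill<<<static_cast<unsigned>((n + 1023) / 1024), 1024>>>(d_out, 7, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}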
10 changes: 5 additions & 5 deletions test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[512];
__constant__ DeviceHandle<mscclpp::SmChannel> constSmOutOfPlaceChans[16];
__device__ uint64_t globalFlag;

-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
}

-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

Expand All @@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
nelemsPerGPU * sizeof(int));
}

-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
int warpId = threadIdx.x / WARP_SIZE;
int remoteRank = (warpId < rank) ? warpId : warpId + 1;

@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
}
}

-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
int warpId = threadIdx.x / WARP_SIZE;

// Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@ }
}
}

-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
// this allgather is a pipelined and hierarchical one and only works for two nodes
// it is implemented as follows:
// Step 1: each node does a local allgather and concurrently,
8 changes: 5 additions & 3 deletions test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
nBlocksForLocalAllGather);
}

-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
int peerId = blockIdx.x / BLOCKS_PER_PEER;
int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
}
}

-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
int numPeersPerNode = nRanksPerNode - 1;
size_t nPkts = nelems / 2; // 2 elems per packet, assume nelems is even
size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
4 changes: 2 additions & 2 deletions test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
}
}

-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
}
}

-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
localAlltoall(rank, nRanksPerNode, nElements);
}

2 changes: 1 addition & 1 deletion test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
str(key),
value["time"],
baseline[key]["time"],
-time_threshold,
+threshold,
)
res = False
return res
2 changes: 1 addition & 1 deletion test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode

__device__ mscclpp::DeviceSyncer deviceSyncer;

-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
size_t startIndex = blockIdx.x * dataPerBlock;
size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
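One practical consequence, shown in another hypothetical sketch (again not part of this commit): the effective per-kernel thread limit can be queried with cudaFuncGetAttributes, and a launch configuration exceeding that limit fails with an invalid-configuration error instead of running.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical stand-in for one of the test kernels above.
__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
  (void)dataSize;
  (void)dataPerBlock;  // body omitted in this sketch
}

int main() {
  cudaFuncAttributes attr;
  // attr.maxThreadsPerBlock is capped at 1024 by the __launch_bounds__ qualifier
  // (it can be lower still if register usage demands it).
  cudaFuncGetAttributes(&attr, kernel);
  printf("maxThreadsPerBlock = %d\n", attr.maxThreadsPerBlock);
  return 0;
}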
