add __launch_bounds__ for mscclpp_test #273

Merged 2 commits on Mar 25, 2024.
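For context: `__launch_bounds__(1024)` promises the compiler that the annotated kernel will never be launched with more than 1024 threads per block. The compiler can then budget registers so that a full 1024-thread block is always able to launch, and a launch exceeding the bound fails with a configuration error. A minimal sketch of the qualifier (the `scale` kernel below is illustrative only, not code from this PR):

```cuda
#include <cstdio>

// Stand-in kernel, not from this PR: __launch_bounds__(1024) promises the
// compiler that no launch will exceed 1024 threads per block, so it can cap
// per-thread register use such that a full 1024-thread block always fits.
__global__ void __launch_bounds__(1024) scale(float* data, float factor, size_t n) {
  size_t i = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  size_t n = 1 << 20;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  unsigned int blocks = (unsigned int)((n + 1023) / 1024);
  // 1024 threads per block matches the bound; launching with more would fail.
  scale<<<blocks, 1024>>>(d, 2.0f, n);
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}
```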
Changes from all commits:
6 changes: 3 additions & 3 deletions test/deploy/perf_ndmv4.jsonl
@@ -11,9 +11,9 @@
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":3.3919,"busBw":5.9359, "size":24576, "time":7.24, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":6.21, "busBw":10.87, "size":49152, "time":7.91, "target":"latency"}
 {"name":"allreduce", "kernel":6, "ranks":8, "ranksPerNode":8, "algBw":8.90, "busBw":15.57, "size":73728, "time":8.28, "target":"latency"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":84.55, "busBw":158.53, "size":25165824, "time":297.64, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":99.43, "busBw":186.44, "size":50331648, "time":506.16, "target":"throughput"}
-{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":124.60, "busBw":233.64, "size":3221225472, "time":25850.67,"target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":80.84, "busBw":151.58, "size":25165824, "time":311.28, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":97.27, "busBw":182.38, "size":50331648, "time":517.43, "target":"throughput"}
+{"name":"allreduce", "kernel":4, "ranks":16,"ranksPerNode":8, "algBw":125.99, "busBw":236.24, "size":3221225472, "time":25565.46,"target":"throughput"}
 {"name":"allreduce", "kernel":3, "ranks":16,"ranksPerNode":8, "algBw":119.5, "busBw":224.06, "size":3221225472, "time":26955.85,"target":"throughput"}
 {"name":"alltoall", "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.53, "busBw":43.63, "size":1073741824, "time":23071.5, "target":"throughput"}
 {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":276.17, "busBw":241.65, "size":1073741824, "time":3887.87, "target":"throughput"}
10 changes: 5 additions & 5 deletions test/mscclpp-test/allgather_test.cu
@@ -27,7 +27,7 @@ __constant__ DeviceHandle<mscclpp::SmChannel> constSmChans[512];
 __constant__ DeviceHandle<mscclpp::SmChannel> constSmOutOfPlaceChans[16];
 __device__ uint64_t globalFlag;
 
-__global__ void allgather0(int rank, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather0(int rank, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
 
   // Each warp is responsible for one of the remote ranks
@@ -124,7 +124,7 @@ __device__ void localAllGatherSm(int rank, int nRanksPerNode, int startRankChunk
   constSmChans[peerIdx].get(offset + offsetForThisBlock, sizeForThisBlock, threadIdx.x, blockDim.x);
 }
 
-__global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
 
@@ -135,7 +135,7 @@ __global__ void allgather1(int rank, int nRanksPerNode, size_t nelemsPerGPU) {
                    nelemsPerGPU * sizeof(int));
 }
 
-__global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather2(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   int warpId = threadIdx.x / WARP_SIZE;
   int remoteRank = (warpId < rank) ? warpId : warpId + 1;
 
@@ -210,7 +210,7 @@ __global__ void allgather2(int rank, int worldSize, int nRanksPerNode, size_t ne
   }
 }
 
-__global__ void allgather3() {
+__global__ void __launch_bounds__(1024) allgather3() {
   int warpId = threadIdx.x / WARP_SIZE;
 
   // Each warp is responsible for one of the remote ranks
@@ -232,7 +232,7 @@
   }
 }
 
-__global__ void allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
+__global__ void __launch_bounds__(1024) allgather4(int rank, int worldSize, int nRanksPerNode, size_t nelemsPerGPU) {
   // this allgather is a pipelined and hierarchical one and only works for two nodes
   // it is implemented as follows:
   // Step 1: each node does a local allgather and concurrently,
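One way to verify the annotation took effect is to query the compiled kernel's attributes; with `__launch_bounds__(1024)` the reported `maxThreadsPerBlock` should be 1024 (assuming the device's own per-block limit is at least that). A hedged host-side sketch with a stand-in kernel (`boundedKernel` is illustrative, not from the tests):

```cuda
#include <cstdio>

__global__ void __launch_bounds__(1024) boundedKernel(int* out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] = threadIdx.x;
}

int main() {
  cudaFuncAttributes attr;
  // The compiler records the bound in the function attributes; with
  // __launch_bounds__(1024) this should report maxThreadsPerBlock == 1024
  // rather than a smaller, register-limited value.
  cudaFuncGetAttributes(&attr, reinterpret_cast<const void*>(boundedKernel));
  printf("maxThreadsPerBlock=%d, numRegs=%d\n", attr.maxThreadsPerBlock, attr.numRegs);
  return 0;
}
```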
8 changes: 5 additions & 3 deletions test/mscclpp-test/allreduce_test.cu
@@ -674,7 +674,8 @@ __device__ void allGatherSm(int rank, int worldSize, int nRanksPerNode, size_t n
                 nBlocksForLocalAllGather);
 }
 
-__global__ void allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
+__global__ void __launch_bounds__(1024)
+    allreduce0(int* buff, int* scratch, int rank, int worldSize, size_t nelems, size_t scratchDataCount) {
   int peerId = blockIdx.x / BLOCKS_PER_PEER;
   int isComm = (threadIdx.x == 0) && (blockIdx.x % BLOCKS_PER_PEER == 0);
   int remoteRank = (peerId < rank) ? peerId : peerId + 1;
@@ -836,8 +837,9 @@ __global__ void __launch_bounds__(1024) allreduce1(int* buff, int* scratch, int
   }
 }
 
-__global__ void allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank,
-                           int nRanksPerNode, int worldSize, size_t nelems) {
+__global__ void __launch_bounds__(1024)
+    allreduce2(int* buff, void* scratch, void* putPktBuf, void* getPktBuf, void* result, int rank, int nRanksPerNode,
+               int worldSize, size_t nelems) {
   int numPeersPerNode = nRanksPerNode - 1;
   size_t nPkts = nelems / 2; // 2 elems per packet, assume nelems is even
   size_t pktBytes = nPkts * sizeof(mscclpp::LLPacket);
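Aside: `__launch_bounds__` also takes an optional second argument, the minimum number of resident blocks per multiprocessor, which further constrains register allocation. This PR uses only the single-argument form; a hypothetical two-argument variant would look like:

```cuda
// Hypothetical (not used in this PR): allow up to 1024 threads per block AND
// keep register pressure low enough for at least 2 resident blocks per SM.
__global__ void __launch_bounds__(1024, 2) allreduceVariant(int* buff, size_t nelems) {
  // ... kernel body ...
}
```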
4 changes: 2 additions & 2 deletions test/mscclpp-test/alltoall_test.cu
@@ -29,7 +29,7 @@ __device__ void localAlltoall(int rank, int nRanksPerNode, size_t nElements) {
   }
 }
 
-__global__ void alltoall0(int rank, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall0(int rank, size_t nElements) {
   int remoteRank = ((int)blockIdx.x < rank) ? blockIdx.x : blockIdx.x + 1;
   DeviceHandle<mscclpp::SimpleProxyChannel> proxyChan = constProxyChans[blockIdx.x];
   if (threadIdx.x == 0) {
@@ -44,7 +44,7 @@ __global__ void alltoall0(int rank, size_t nElements) {
   }
 }
 
-__global__ void alltoall1(int rank, int nRanksPerNode, size_t nElements) {
+__global__ void __launch_bounds__(1024) alltoall1(int rank, int nRanksPerNode, size_t nElements) {
   localAlltoall(rank, nRanksPerNode, nElements);
 }
2 changes: 1 addition & 1 deletion test/mscclpp-test/check_perf_result.py
@@ -38,7 +38,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
                 str(key),
                 value["time"],
                 baseline[key]["time"],
-                time_threshold,
+                threshold,
             )
             res = False
     return res
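The `check_perf_result.py` change makes the error log report `threshold`, presumably the value actually used in the comparison (which may be derived from the raw `time_threshold` argument depending on the entry's target), so the logged threshold matches the one the result was judged against.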
2 changes: 1 addition & 1 deletion test/mscclpp-test/sendrecv_test.cu
@@ -36,7 +36,7 @@ inline mscclpp::Transport getTransport(int rank, int peerRank, int nRanksPerNode
 
 __device__ mscclpp::DeviceSyncer deviceSyncer;
 
-__global__ void kernel(size_t dataSize, size_t dataPerBlock) {
+__global__ void __launch_bounds__(1024) kernel(size_t dataSize, size_t dataPerBlock) {
   size_t startIndex = blockIdx.x * dataPerBlock;
   size_t blockDataSize = min(dataSize - startIndex, dataPerBlock);
   int globalIndex = blockIdx.x * blockDim.x + threadIdx.x;
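Beyond guaranteeing launchability, the bound feeds into occupancy: a tighter register budget can allow more blocks to be resident per SM. A hedged sketch using the occupancy API with a stand-in kernel (`standIn` is illustrative, not from the tests):

```cuda
#include <cstdio>

// Stand-in kernel with the same bound as the test kernels in this PR.
__global__ void __launch_bounds__(1024) standIn(float* out) {
  out[blockIdx.x * blockDim.x + threadIdx.x] *= 2.0f;
}

int main() {
  int blocksPerSm = 0;
  // Ask the runtime how many 1024-thread blocks of this kernel can be
  // resident on one SM given its register/shared-memory footprint.
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, standIn, 1024, 0);
  printf("resident blocks per SM at 1024 threads/block: %d\n", blocksPerSm);
  return 0;
}
```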