Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang committed Feb 21, 2024
1 parent 90f93a7 commit 182a8e3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 6 deletions.
8 changes: 4 additions & 4 deletions test/mp_unit/sm_channel_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,9 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r
// sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1;
}
// __syncthreads();
-smChan.putPackets<LL8Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
+smChan.putPackets<mscclpp::LL8Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
} else {
-smChan.getPackets<LL8Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
+smChan.getPackets<mscclpp::LL8Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
// If each thread reads 8 bytes at once, we don't need a barrier after getPackets().
// __syncthreads();
for (int j = threadIdx.x; j < nElem; j += blockDim.x) {
Expand Down Expand Up @@ -349,9 +349,9 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int*
sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1;
}
// __syncthreads();
-smChan.putPackets<LL16Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
+smChan.putPackets<mscclpp::LL16Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
} else {
-smChan.getPackets<LL16Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
+smChan.getPackets<mscclpp::LL16Packet>(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag);
// If each thread reads 8 bytes at once, we don't need a barrier after getPackets().
// __syncthreads();
for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) {
Expand Down
4 changes: 2 additions & 2 deletions test/mscclpp-test/allreduce_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1059,8 +1059,8 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank,
uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int));

// step 1: write to scratch buffer
-constSmOutOfPlaceChans[peerIdx].putPackets<LL8Packet>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid,
-blockDim.x * nBlocksPerPeer, flag);
+constSmOutOfPlaceChans[peerIdx].putPackets<mscclpp::LL8Packet>(scratchOffset, srcOffset, nelemsPerRank * sizeof(int),
+tid, blockDim.x * nBlocksPerPeer, flag);
// step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer
for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) {
uint32_t data = 0;
Expand Down

0 comments on commit 182a8e3

Please sign in to comment.