From 182a8e3918d9ab20d4908119b35ffd95cef78847 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 21 Feb 2024 00:13:03 +0000 Subject: [PATCH] fixes --- test/mp_unit/sm_channel_tests.cu | 8 ++++---- test/mscclpp-test/allreduce_test.cu | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/mp_unit/sm_channel_tests.cu b/test/mp_unit/sm_channel_tests.cu index eba7bc811..45c5fa644 100644 --- a/test/mp_unit/sm_channel_tests.cu +++ b/test/mp_unit/sm_channel_tests.cu @@ -312,9 +312,9 @@ __global__ void kernelSmLL8PacketPingPong(int* buff, int rank, int nElem, int* r // sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem; j += blockDim.x) { @@ -349,9 +349,9 @@ __global__ void kernelSmLL16PacketPingPong(int* buff, int rank, int nElem, int* sendBuff[2 * j + 1] = putOffset + i + 2 * j + 1; } // __syncthreads(); - smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.putPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); } else { - smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); + smChan.getPackets(0, 0, nElem * sizeof(int), threadIdx.x, blockDim.x, flag); // If each thread reads 8 bytes at once, we don't need a barrier after getPackets(). // __syncthreads(); for (int j = threadIdx.x; j < nElem / 2; j += blockDim.x) { diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu index 980f4c18b..cbedcefd0 100644 --- a/test/mscclpp-test/allreduce_test.cu +++ b/test/mscclpp-test/allreduce_test.cu @@ -1059,8 +1059,8 @@ __global__ void allreduce7(int* buff, int* scratch, void* resultBuff, int rank, uint32_t* dst = (uint32_t*)((char*)resultBuff + rank * nelemsPerRank * sizeof(int)); // step 1: write to scratch buffer - constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), tid, - blockDim.x * nBlocksPerPeer, flag); + constSmOutOfPlaceChans[peerIdx].putPackets(scratchOffset, srcOffset, nelemsPerRank * sizeof(int), + tid, blockDim.x * nBlocksPerPeer, flag); // step 2: get data from scratch buffer, reduce data and write result to remote scratch buffer for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < nPktsPerRank; idx += blockDim.x * gridDim.x) { uint32_t data = 0;