Skip to content

Commit

Permalink
apps/nccl: allgather tuning
Browse files Browse the repository at this point in the history
  • Loading branch information
nusislam committed Oct 14, 2024
1 parent f9def85 commit cdbb2de
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions apps/nccl/src/allgather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,19 +103,33 @@ __global__ void __launch_bounds__(1024, 1)
}
}

deviceSyncer.sync(gridDim.x);
//deviceSyncer.sync(gridDim.x);

if (threadIdx.x < nPeer) {
smChans[threadIdx.x].relaxedSignal();
smChans[threadIdx.x].wait();
}
__syncthreads();
}

// Host-side launcher for the allgather6 kernel.
//
// Chooses the number of thread blocks from the element count `nelems`, then
// launches allgather6<IsOutOfPlace> with 1024 threads per block and no dynamic
// shared memory on `stream`.
//
// Parameters:
//   buff              - buffer pointer forwarded to the kernel as void*.
//   scratch           - unused by this launcher.
//   resultBuff        - unused by this launcher.
//   smChannels        - device handles of the SM channels, forwarded to the kernel.
//   channelOutOffset  - forwarded to the kernel unchanged.
//   rank, worldSize, nRanksPerNode - forwarded to the kernel unchanged.
//   nelems            - number of elements of type T; converted to a count of
//                       int-sized words before being passed to the kernel.
//   stream            - CUDA stream the kernel is enqueued on.
//
// Returns the result of cudaGetLastError() right after the launch, i.e. it
// reports launch-time errors only; the kernel itself runs asynchronously.
template <bool IsOutOfPlace, typename T>
cudaError_t allgather(T* buff, [[maybe_unused]] T* scratch, [[maybe_unused]] T* resultBuff,
                      mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels, size_t channelOutOffset, int rank,
                      int nRanksPerNode, int worldSize, size_t nelems, cudaStream_t stream) {
  // Grid-size tuning table: fewer blocks for small messages, more for large ones.
  // NOTE(review): the thresholds are compared against the element count, not the
  // byte size, so the switch-over points differ between element types of
  // different width -- confirm this is intended.
  int nBlocks = 28;
  if (nelems <= 4096) {
    nBlocks = 7;
  } else if (nelems <= 32768) {
    nBlocks = 14;
  } else if (nelems >= 2097152) {
    nBlocks = 35;
  }

  // The kernel takes its size in units of sizeof(int). The division is integral,
  // so a byte count that is not a multiple of sizeof(int) is rounded down.
  // NOTE(review): presumably callers guarantee int-aligned sizes -- verify.
  allgather6<IsOutOfPlace><<<nBlocks, 1024, 0, stream>>>((void*)buff, smChannels, channelOutOffset, rank, worldSize,
                                                         nRanksPerNode, nelems * sizeof(T) / sizeof(int));
  return cudaGetLastError();
}
Expand Down

0 comments on commit cdbb2de

Please sign in to comment.