From 472ec60f9ec14a74403bd109145c65b85669da59 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 21:20:42 +0000 Subject: [PATCH 01/67] nvls barely works now --- nvls/README | 2 + nvls/align.h | 47 ++ nvls/alloc.h | 270 +++++++++++ nvls/argcheck.h | 16 + nvls/bootstrap.h | 32 ++ nvls/channel.h | 48 ++ nvls/checks.h | 160 +++++++ nvls/coll_net.h | 35 ++ nvls/collectives.h | 48 ++ nvls/comm.h | 473 +++++++++++++++++++ nvls/core.h | 41 ++ nvls/cpuset.h | 61 +++ nvls/cudawrap.h | 129 ++++++ nvls/debug.h | 48 ++ nvls/device.h | 463 +++++++++++++++++++ nvls/enqueue.h | 26 ++ nvls/gdrwrap.h | 252 +++++++++++ nvls/graph.h | 116 +++++ nvls/group.h | 137 ++++++ nvls/ibvcore.h | 1058 +++++++++++++++++++++++++++++++++++++++++++ nvls/ibvsymbols.h | 46 ++ nvls/ibvwrap.h | 92 ++++ nvls/info.h | 134 ++++++ nvls/ipcsocket.cc | 232 ++++++++++ nvls/ipcsocket.h | 38 ++ nvls/nccl_common.h | 33 ++ nvls/nccl_net.h | 333 ++++++++++++++ nvls/nccl_tuner.h | 55 +++ nvls/net.h | 27 ++ nvls/net_device.h | 29 ++ nvls/nvmlwrap.h | 214 +++++++++ nvls/nvtx.h | 85 ++++ nvls/p2p.h | 29 ++ nvls/param.h | 30 ++ nvls/profiler.h | 37 ++ nvls/proxy.h | 296 ++++++++++++ nvls/shm.h | 25 + nvls/socket.h | 97 ++++ nvls/strongstream.h | 140 ++++++ nvls/test.cu | 172 +++++++ nvls/test2.cpp | 143 ++++++ nvls/timer.h | 60 +++ nvls/transport.h | 128 ++++++ nvls/trees.h | 13 + nvls/tuner.h | 22 + nvls/utils.h | 524 +++++++++++++++++++++ 46 files changed, 6496 insertions(+) create mode 100644 nvls/README create mode 100644 nvls/align.h create mode 100644 nvls/alloc.h create mode 100644 nvls/argcheck.h create mode 100644 nvls/bootstrap.h create mode 100644 nvls/channel.h create mode 100644 nvls/checks.h create mode 100644 nvls/coll_net.h create mode 100644 nvls/collectives.h create mode 100644 nvls/comm.h create mode 100644 nvls/core.h create mode 100644 nvls/cpuset.h create mode 100644 nvls/cudawrap.h create mode 100644 nvls/debug.h create mode 100644 nvls/device.h create mode 100644 nvls/enqueue.h create mode 100644 nvls/gdrwrap.h create mode 100644 nvls/graph.h create mode 100644 nvls/group.h create mode 100644 nvls/ibvcore.h create mode 100644 nvls/ibvsymbols.h create mode 100644 nvls/ibvwrap.h create mode 100644 nvls/info.h create mode 100644 nvls/ipcsocket.cc create mode 100644 nvls/ipcsocket.h create mode 100644 nvls/nccl_common.h create mode 100644 nvls/nccl_net.h create mode 100644 nvls/nccl_tuner.h create mode 100644 nvls/net.h create mode 100644 nvls/net_device.h create mode 100644 nvls/nvmlwrap.h create mode 100644 nvls/nvtx.h create mode 100644 nvls/p2p.h create mode 100644 nvls/param.h create mode 100644 nvls/profiler.h create mode 100644 nvls/proxy.h create mode 100644 nvls/shm.h create mode 100644 nvls/socket.h create mode 100644 nvls/strongstream.h create mode 100644 nvls/test.cu create mode 100644 nvls/test2.cpp create mode 100644 nvls/timer.h create mode 100644 nvls/transport.h create mode 100644 nvls/trees.h create mode 100644 nvls/tuner.h create mode 100644 nvls/utils.h diff --git a/nvls/README b/nvls/README new file mode 100644 index 000000000..c385affc4 --- /dev/null +++ b/nvls/README @@ -0,0 +1,2 @@ +nvcc -I/usr/lib/x86_64-linux-gnu/openmpi/include -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -L/usr/lib/x86_64-linux-gnu/openmpi/lib -L /usr/local/cuda/lib64/ -lmpi_cxx -lmpi -lcupti -lcupti_static test.cu -gencode arch=compute_90,code=sm_90 -lcuda -lcudart -lnccl + diff --git a/nvls/align.h b/nvls/align.h new file mode 100644 index 000000000..2a71dd1bc --- /dev/null +++ b/nvls/align.h @@ -0,0 +1,47 
@@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALIGN_H_ +#define NCCL_ALIGN_H_ + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) + +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_POWER(x, y) \ + ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +#if !__CUDA_ARCH__ + #ifndef __host__ + #define __host__ + #endif + #ifndef __device__ + #define __device__ + #endif +#endif + +template +__host__ __device__ constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +__host__ __device__ constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} + +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignUp(X x, int a) { + return (x+a-1) & Z(-a); +} + +#endif diff --git a/nvls/alloc.h b/nvls/alloc.h new file mode 100644 index 000000000..f8d954469 --- /dev/null +++ b/nvls/alloc.h @@ -0,0 +1,270 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include "align.h" +#include "utils.h" +#include "p2p.h" +#include +#include +#include +#include + +uint64_t clockNano(); // from utils.h with which we have a circular dependency + +template +ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); + memset(*ptr, 0, nelem*sizeof(T)); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(cudaFreeHost(ptr)); + return ncclSuccess; +} + +template +ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} +#define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { + if (nelem < oldNelem) return ncclInternalError; + if (nelem == oldNelem) return ncclSuccess; + + T* oldp = *ptr; + T* p = (T*)malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memcpy(p, oldp, oldNelem*sizeof(T)); + free(oldp); + memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + *ptr = (T*)p; + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + return ncclSuccess; +} + +#if CUDART_VERSION >= 11030 + +#include +#include "cudawrap.h" + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { + ncclResult_t result = ncclSuccess; + size_t granularity = 0; + CUdevice currentDev; + CUmemAllocationProp prop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle; + int cudaDev; + int flag = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported + prop.location.id = currentDev; + // Query device to see if RDMA support is available + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &prop, 0)); + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = currentDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + if (handlep) *handlep = handle; + TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); + return result; +} + +static inline ncclResult_t ncclCuMemFree(void *ptr) { + if (ptr == NULL) return ncclSuccess; + ncclResult_t result = ncclSuccess; + CUmemGenericAllocationHandle handle; + size_t size = 0; + CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + return result; +} + +#else + +extern int ncclCuMemEnable(); + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} +static inline ncclResult_t ncclCuMemFree(void *ptr) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} + +#endif + +template +ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = 
cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. 
+ cudaStream_t stream; + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaFree(T* ptr) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); + } else { + CUDACHECKGOTO(cudaFree(ptr), result, finish); + } +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +// Allocate memory to be potentially ibv_reg_mr'd. This needs to be +// allocated on separate pages as those pages will be marked DONTFORK +// and if they are shared, that could cause a crash in a child process +inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { + size_t page_size = sysconf(_SC_PAGESIZE); + void* p; + int size_aligned = ROUNDUP(size, page_size); + int ret = posix_memalign(&p, page_size, size_aligned); + if (ret != 0) return ncclSystemError; + memset(p, 0, size); + *ptr = p; + INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); + return ncclSuccess; +} +#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +#endif diff --git a/nvls/argcheck.h b/nvls/argcheck.h new file mode 100644 index 000000000..8d8b74e8e --- /dev/null +++ b/nvls/argcheck.h @@ -0,0 +1,16 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" +#include "info.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/nvls/bootstrap.h b/nvls/bootstrap.h new file mode 100644 index 000000000..400a479fb --- /dev/null +++ b/nvls/bootstrap.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_BOOTSTRAP_H_ +#define NCCL_BOOTSTRAP_H_ + +#include "nccl.h" +#include "comm.h" + +struct ncclBootstrapHandle { + uint64_t magic; + union ncclSocketAddress addr; +}; +static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); + +ncclResult_t bootstrapNetInit(); +ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); +ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); +ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); +ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); +ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); +ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); +ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); +ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); +ncclResult_t bootstrapClose(void* commState); +ncclResult_t bootstrapAbort(void* commState); +#endif diff --git a/nvls/channel.h b/nvls/channel.h new file mode 100644 index 000000000..adc38749a --- /dev/null +++ b/nvls/channel.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "comm.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); +static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { + int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; + int peerNode = comm->rankToNode[peer]; + int peerIndex = comm->rankToLocalRank[peer]; + int nsteps = comm->maxLocalRanks; + int rankIndex = comm->rankToLocalRank[comm->rank]; + int step, delta; + if (coll == ncclFuncSend) { + step = (nsteps + peerIndex - rankIndex)%nsteps; + delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; + } else if (coll == ncclFuncRecv) { + step = (nsteps + rankIndex - peerIndex)%nsteps; + delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; + } else { + return ncclInternalError; + } + *channelBase = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; + return ncclSuccess; +} + +static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { + //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; + return ncclSuccess; +} + +static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { + int base; + NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); + NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); + return ncclSuccess; +} + +#endif diff --git a/nvls/checks.h b/nvls/checks.h new file mode 100644 index 000000000..c9fd16176 --- /dev/null +++ b/nvls/checks.h @@ -0,0 +1,160 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ + +#include "debug.h" + +// Check CUDA RT calls +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, RES, label) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + RES = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUDACHECKIGNORE(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ + (void) cudaGetLastError(); \ + } \ +} while(false) + +#include +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +#define SYSCHECKGOTO(statement, RES, label) do { \ + if ((statement) == -1) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +#define NEQCHECK(statement, value) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (0); + +#define NEQCHECKGOTO(statement, value, RES, label) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +#define EQCHECK(statement, value) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ + return ncclSystemError; \ + 
} \ +} while (0); + +#define EQCHECKGOTO(statement, value, RES, label) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + /* Print the back trace*/ \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + return RES; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, RES, label) do { \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + /* Print the back trace*/ \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + goto label; \ + } \ +} while (0); + +#define NCCLWAIT(call, cond, abortFlagPtr) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + return ncclInternalError; \ + } \ + if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ +} while (!(cond)); + +#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + goto label; \ + } \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ +} while (!(cond)); + +#define NCCLCHECKTHREAD(a, args) do { \ + if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ + return args; \ + } \ +} while(0) + +#define CUDACHECKTHREAD(a) do { \ + if ((a) != cudaSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#endif diff --git a/nvls/coll_net.h b/nvls/coll_net.h new file mode 100644 index 000000000..f4b540866 --- /dev/null +++ b/nvls/coll_net.h @@ -0,0 +1,35 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COLL_NET_H_ +#define COLL_NET_H_ + +#include "nccl.h" +#include "nccl_net.h" + +typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; + +// Translation to external API +static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } +static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } +static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } +static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } +static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } +static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } +static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } + +static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } + +#endif diff --git a/nvls/collectives.h b/nvls/collectives.h new file mode 100644 index 000000000..0f965276a --- /dev/null +++ b/nvls/collectives.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COLLECTIVES_H_ +#define NCCL_COLLECTIVES_H_ + +#include "nccl.h" + +// CHUNKSIZE must be a multiple of SLICESIZE +#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) +#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) +#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) +#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) +#define BROADCAST_SLICESTEPS 1 +#define BROADCAST_CHUNKSTEPS 1 +#define REDUCE_SLICESTEPS 1 +#define REDUCE_CHUNKSTEPS 1 +#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above + +inline int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} + +#endif diff --git a/nvls/comm.h b/nvls/comm.h new file mode 100644 index 000000000..328ffef3b --- /dev/null +++ b/nvls/comm.h @@ -0,0 +1,473 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#include "transport.h" +#include "p2p.h" +#include "collectives.h" +#include "nccl_tuner.h" +#include "proxy.h" +#include "strongstream.h" +#include "nccl_net.h" + +#if CUDART_VERSION < 9000 +struct cudaLaunchParams { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +}; +#endif + +#define CACHE_LINE_SIZE 128 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL + +// Channels / LL tuning +#define NCCL_LL_THREAD_THRESHOLD 8 +#define NCCL_LL128_THREAD_THRESHOLD 8 +#define NCCL_SIMPLE_THREAD_THRESHOLD 64 + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + uint64_t redOpArgExchange[2]; + char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; + int offsFifo[NCCL_STEPS]; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + int offsFifo[NCCL_STEPS]; + int flush; // For GDRCopy-based flush + }; + char pad4[MEM_ALIGN]; + }; +}; + +enum helperThreadState {ThreadStart, ThreadStop}; + +#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) + +struct ncclGraphHelperResources { + ncclComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + enum helperThreadState threadState; + void* ipcBases[NCCL_IPC_POOL_SIZE]; + int ipcTail; + int ipcHead; +}; + +struct ncclUserRedOp { + int freeNext; // -1=allocated, otherwise index of next free entry in array + ncclDataType_t datatype; + ncclDevRedOpFull opFull; +}; + +struct ncclNodeRanks { + int localRanks; + int* localRankToRank; +}; + +struct ncclDestructor { + struct ncclDestructor* next; + void* obj; + ncclResult_t(*fn)(struct ncclDestructor* me); +}; + +struct ncclCommCallback { + struct ncclCommCallback* next; + ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); +}; + +struct 
ncclSharedResources { + int refCount; + struct ncclComm* owner; /* comm which creates this shared res. */ + struct ncclChannelPeer* peers[MAXCHANNELS]; + struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; + /* P2P operation counter, one per channel */ + uint64_t p2pOpCount[MAXCHANNELS]; + /* Collective operation counter */ + uint64_t collOpCount; + int tpNRanks; + int tpNLocalRanks; + int tpNChannels; + int tpP2pNChannels; + int tpP2pChunkSize; + uint64_t magic; + + // top parent rank to localRank translation table + int* tpRankToLocalRank; + // Internal streams + struct ncclStrongStream deviceStream, hostStream; + + /* proxy related shared res */ + struct ncclProxyState* proxyState; +}; + +struct ncclChannel { + struct ncclChannelPeer** peers; + struct ncclDevChannelPeer** devPeers; + /* devPeer pointer array used for host side access */ + struct ncclDevChannelPeer** devPeersHostPtr; + struct ncclRing ring; + int* devRingUserRanks; + struct ncclTree tree; + + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; + + struct ncclNvls nvls; + + int id; // index of this channel + uint32_t workFifoSent; // last used work index+1 + + /* comm split sharable resources */ + struct ncclChannelPeer* collnetPeers; + struct ncclDevChannelPeer* collnetDevPeers; + struct ncclChannelPeer* nvlsPeers; + struct ncclDevChannelPeer* nvlsDevPeers; +}; + +struct ncclWorkList { + struct ncclWorkList* next; + struct ncclWork work; +}; + +struct ncclPointerList { + struct ncclPointerList* next; + void *ptr; +}; + +struct ncclNvlsMcHandleList { + struct ncclNvlsMcHandleList *next; + CUmemGenericAllocationHandle mcHandle; + CUdeviceptr ptr; + int dev; + size_t size; +}; + +struct ncclKernelPlan { + // A kernel plan is also a callback that reclaims itself. Hence this must + // be the first member. + struct ncclCommCallback reclaimer; + struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup + + struct ncclComm* comm; + struct ncclKernelPlan* next; + + bool persistent; // aka captured in a graph + bool kernelSpecialized; + void *kernelFn; + int channelUbound; // only channels c < channelUbound are present + int channelCount; // number of channels present + uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + bool hasProxyOps; // does any channel have a non-empty proxyOpQueue + int threadPerBlock; + // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() + struct ncclWork* workHead; + + int collOpCount; // zero based for this plan + + struct ncclIntruQueue ipcMemQueue; + struct ncclIntruQueue nvlsMcHandleQueue; + + struct Channel { + int nWork; + union { + int nWorkElem; // used for coll and reg coll + int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 + }; + size_t collBytes; + struct ncclIntruQueue workQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; +}; + +struct ncclRegRequest { + uintptr_t buff; + size_t size; + struct ncclRegRequest *next; +}; + +struct ncclRegRecord { + uintptr_t buff; + size_t size; + CUdeviceptr regAddr; + size_t regSize; + int dev; + CUmemGenericAllocationHandle mcHandle; + uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */ + struct ncclRegRecord *next; +}; + +struct ncclComm { + struct ncclMemoryStack memPermanent, memScoped; + // List of destructors to run when comm is destructed + struct ncclDestructor* destructorHead; + + struct ncclSharedResources* sharedRes; + /* map to top parent ranks. 
*/ + int* topParentRanks; + int* topParentLocalRanks; + struct ncclChannel channels[MAXCHANNELS]; + struct ncclPeerInfo* peerInfo; + struct ncclTopoSystem* topo; + + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; + void* bootstrap; + // Bitmasks for ncclTransportP2pSetup + uint64_t* connectSend; + uint64_t* connectRecv; + + uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. + + uint64_t commHash; + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int nvmlDev; // my nvml device index + int compCap; // compute capability of the GPU + int minCompCap, maxCompCap; // min/max compute capability in the communicator + int64_t busId; // my PCI bus ID in int format + cpu_set_t cpuAffinity; // CPU affinity of the GPU + int cudaArch; // matches __CUDA_ARCH__ of device + + int node; + int nNodes; + int localRank; + int localRanks; + int maxLocalRanks; + int* rankToNode; + int* rankToLocalRank; + int* localRankToRank; + // localRanks and localRanktoRank for all nodes + struct ncclNodeRanks* nodeRanks; + + bool checkPointers; + bool dmaBufSupport; + + // Counter for tracking CUDA launches (P2P and collectives included) + uint64_t opCount; + + // Channels for collectives + int nChannels; + int nvlsChannels; + int collNetChannels; + // Channels (per peer) for p2p + int p2pnChannels; + int p2pnChannelsPerPeer; + int p2pChannels[MAXCHANNELS]; + + // Should this comm allocate LL buffers for network P2P connections? + bool allocP2pNetLLBuffers; + + // Buffer sizes + int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pChunkSize; + + // Algorithm/Protocols thresholds + ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + + /* This attribute can indicate the states of communicators and return code of + * asynchronous NCCL operations. */ + ncclResult_t asyncResult; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile uint32_t *childAbortFlag; + uint32_t *abortFlagRefCount; + + // Device side of the communicator (for cudaFree's) + struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + + // Operation pool. + int workFifoDepth; // size of workFifoHeap[], power of 2 + struct ncclWork* workFifoHeap; + struct ncclWork* devWorkFifoHeap; + void* workFifoHeapGdrHandle; + + // Work completion notificaion + uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory + uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. + uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
+ + // Intra-process sync + struct ncclComm* intraComm0; // leader of intra-process comms (self possible) + struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head + int intraRank; + int intraRanks; + uint32_t intraBarrierPhase; + char intraPad1[64 - sizeof(uint64_t)]; + uint64_t intraBarrierCounter; // only used if this is intraComm0 + char intraPad2[64 - sizeof(uint64_t)]; + uint64_t intraBarrierGate; // only used if this is intraComm0 + + struct ncclProxyState* proxyState; + int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ + // Whether this communicator uses collNet + int collNetSupport; + uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; + int intraHighestTransportType; + int* collNetHeads; + int collNetHeadsNum; + /* sharable collNet proxy progress resource. */ + struct ncclCollNetSharedRes* collNetSharedRes; + + // NVLink SHARP (NVLS) support + int nvlsSupport; + int nvlsRegSupport; + /* sharable NVLS resource. */ + struct ncclNvlsSharedRes* nvlsResources; + + ssize_t channelSize; // User requested work size (bytes) for channel partitions + + // pools backed by comm->memPermanent + struct ncclMemoryPool memPool_ncclProxyOp; + struct ncclMemoryPool memPool_ncclKernelPlan; + struct ncclMemoryPool memPool_ncclPointerList; + struct ncclMemoryPool memPool_ncclNvlsHandleList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when + // this comm is not yet in a group. + struct ncclComm* groupNext; + // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. + struct ncclComm* preconnectNext; + int persistentRefs; // number of persistent plan-lists capturing this comm + struct ncclTasks tasks; + + // user-created reduction ops + int userRedOpCapacity, userRedOpFreeHead; + ncclUserRedOp *userRedOps; + + // Queue of things for the main thread to do + struct ncclIntruQueueMpsc callbackQueue; + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; + + ncclConfig_t config; + // initState is to more conveniently reclaim resources when errors happen. 
+ ncclResult_t initState; + // flag to indicate if ncclCommFinalize() is called + bool finalizeCalled; + // shared structures for finalization + int finalizeRankCnt; + // group job to support multi-thread FT + struct ncclGroupJob *groupJob; + + /* store to buffer register request */ + struct ncclIntruQueue regRequestQueue; + /* store registered buffer */ + struct ncclIntruQueue regRecordQueue; + + // Tuning plugin + ncclTuner_t* tuner; +}; + +enum ncclLaunchMode { + ncclLaunchModeInvalid=0, + ncclLaunchModeParallel, + ncclLaunchModeGroup +}; +extern enum ncclLaunchMode ncclParamLaunchMode; + +void ncclCommPushFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); + +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { + ncclResult_t result = ncclSuccess; + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb + if (res1 != ncclSuccess) result = res1; + cb = next; + } + NCCLCHECK(result); + return ncclSuccess; +} + +inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { + int phase = comm->intraBarrierPhase; + if (comm->intraRanks == 1) { + // Release everyone (just me). + comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); + } else { + struct ncclComm* comm0 = comm->intraComm0; + uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); + if (uint32_t(count) == uint32_t(comm->intraRanks)) { + // Reset. + __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); + // Release everyone. + __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); + } + } +} + +// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) +inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { + struct ncclComm* comm0 = comm->intraComm0; + comm->intraBarrierPhase ^= 1; + uint32_t phase = comm->intraBarrierPhase; + uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + if ((gate & 1) != phase) { + uint64_t t0 = clockNano(); + do { + // Spin vigorously for first 5us. + if (clockNano()-t0 >= 5*1000) sched_yield(); + gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + } while ((gate & 1) != phase); + } + if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); + return gate>>32; +} + +// Scrambles the bits of non-builtin values of ncclRedOp_t according to the +// communicator memory address. Used to catch bugs so that integer handles +// associated with this communicator won't collide with handles of other +// communicatrs. This function is its own inverse. +static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { + // Preserve the built-in values. + if(int(op) < int(ncclNumOps)) + return op; + uint64_t h = reinterpret_cast(comm); + h ^= h >> 32; + h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant + h >>= 32; // h is now an excellent 32-bit hash of the comm pointer + h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 + int op1 = int(h) ^ int(op); + // Since builtin values are preserved, we also have to preserve their preimage. + return op1 < int(ncclNumOps) ? 
op : ncclRedOp_t(op1); +} + +ncclResult_t ncclCommEnsureReady(ncclComm_t comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); + +#endif diff --git a/nvls/core.h b/nvls/core.h new file mode 100644 index 000000000..a1754beeb --- /dev/null +++ b/nvls/core.h @@ -0,0 +1,41 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CORE_H_ +#define NCCL_CORE_H_ + +#include +#include +#include +#include +#include // For std::min/std::max +#include "nccl.h" + +#ifdef PROFAPI +#define NCCL_API(ret, func, args...) \ + __attribute__ ((visibility("default"))) \ + __attribute__ ((alias(#func))) \ + ret p##func (args); \ + extern "C" \ + __attribute__ ((visibility("default"))) \ + __attribute__ ((weak)) \ + ret func(args) +#else +#define NCCL_API(ret, func, args...) \ + extern "C" \ + __attribute__ ((visibility("default"))) \ + ret func(args) +#endif // end PROFAPI + +#include "debug.h" +#include "checks.h" +#include "cudawrap.h" +#include "alloc.h" +#include "utils.h" +#include "param.h" +#include "nvtx.h" + +#endif // end include guard diff --git a/nvls/cpuset.h b/nvls/cpuset.h new file mode 100644 index 000000000..ec55cbc54 --- /dev/null +++ b/nvls/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/nvls/cudawrap.h b/nvls/cudawrap.h new file mode 100644 index 000000000..cc363c1ac --- /dev/null +++ b/nvls/cudawrap.h @@ -0,0 +1,129 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CUDAWRAP_H_ +#define NCCL_CUDAWRAP_H_ + +#include +#include +#include "checks.h" + +// Is cuMem API usage enabled +extern int ncclCuMemEnable(); + +#if CUDART_VERSION >= 11030 +#include +#else +typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +#endif + +#define CUPFN(symbol) pfn_##symbol + +// Check CUDA PFN driver calls +#define CUCHECK(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUCHECKGOTO(cmd, res, label) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUCHECKIGNORE(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ + } \ +} while(false) + +#define CUCHECKTHREAD(cmd, args) do { \ + CUresult err = pfn_##cmd; \ + if (err != CUDA_SUCCESS) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); +DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); +// cuMem API support +DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); +#if CUDA_VERSION >= 11070 +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support +#endif +#if CUDA_VERSION >= 12010 +/* NVSwitch Multicast support */ +DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); 
+DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); + + +ncclResult_t ncclCudaLibraryInit(void); + +extern int ncclCudaDriverVersionCache; +extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() + +inline ncclResult_t ncclCudaDriverVersion(int* driver) { + int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); + if (version == -1) { + CUDACHECK(cudaDriverGetVersion(&version)); + __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); + } + *driver = version; + return ncclSuccess; +} +#endif diff --git a/nvls/debug.h b/nvls/debug.h new file mode 100644 index 000000000..d10217856 --- /dev/null +++ b/nvls/debug.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_DEBUG_H_ +#define NCCL_INT_DEBUG_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include +#include +#include + +#include +#include +#include + +// Conform to pthread and NVTX standard +#define NCCL_THREAD_NAMELEN 16 + +extern int ncclDebugLevel; +extern uint64_t ncclDebugMask; +extern pthread_mutex_t ncclDebugLock; +extern FILE *ncclDebugFile; +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); + +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); + +// Let code temporarily downgrade WARN into INFO +extern thread_local int ncclDebugNoWarn; +extern char ncclLastError[]; + +#define WARN(...) printf(__VA_ARGS__) +#define INFO(FLAGS, ...) printf(__VA_ARGS__) +#define TRACE_CALL(...) printf(__VA_ARGS__) + +#ifdef ENABLE_TRACE +#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) +extern std::chrono::steady_clock::time_point ncclEpoch; +#else +#define TRACE(...) +#endif + +void ncclSetThreadName(pthread_t thread, const char *fmt, ...); + +#endif diff --git a/nvls/device.h b/nvls/device.h new file mode 100644 index 000000000..56f8039f3 --- /dev/null +++ b/nvls/device.h @@ -0,0 +1,463 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "align.h" +#include + +extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; + +extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; + +extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +#include "net_device.h" + +enum ncclDevRedOp_t { + ncclDevSum, ncclDevProd, ncclDevMinMax, + ncclDevPreMulSum, ncclDevSumPostDiv, + ncclNumDevRedOps +}; +struct ncclDevRedOpFull { + ncclDevRedOp_t op; + ncclRedOp_t proxyOp; + bool scalarArgIsPtr; + uint64_t scalarArg; +}; + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define WARP_SIZE 32 +#define MAXCHANNELS 32 +#define NCCL_MAX_NTHREADS 640 +#define NCCL_SIMPLE_MAX_NTHREADS 512 +#define NCCL_LL_MAX_NTHREADS 512 +#define NCCL_LL_LINES_PER_THREAD 8 +#ifdef TEST_LL_CLEANUP +#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup +#define NCCL_LL_FLAG_MAX 0x100 +#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_NSTEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +#define NCCL_LL128_LINESIZE 128 +#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) +#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) + +#define NCCL_LL128_MAX_NTHREADS 640 +#define NCCL_LL128_ELEMS_PER_THREAD 120 + +#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 +#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) + +#define NCCL_DIRECT_WRITE 0x01 +#define NCCL_DIRECT_READ 0x02 +#define NCCL_DIRECT_NIC 0x04 +#define NCCL_IPC_WRITE 0x08 +#define NCCL_IPC_READ 0x10 +#define NCCL_NVLS_MIN_POLL 0x20 + +struct ncclConnInfo { + // Regular comm mechanism + char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send + void* mhandles[NCCL_NUM_PROTOCOLS]; + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + + int flags; // Direct communication / other flags + int shared; // Buffers are shared + void **ptrExchange; // Pointer exchange for direct communication + uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case + + int *sizesFifo; // Sizes fifo from GPU to proxy + int *offsFifo; // Buffer fifo from proxy to GPU + + uint64_t step; // Keep where we are + uint64_t llLastCleaning; + ncclNetDeviceHandle_t netDeviceHandle; +}; + +struct ncclProxyConnector { + int tpRank; + int tpLocalRank; + int sameProcess; + struct ncclProxyConnection* connection; + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary +}; + +struct ncclConnector { + int connected; + struct ncclProxyConnector proxyConn; + struct ncclTransportComm* transportComm; + void* transportResources; + struct ncclConnInfo conn; +}; + +struct ncclRing { + // Shortcuts for 
userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + + int index; // This rank's index in the ring +}; + + +// The root of each tree only has one node down (+1 intra-node). +#define NCCL_MAX_TREE_ARITY_TOP 2 +// Nodes inside the binary tree can have to two nodes down (+1 intra-node). +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +#define NCCL_MAX_DIRECT_ARITY 7 +struct ncclDirect { + int depth; + int out; + int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down + int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) + int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads + int up[NCCL_MAX_DIRECT_ARITY]; + int down[NCCL_MAX_DIRECT_ARITY]; +}; + +#define NCCL_MAX_NVLS_ARITY 8 +#define NCCL_MAX_NVLS_TREE_ARITY 3 +struct ncclNvls { + int out; + int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down + int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) + int up[NCCL_MAX_NVLS_ARITY]; + int down; + int treeUp; + int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; + int node; + int nNodes; +}; + +#define NCCL_MAX_CONNS 2 +struct ncclChannelPeer { + struct ncclConnector send[NCCL_MAX_CONNS]; + struct ncclConnector recv[NCCL_MAX_CONNS]; + int refCount; +}; + +struct ncclDevComm; + +/* ncclWork is to be a power of two, currently 8x64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclWorkElem. */ +#define NCCL_WORK_SIZE 512 + +enum ncclWorkType : uint8_t { + ncclWorkTypeUnused=0, + ncclWorkTypeColl=1, + ncclWorkTypeP2p=2, + ncclWorkTypeRegColl=3 +}; +enum ncclWorkP2PType : uint8_t { + ncclWorkP2pTypeUnused=0, + ncclWorkP2pTypeSend, + ncclWorkP2pTypeRecv +}; + +struct ncclWorkHeader { + union { + int32_t workNext; // when isLast=0: Offset from kernel argument workHead + uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. + }; + uint16_t funcIndex; + uint8_t isLast:1; // last work for this kernel + uint8_t inFifo:1; // is this work in the fifo + enum ncclWorkType type; +}; + +struct ncclWorkElem { + union { + uint8_t flagBits; + struct { + uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; + }; + }; + uint8_t nWarps; + uint8_t direct; + + const void * sendbuff; + void * recvbuff; + + size_t count; + size_t lastChunkSize; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint64_t redOpArg; +}; + +#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) +static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); + +struct ncclWorkElemP2p { + int peer : 30; + int proto : 2; + + enum ncclWorkP2PType p2pType; + uint8_t nWarps; + uint8_t warpStart; + uint8_t ngroups; + // Important not to use any fields with greater than 4-byte alignment since + // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if + // there were 8-byte fields. 
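+  // As an illustrative sketch (not verbatim from this patch), the host-side enqueue
+  // code is expected to split the 64-bit pointer and count into these halves, e.g.:
+  //   elem->buffLo32  = (uint32_t)(uintptr_t)buff;
+  //   elem->buffHi32  = (uint32_t)((uintptr_t)buff >> 32);
+  //   elem->countLo32 = (uint32_t)count;
+  //   elem->countHi32 = (uint32_t)(count >> 32);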
+ //void* buff; + uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; + //size_t count; + uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; + int chunkSize; +}; + +static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); +#define NCCL_MAX_WORK_ELEMENTS_P2P 16 + +struct ncclWorkElemReg { + struct ncclWorkElem elem; + void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; + void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; + void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; +}; + +#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) +static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); + +// Number of named barriers supported by CUDA +#define NCCL_MAX_GROUPS 16 + +struct ncclWork { + struct ncclWorkHeader header; + union { + char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; + struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; + struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; + struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; + }; +}; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); +static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); + +struct ncclDevChannelPeer { + // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo + // instead of the full ncclConnector. + struct ncclConnInfo send[NCCL_MAX_CONNS]; + struct ncclConnInfo recv[NCCL_MAX_CONNS]; +}; + +struct alignas(16) ncclDevChannel { + struct ncclDevChannelPeer** peers; + struct ncclRing ring; + struct ncclTree tree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; + struct ncclNvls nvls; + uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed +}; + +struct ncclDevComm { + int rank; + int nRanks; + int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pChunkSize; + + // Operation list for aggregation + int workFifoDepth; + struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + + // Flag to ask NCCL kernels to abort + volatile uint32_t* abortFlag; + + // Channels, device side + struct ncclDevChannel* channels/*[MAXCHANNELS]*/; +}; + +struct alignas(16) ncclDevCommAndChannels { + struct ncclDevComm comm; + struct ncclDevChannel channels[MAXCHANNELS]; +}; + +#ifdef __CUDA_ARCH__ + #define NCCL_CUDA_ARCH __CUDA_ARCH__ +#else + #define NCCL_CUDA_ARCH 0 +#endif + +template +__host__ __device__ constexpr T min_constexpr(T a) { return a; } +template +__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { + return min_constexpr((a < b ? a : b), c...); +} + +template +__host__ __device__ constexpr T max_constexpr(T a) { return a; } +template +__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { + return max_constexpr((a > b ? 
a : b), c...); +} + +// Calculate the unroll factor given: +// * bytePerPack: number of bytes accessed per instruction +// * insns: max permissible unroll value +// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) +__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { + return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); +} + +// Note that all unroll value logic should depend on a given cudaArch argument +// and not __CUDA_ARCH__ since these need to be host-side executable where the +// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device +// side code can elide passing the arch for brevity. + +__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { + // Our collective unroll should move to the same bytes&insns model as NVLS. + return cudaArch >= 800 ? 8 : 4; +} + +__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } +__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } + +__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { + return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); +} + +// The amount of dynamic shmem per warp +__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { + return (max_constexpr( + /*LL */0, + /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), + /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, + // NVLS needs an extra 16B to read unaligned data. + /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 + ) + 15) & -16; // pad to 16 bytes +} + +// The amount of dynamic shmem per block +__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { + return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); +} + +// Host-side table of kernel function pointers. +extern int const ncclDevKernelCount; +extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; + +// Table of most specialized kernel function to run given func index. +extern int const ncclDevFuncRowToId[]; +extern void* const ncclDevKernelForFunc[/*funcIndex*/]; +extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; + +// Launch a one-rank reduction on stream. 
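+// A hypothetical call (buffer and count names are illustrative) for a one-rank float sum:
+//   struct ncclDevRedOpFull op = { ncclDevSum, ncclSum, /*scalarArgIsPtr=*/false, /*scalarArg=*/0 };
+//   NCCLCHECK(ncclLaunchOneRank(recvbuff, sendbuff, count, op, ncclFloat, stream));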
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); + +// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" +inline bool ncclNvlsSupported(int devRedOp, int type) { + switch (type) { + case ncclInt32: + case ncclUint32: + case ncclInt64: + case ncclUint64: + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; + case ncclFloat: + case ncclDouble: + return devRedOp == ncclDevSum; + default: + return false; + } +} + +// `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" +inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { + #if defined(__CUDA_BF16_TYPES_EXIST__) + constexpr int NumTypes = ncclNumTypes; + #else + constexpr int NumTypes = ncclNumTypes + 1; + #endif + + int row = 0; // ncclDevFuncIndex_P2p + if (coll == ncclFuncSendRecv) goto have_row; + row += 1; + + if (coll == ncclFuncAllGather) { + int algo1 = algo == NCCL_ALGO_RING ? 0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += algo1*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncBroadcast) { + row += proto; + goto have_row; + } + row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncAllReduce) { + row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduce) { + row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduceScatter) { + int algo1 = algo == NCCL_ALGO_RING ? 0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + +have_row: + return ncclDevFuncRowToId[row]; +} + +inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } + +#endif diff --git a/nvls/enqueue.h b/nvls/enqueue.h new file mode 100644 index 000000000..634f037cb --- /dev/null +++ b/nvls/enqueue.h @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ENQUEUE_H_ +#define NCCL_ENQUEUE_H_ + +#include "comm.h" +#include "group.h" +#include "collectives.h" +#include "utils.h" + +#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) +#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ + +ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchFinish(struct ncclComm* comm); + +#endif // End include guard diff --git a/nvls/gdrwrap.h b/nvls/gdrwrap.h new file mode 100644 index 000000000..a64674cc5 --- /dev/null +++ b/nvls/gdrwrap.h @@ -0,0 +1,252 @@ +/************************************************************************* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GDRWRAP_H_ +#define NCCL_GDRWRAP_H_ + +#include "nccl.h" +#include // for standard [u]intX_t types +#include +#include + +// These can be used if the GDR library isn't thread safe +#include +extern pthread_mutex_t gdrLock; +#define GDRLOCK() pthread_mutex_lock(&gdrLock) +#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) +#define GDRLOCKCALL(cmd, ret) do { \ + GDRLOCK(); \ + ret = cmd; \ + GDRUNLOCK(); \ +} while(false) + +#define GDRCHECK(cmd) do { \ + int e; \ + /* GDRLOCKCALL(cmd, e); */ \ + e = cmd; \ + if( e != 0 ) { \ + WARN("GDRCOPY failure %d", e); \ + return ncclSystemError; \ + } \ +} while(false) + +// This is required as the GDR memory is mapped WC +#if !defined(__NVCC__) +#if defined(__PPC__) +static inline void wc_store_fence(void) { asm volatile("sync") ; } +#elif defined(__x86_64__) +#include +static inline void wc_store_fence(void) { _mm_sfence(); } +#elif defined(__aarch64__) +#ifdef __cplusplus +#include +static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } +#else +#include +static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } +#endif +#endif +#endif + +//#define GDR_DIRECT 1 +#ifdef GDR_DIRECT +// Call the GDR API library code directly rather than via +// dlopen() wrappers +#include + +static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } +static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } +static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } +static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { + GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { + GDRCHECK(gdr_unpin_buffer(g, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { + GDRCHECK(gdr_get_info(g, handle, info)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, 
size_t size) {
+  GDRCHECK(gdr_map(g, handle, va, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
+  GDRCHECK(gdr_unmap(g, handle, va, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) {
+  gdr_runtime_get_version(major, minor);
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
+  gdr_driver_get_version(g, major, minor);
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
+  GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
+  GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
+  return ncclSuccess;
+}
+
+#else
+// Dynamically handle dependency on the GDR API library
+
+/* Extracted from gdrapi.h (v2.1 Nov 2020) */
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+typedef struct gdr_mh_s {
+  unsigned long h;
+} gdr_mh_t;
+
+struct gdr_info {
+  uint64_t va;
+  uint64_t mapped_size;
+  uint32_t page_size;
+  uint64_t tm_cycles;
+  uint32_t cycles_per_ms;
+  unsigned mapped:1;
+  unsigned wc_mapping:1;
+};
+typedef struct gdr_info gdr_info_t;
+
+/* End of gdrapi.h */
+
+ncclResult_t wrap_gdr_symbols(void);
+
+gdr_t wrap_gdr_open(void);
+ncclResult_t wrap_gdr_close(gdr_t g);
+ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
+ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
+ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
+ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
+
+#endif // GDR_DIRECT
+
+// Global GDR driver handle
+extern gdr_t ncclGdrCopy;
+
+#include "alloc.h"
+
+typedef struct gdr_mem_desc {
+  void *gdrDevMem;
+  void *gdrMap;
+  size_t gdrOffset;
+  size_t gdrMapSize;
+  gdr_mh_t gdrMh;
+} gdr_mem_desc_t;
+
+static gdr_t ncclGdrInit() {
+  int libMajor, libMinor, drvMajor, drvMinor;
+  gdr_t handle = NULL;
+  // Dynamically load the GDRAPI library symbols
+  if (wrap_gdr_symbols() == ncclSuccess) {
+    handle = wrap_gdr_open();
+
+    if (handle != NULL) {
+      ncclResult_t res;
+
+      // Query the version of libgdrapi
+      NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
+
+      // Query the version of gdrdrv driver
+      NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
+
+      // Only support GDRAPI 2.1 and later
+      if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
+        goto error;
+      }
+      else
+        INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
+    }
+
} + return handle; +error: + if (handle != NULL) (void) wrap_gdr_close(handle); + return NULL; +} + +template +static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { + gdr_info_t info; + size_t mapSize; + gdr_mh_t mh; + char *devMem; + void *gdrMap; + + mapSize = sizeof(T)*nelem; + + // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE + ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); + // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); + uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; + size_t align = alignedAddr - (uint64_t)devMem; + + //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); + NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); + + NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); + //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); + + NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); + + // Will offset ever be non zero ? + ssize_t off = info.va - alignedAddr; + + gdr_mem_desc_t* md; + NCCLCHECK(ncclCalloc(&md, 1)); + md->gdrDevMem = devMem; + md->gdrMap = gdrMap; + md->gdrMapSize = mapSize; + md->gdrOffset = off+align; + md->gdrMh = mh; + *gdrHandle = md; + + *ptr = (T *)((char *)gdrMap+off); + if (devPtr) *devPtr = (T *)(devMem+off+align); + + TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", + md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); + + return ncclSuccess; +} + +template +static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); + return ncclSuccess; +} + +static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); + NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); + NCCLCHECK(ncclCudaFree(md->gdrDevMem)); + free(md); + + return ncclSuccess; +} + +#endif // End include guard diff --git a/nvls/graph.h b/nvls/graph.h new file mode 100644 index 000000000..fdd634894 --- /dev/null +++ b/nvls/graph.h @@ -0,0 +1,116 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GRAPH_H_ +#define NCCL_GRAPH_H_ + +#include "nccl.h" +#include "device.h" +#include +#include +#include +#include +#include + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); + +struct ncclTopoSystem; +// Build the topology +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); +void ncclTopoFree(struct ncclTopoSystem* system); +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); +ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); +ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); +int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); + +// Query topology +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); +ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); +int ncclPxnDisable(struct ncclComm* comm); +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); + +// Find CPU affinity +ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); + +#define NCCL_TOPO_CPU_ARCH_X86 1 +#define NCCL_TOPO_CPU_ARCH_POWER 2 +#define NCCL_TOPO_CPU_ARCH_ARM 3 +#define NCCL_TOPO_CPU_VENDOR_INTEL 1 +#define NCCL_TOPO_CPU_VENDOR_AMD 2 +#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 +#define NCCL_TOPO_CPU_TYPE_BDW 1 +#define NCCL_TOPO_CPU_TYPE_SKL 2 +#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); +ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); +ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); + +#define NCCL_TOPO_MAX_NODES 256 + +// Init search. 
Needs to be done before calling ncclTopoCompute +ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); + +#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) +#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) +#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU +#define NCCL_TOPO_PATTERN_RING 4 // Ring +#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree +struct ncclTopoGraph { + // Input / output + int id; // ring : 0, tree : 1, collnet : 2 + int pattern; + int crossNic; + int collNet; + int minChannels; + int maxChannels; + // Output + int nChannels; + float bwIntra; + float bwInter; + float latencyInter; + int typeIntra; + int typeInter; + int sameChannels; + int nHops; + int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; + int inter[MAXCHANNELS*2]; +}; +ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); + +struct ncclTopoRanks { + int ringRecv[MAXCHANNELS]; + int ringSend[MAXCHANNELS]; + int ringPrev[MAXCHANNELS]; + int ringNext[MAXCHANNELS]; + int treeToParent[MAXCHANNELS]; + int treeToChild0[MAXCHANNELS]; + int treeToChild1[MAXCHANNELS]; + int nvlsHeads[MAXCHANNELS]; +}; + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, + struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs); + +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); +#include "info.h" +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); + +#endif diff --git a/nvls/group.h b/nvls/group.h new file mode 100644 index 000000000..72251147f --- /dev/null +++ b/nvls/group.h @@ -0,0 +1,137 @@ +/************************************************************************* + * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GROUP_H_ +#define NCCL_GROUP_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclGroupErrCheck(ncclResult_t ret); +void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommPreconnect(struct ncclComm* comm); +ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); +ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); +ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); + +typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); + +ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); + +typedef enum ncclGroupJobState { + ncclGroupJobRunning = 0, + ncclGroupJobDone = 1, + ncclGroupJobJoined = 2, +} ncclGroupJobState_t; + +struct ncclAsyncJob { + struct ncclAsyncJob* next; + pthread_t thread; + ncclResult_t result; + ncclResult_t(*func)(struct ncclAsyncJob*); + void(*undo)(struct ncclAsyncJob*); + void(*destructor)(void*); + ncclGroupJobState_t state; + volatile uint32_t *abortFlag; /* point to comm abortFlag */ + volatile uint32_t *childAbortFlag; /* point to child abortFlag */ + ncclComm_t comm; +}; + +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*), ncclComm_t comm +); + +struct ncclGroupJob { + struct ncclAsyncJob base; + struct ncclComm **groupCommHeadPtr; + struct ncclComm **groupCommPreconnectHeadPtr; + ncclResult_t *groupErrorPtr; + volatile bool *abortFlagPtr; + int *groupBlockingPtr; + struct ncclIntruQueue *asyncJobsPtr; + bool initialized; +}; + +ncclResult_t ncclGroupStartInternal(); +ncclResult_t ncclGroupEndInternal(); +ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); + +//////////////////////////////////////////////////////////////////////////////// + +extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting +extern __thread ncclResult_t ncclGroupError; +extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommPreconnectHead; +extern __thread int ncclGroupBlocking; +extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; +extern __thread struct ncclGroupJob ncclGroupJobMain; + +static inline void groupResetJobState() { + ncclGroupBlocking = -1; + ncclGroupJobMainPtr = NULL; + memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); + return; +} + +static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { + ncclResult_t ret = ncclSuccess; + if (job) { + ret = ncclAsyncJobComplete(&job->base); + groupResetJobState(); + } + return ret; +} + +inline ncclResult_t ncclGroupStartInternal() { + ncclGroupDepth++; + return ncclSuccess; +} + +inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { + if (ncclGroupDepth > 0) { + if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; + } + return ret; +} + +// Add comm to this thread's group +inline void ncclGroupCommJoin(struct ncclComm* comm) { + if (comm->groupNext == reinterpret_cast(0x1)) { + // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves + // the users program order yet insures siblings occur consecutively. This + // is required by doLaunches() in "group.cc". 
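+    // Concretely, the loop below walks the list until it hits the first comm sharing
+    // this comm's intraComm0 (or reaches the end of the list) and splices the new comm
+    // in at that position, so sibling comms always end up adjacent.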
+ struct ncclComm** pp = &ncclGroupCommHead; + while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) + pp = &(*pp)->groupNext; + comm->groupNext = *pp; + *pp = comm; + // Comms gets a new memory stack scope upon joining. Each task batched for + // this comm is allocated there. + ncclMemoryStackPush(&comm->memScoped); + } + + ncclGroupBlocking = comm->config.blocking; +} + +// Add comm to this thread's group needing preconnect +inline void ncclGroupCommPreconnect(struct ncclComm* comm) { + if (comm->preconnectNext == reinterpret_cast(0x1)) { + comm->preconnectNext = ncclGroupCommPreconnectHead; + ncclGroupCommPreconnectHead = comm; + } +} + +// Comm has left group +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { + comm->groupNext = reinterpret_cast(0x1); + ncclMemoryStackPop(&comm->memScoped); + return ncclSuccess; +} + +#endif diff --git a/nvls/ibvcore.h b/nvls/ibvcore.h new file mode 100644 index 000000000..8d8ecf1ec --- /dev/null +++ b/nvls/ibvcore.h @@ -0,0 +1,1058 @@ +#ifndef NCCL_IBV_CORE_H_ +#define NCCL_IBV_CORE_H_ + +/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without + * explicit including of IB verbs header. + */ + +#include +#include +#include +#include + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) \ + ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) +#endif + +#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; + +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC, + + /* Leave a gap for future node types before starting with + * experimental node types. + */ + IBV_EXP_NODE_TYPE_START = 32, + IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP, + + /* Leave a gap for future transport types before starting with + * experimental transport types. 
+ */ + IBV_EXP_TRANSPORT_TYPE_START = 32, + IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START +}; + +enum ibv_device_cap_flags { + IBV_DEVICE_RESIZE_MAX_WR = 1, + IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IBV_DEVICE_RAW_MULTI = 1 << 3, + IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, + IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, + IBV_DEVICE_INIT_TYPE = 1 << 9, + IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IBV_DEVICE_SRQ_RESIZE = 1 << 13, + IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, + IBV_DEVICE_XRC = 1 << 20, + IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, + + /* Leave a gap for future link layer types before starting with + * experimental link layer. 
+ */ + IBV_EXP_LINK_LAYER_START = 32, + IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START +}; + +enum ibv_port_cap_flags { + IBV_PORT_SM = 1 << 1, + IBV_PORT_NOTICE_SUP = 1 << 2, + IBV_PORT_TRAP_SUP = 1 << 3, + IBV_PORT_OPT_IPD_SUP = 1 << 4, + IBV_PORT_AUTO_MIGR_SUP = 1 << 5, + IBV_PORT_SL_MAP_SUP = 1 << 6, + IBV_PORT_MKEY_NVRAM = 1 << 7, + IBV_PORT_PKEY_NVRAM = 1 << 8, + IBV_PORT_LED_INFO_SUP = 1 << 9, + IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IBV_PORT_CM_SUP = 1 << 16, + IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IBV_PORT_REINIT_SUP = 1 << 18, + IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, + IBV_PORT_VENDOR_CLASS = 1 << 24, + IBV_PORT_CLIENT_REG_SUP = 1 << 25, + IBV_PORT_IP_BASED_GIDS = 1 << 26, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, + + /* new experimental events start here leaving enough + * room for 14 events which should be enough + */ + IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, + IBV_EXP_EVENT_DCT_ACCESS_ERR, + IBV_EXP_EVENT_DCT_REQ_ERR, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + struct ibv_exp_dct *dct; + int port_num; + /* For source compatible with Legacy API */ + uint32_t xrc_qp_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). 
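+ * For example, both IBV_WC_RECV and IBV_WC_RECV_RDMA_WITH_IMM satisfy
+ * (opcode & IBV_WC_RECV) != 0, while the send-side opcodes above do not.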
+ */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1 +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_RELAXED_ORDERING = (1<<20), +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +enum ibv_xrcd_init_attr_mask { + IBV_XRCD_INIT_ATTR_FD = 1 << 0, + IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, + IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_xrcd_init_attr { + uint32_t comp_mask; + int fd; + int oflags; +}; + +struct ibv_xrcd { + struct ibv_context *context; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; +}; + +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + uint32_t version_tclass_flow; + uint16_t paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. 
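+ * For example, mbps_to_ibv_rate(5000) yields IBV_RATE_5_GBPS.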
+ */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_srq_type { + IBV_SRQT_BASIC, + IBV_SRQT_XRC +}; + +enum ibv_srq_init_attr_mask { + IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, + IBV_SRQ_INIT_ATTR_PD = 1 << 1, + IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, + IBV_SRQ_INIT_ATTR_CQ = 1 << 3, + IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_srq_init_attr_ex { + void *srq_context; + struct ibv_srq_attr attr; + + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct ibv_cq *cq; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + /* XRC compatible code */ + IBV_QPT_XRC, + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_RAW_ETH = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV, + + /* Leave a gap for future qp types before starting with + * experimental qp types. + */ + IBV_EXP_QP_TYPE_START = 32, + IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + /* Below is needed for backwards compatabile */ + struct ibv_xrc_domain *xrc_domain; +}; + +enum ibv_qp_init_attr_mask { + IBV_QP_INIT_ATTR_PD = 1 << 0, + IBV_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_QP_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_qp_init_attr_ex { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; +}; + +enum ibv_qp_open_attr_mask { + IBV_QP_OPEN_ATTR_NUM = 1 << 0, + IBV_QP_OPEN_ATTR_XRCD = 1 << 1, + IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, + IBV_QP_OPEN_ATTR_TYPE = 1 << 3, + IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_qp_open_attr { + uint32_t comp_mask; + uint32_t qp_num; + struct ibv_xrcd *xrcd; + void *qp_context; + enum ibv_qp_type qp_type; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20 +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR, + IBV_QPS_UNKNOWN +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu 
path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3 +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + uint32_t imm_data; /* in network byte order */ + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + + uint32_t xrc_remote_srq_num; + }; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_mw_bind { + uint64_t wr_id; + struct ibv_mr *mr; + void *addr; + size_t length; + int send_flags; + int mw_access_flags; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + + /* below are for source compatabilty with legacy XRC, + * padding based on ibv_srq_legacy. 
+ */ + uint32_t xrc_srq_num_bin_compat_padding; + struct ibv_xrc_domain *xrc_domain_bin_compat_padding; + struct ibv_cq *xrc_cq_bin_compat_padding; + void *ibv_srq_padding; + + /* legacy fields */ + uint32_t xrc_srq_num; + struct ibv_xrc_domain *xrc_domain; + struct ibv_cq *xrc_cq; +}; + +/* Not in use in new API, needed for compilation as part of source compat layer */ +enum ibv_event_flags { + IBV_XRC_QP_EVENT_FLAG = 0x80000000, +}; + + + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +enum ibv_flow_flags { + IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, + IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, +}; + +enum ibv_flow_attr_type { + /* steering according to rule specifications */ + IBV_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_MC_DEFAULT = 0x2, +}; + +enum ibv_flow_spec_type { + IBV_FLOW_SPEC_ETH = 0x20, + IBV_FLOW_SPEC_IPV4 = 0x30, + IBV_FLOW_SPEC_TCP = 0x40, + IBV_FLOW_SPEC_UDP = 0x41, +}; + +struct ibv_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_flow_spec_eth { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_eth_filter val; + struct ibv_flow_eth_filter mask; +}; + +struct ibv_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_flow_spec_ipv4 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_filter val; + struct ibv_flow_ipv4_filter mask; +}; + +struct ibv_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_flow_spec_tcp_udp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tcp_udp_filter val; + struct ibv_flow_tcp_udp_filter mask; +}; + +struct ibv_flow_spec { + union { + struct { + enum ibv_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_ipv4 ipv4; + struct ibv_flow_spec_tcp_udp tcp_udp; + }; +}; + +struct ibv_flow_attr { + uint32_t comp_mask; + enum ibv_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx [L2] + * struct ibv_flow_spec_yyy [L3/L4] + */ +}; + +struct ibv_flow { + uint32_t comp_mask; + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_device; +struct ibv_context; + +struct ibv_device_ops { + struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); + void (*free_context)(struct ibv_context *context); +}; + +enum { + 
IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256 +}; + +struct ibv_device { + struct ibv_device_ops ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct verbs_device { + struct ibv_device device; /* Must be first */ + size_t sz; + size_t size_of_context; + int (*init_context)(struct verbs_device *device, + struct ibv_context *ctx, int cmd_fd); + void (*uninit_context)(struct verbs_device *device, + struct ibv_context *ctx); + /* future fields added here */ +}; + +struct ibv_context_ops { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int (*dealloc_mw)(struct ibv_mw *mw); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * (*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*async_event)(struct ibv_async_event *event); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; +}; + +enum verbs_context_mask { + VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, 
+ VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, + VERBS_CONTEXT_QP = (uint64_t)1 << 2, + VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, + VERBS_CONTEXT_EXP = (uint64_t)1 << 62 +}; + +struct verbs_context { + /* "grows up" - new fields go here */ + int (*_reserved_2) (void); + int (*destroy_flow) (struct ibv_flow *flow); + int (*_reserved_1) (void); + struct ibv_flow * (*create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + struct ibv_qp * (*open_qp)(struct ibv_context *context, + struct ibv_qp_open_attr *attr); + struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t has_comp_mask; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context;/* Must be last field in the struct */ +}; + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +{ + return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? + NULL : container_of(ctx, struct verbs_context, context); +} + +#define verbs_get_ctx_op(ctx, op) ({ \ + struct verbs_context *_vctx = verbs_get_ctx(ctx); \ + (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ + !_vctx->op) ? NULL : _vctx; })*/ + +#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ + struct verbs_context *vctx = _vctx; \ + if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ + vctx->op = ptr; }) + +static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) +{ + return (dev->ops.alloc_context) ? + NULL : container_of(dev, struct verbs_device, device); +} + +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +struct ibv_ece { + /* + * Unique identifier of the provider vendor on the network. + * The providers will set IEEE OUI here to distinguish + * itself in non-homogenius network. + */ + uint32_t vendor_id; + /* + * Provider specific attributes which are supported or + * needed to be enabled by ECE users. 
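+ * For example, a provider can use these bits to advertise and negotiate optional
+ * transport features between the two sides during connection establishment.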
+ */ + uint32_t options; + uint32_t comp_mask; +}; + +#endif // NCCL_IBV_CORE_H_ diff --git a/nvls/ibvsymbols.h b/nvls/ibvsymbols.h new file mode 100644 index 000000000..906b0df74 --- /dev/null +++ b/nvls/ibvsymbols.h @@ -0,0 +1,46 @@ +#ifndef NCCL_IBV_SYMBOLS_H_ +#define NCCL_IBV_SYMBOLS_H_ + +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else +#include "ibvcore.h" +#endif + +#include "nccl.h" + +/* IB Verbs Function Pointers*/ +struct ncclIbvSymbols { + int (*ibv_internal_fork_init)(void); + struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); + void (*ibv_internal_free_device_list)(struct ibv_device **list); + const char * (*ibv_internal_get_device_name)(struct ibv_device *device); + struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); + int (*ibv_internal_close_device)(struct ibv_context *context); + int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); + void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); + int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); + int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); + int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); + int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); + struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); + int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); + struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); + /* DMA-BUF support */ + struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); + int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); + struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); + int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); + struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); + int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); + int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); + const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); + int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); + int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); +}; + +/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ +ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); + +#endif // NCCL_IBV_SYMBOLS_H_ diff --git a/nvls/ibvwrap.h b/nvls/ibvwrap.h new file mode 100644 index 000000000..c3709584c --- /dev/null +++ b/nvls/ibvwrap.h @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_IBVWRAP_H_ +#define NCCL_IBVWRAP_H_ + +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else +#include "ibvcore.h" +#endif + +#include "core.h" +#include +#include + +typedef enum ibv_return_enum +{ + IBV_SUCCESS = 0, //!< The operation was successful +} ibv_return_t; + +ncclResult_t wrap_ibv_symbols(void); +/* NCCL wrappers of IB verbs functions */ +ncclResult_t wrap_ibv_fork_init(void); +ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); +ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); +const char *wrap_ibv_get_device_name(struct ibv_device *device); +ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); +ncclResult_t wrap_ibv_close_device(struct ibv_context *context); +ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); +ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); +ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); +ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); +ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); +ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); +ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); +ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); +ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); +struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); +ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); +ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); +ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); +ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); +static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { + int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ + if (done < 0) { + WARN("Call to ibv_poll_cq() returned %d", done); + return ncclSystemError; + } + *num_done = done; + return ncclSuccess; +} +ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); +ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); +ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, 
struct ibv_ece *ece, int* supported); + +static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + if (ret != IBV_SUCCESS) { + WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); + return ncclSystemError; + } + return ncclSuccess; +} + +static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { + int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + if (ret != IBV_SUCCESS) { + WARN("ibv_post_recv() failed with error %s", strerror(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); + +#endif //End include guard diff --git a/nvls/info.h b/nvls/info.h new file mode 100644 index 000000000..f65ed2e69 --- /dev/null +++ b/nvls/info.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" +#include "device.h" +#include "collectives.h" +#include "core.h" +#include "utils.h" +#include "strongstream.h" + +typedef enum : uint8_t { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, + ncclPatternNvls, + ncclPatternNvlsTree, + ncclPatternSend, + ncclPatternRecv +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclFunc_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; // peer for p2p operations + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclDevRedOpFull opFull; + int algorithm; + int protocol; + ncclPattern_t pattern; + int nChannels; + int nThreads; + size_t nBytes; + size_t sendbuffSize; + size_t recvbuffSize; + int nstepsPerLoop; + int nchunksPerLoop; + int chunkSize; + int channelId; +}; + +inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { + info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank + + /* compute buffer size for NVLS buffer registration */ + if (info->coll == ncclFuncAllGather) { + info->sendbuffSize = info->count * ncclTypeSize(info->datatype); + info->recvbuffSize = info->sendbuffSize * nRanks; + } else if (info->coll == ncclFuncReduceScatter) { + info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + info->sendbuffSize = info->recvbuffSize * nRanks; + } else { + info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + } + return ncclSuccess; +} + +struct 
ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclDevRedOpFull op; + int chunkSteps, sliceSteps; +}; +struct ncclTaskP2p { + ncclTaskP2p *next; + void *buff; + size_t bytes; + // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track + // of where it left off. + int chunk; +}; + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; +struct ncclTasks { + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclIntruQueue collQueue; + size_t collBytesTotal; + struct Peer* peers/*[nRanks]*/; + int *p2pSendOrder, *p2pRecvOrder; + int p2pOrderSteps; + int nTasksColl, nTasksP2p; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; +}; + +#endif diff --git a/nvls/ipcsocket.cc b/nvls/ipcsocket.cc new file mode 100644 index 000000000..9d66ac719 --- /dev/null +++ b/nvls/ipcsocket.cc @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. + * + * See COPYRIGHT for license information + */ + +#include "ipcsocket.h" +#include "utils.h" +#include +#include +#include + +// Enable Linux abstract socket naming +#define USE_ABSTRACT_SOCKET + +#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" + +/* + * Create a Unix Domain Socket + */ +ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { + int fd = -1; + struct sockaddr_un cliaddr; + char temp[NCCL_IPC_SOCKNAME_LEN] = ""; + + if (handle == NULL) { + return ncclInternalError; + } + + handle->fd = -1; + handle->socketName[0] = '\0'; + if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { + WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); + return ncclSystemError; + } + + bzero(&cliaddr, sizeof(cliaddr)); + cliaddr.sun_family = AF_UNIX; + + // Create unique name for the socket. + int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); + if (len > (sizeof(cliaddr.sun_path) - 1)) { + WARN("UDS: Cannot bind provided name to socket. 
Name too large"); + return ncclInternalError; + } +#ifndef USE_ABSTRACT_SOCKET + unlink(temp); +#endif + + TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); + + strncpy(cliaddr.sun_path, temp, len); +#ifdef USE_ABSTRACT_SOCKET + cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick +#endif + if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { + WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); + close(fd); + return ncclSystemError; + } + + handle->fd = fd; + strcpy(handle->socketName, temp); + + handle->abortFlag = abortFlag; + // Mark socket as non-blocking + if (handle->abortFlag) { + int flags; + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { + if (handle == NULL) { + WARN("ncclSocketGetFd: pass NULL socket"); + return ncclInvalidArgument; + } + if (fd) *fd = handle->fd; + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { + if (handle == NULL) { + return ncclInternalError; + } + if (handle->fd <= 0) { + return ncclSuccess; + } +#ifndef USE_ABSTRACT_SOCKET + if (handle->socketName[0] != '\0') { + unlink(handle->socketName); + } +#endif + close(handle->fd); + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { + struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; + struct iovec iov[1]; + + // Union to guarantee alignment requirements for control array + union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; + } control_un; + + struct cmsghdr *cmptr; + char dummy_buffer[1]; + int ret; + + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } + + msg.msg_iov = iov; + msg.msg_iovlen = 1; + + while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + WARN("UDS: Receiving data over socket failed : %d", errno); + return ncclSystemError; + } + if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; + } + + if (recvFd != NULL) { + if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { + if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { + WARN("UDS: Receiving data over socket failed"); + return ncclSystemError; + } + + memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); + } else { + WARN("UDS: Receiving data over socket %s failed", handle->socketName); + return ncclSystemError; + } + TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { + return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); +} + +ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { + struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; + struct iovec iov[1]; + char temp[NCCL_IPC_SOCKNAME_LEN]; + + union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; + } control_un; + + struct cmsghdr *cmptr; + char dummy_buffer[1]; + struct sockaddr_un cliaddr; + + // Construct client address to send this shareable handle to + bzero(&cliaddr, sizeof(cliaddr)); + 
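+  // Build the peer's socket name from its rank and hash (the same scheme used
+  // in ncclIpcSocketInit above). If sendFd is valid it is attached below as
+  // SCM_RIGHTS ancillary data, so the kernel duplicates the descriptor into
+  // the receiving process.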
cliaddr.sun_family = AF_UNIX; + + int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); + if (len > (sizeof(cliaddr.sun_path) - 1)) { + WARN("UDS: Cannot connect to provided name for socket. Name too large"); + return ncclInternalError; + } + (void) strncpy(cliaddr.sun_path, temp, len); + +#ifdef USE_ABSTRACT_SOCKET + cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick +#endif + + TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); + + if (sendFd != -1) { + TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); + + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + } + + msg.msg_name = (void *)&cliaddr; + msg.msg_namelen = sizeof(struct sockaddr_un); + + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + + ssize_t sendResult; + while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); + return ncclSystemError; + } + if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { + return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); +} diff --git a/nvls/ipcsocket.h b/nvls/ipcsocket.h new file mode 100644 index 000000000..ccecde84c --- /dev/null +++ b/nvls/ipcsocket.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. + * + * See COPYRIGHT for license information + */ + +#ifndef NCCL_IPCSOCKET_H +#define NCCL_IPCSOCKET_H + +#include "nccl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NCCL_IPC_SOCKNAME_LEN 64 + +struct ncclIpcSocket { + int fd; + char socketName[NCCL_IPC_SOCKNAME_LEN]; + volatile uint32_t* abortFlag; +}; + +ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); +ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); + +ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); +ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); + +#endif /* NCCL_IPCSOCKET_H */ diff --git a/nvls/nccl_common.h b/nvls/nccl_common.h new file mode 100644 index 000000000..a37ac203e --- /dev/null +++ b/nvls/nccl_common.h @@ -0,0 +1,33 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEBUG_H_ +#define NCCL_DEBUG_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#endif diff --git a/nvls/nccl_net.h b/nvls/nccl_net.h new file mode 100644 index 000000000..9b3e6719f --- /dev/null +++ b/nvls/nccl_net.h @@ -0,0 +1,333 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 32 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef ncclNetProperties_v7_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef ncclNet_v7_t ncclNet_t; + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7 + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7 + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. 
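+
+  // Illustrative sketch only (myGetProperties and its values are placeholders,
+  // not part of this header): a net plugin would fill these fields roughly as
+  //
+  //   static ncclResult_t myGetProperties(int dev, ncclNetProperties_v6_t* props) {
+  //     props->name = (char*)"example0";
+  //     props->pciPath = NULL;                          // unknown PCI location
+  //     props->guid = (uint64_t)dev;
+  //     props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA;
+  //     props->speed = 100000;                          // 100 Gbit/s, in Mbps
+  //     props->port = 1;
+  //     props->latency = 0;
+  //     props->maxComms = 1024;
+  //     props->maxRecvs = 1;
+  //     return ncclSuccess;
+  //   }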
+} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. 
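+  // (Typical flow, sketched for illustration: every rank calls listen(), the
+  // resulting handles are exchanged out of band, and each rank then passes the
+  // full handle array to connect() below to obtain its collective comm.)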
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +typedef ncclCollNet_v7_t ncclCollNet_t; + +// v6 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. 
+ // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
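+  // (Illustrative caller pattern: post transfers with isend()/irecv(), treat a
+  // NULL request as "retry later", and poll test(request, &done, sizes) until
+  // done is non-zero before reusing the buffers.)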
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +#endif // end include guard diff --git a/nvls/nccl_tuner.h b/nvls/nccl_tuner.h new file mode 100644 index 000000000..b4a696e38 --- /dev/null +++ b/nvls/nccl_tuner.h @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // nNodes: number of nodes in current communicator. 
+ // logFunction: a logFunction can be useful to integrate logging together with NCCL core. + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - collNetTypeSupport: whether collnet supports this type + // - nvlsTypeSupport: whether nvlink sharp supports this time + // - numPipeOps: number of operations in the group + // + // Outputs: + // - algorithm: selected algorithm to be used for the given collective + // - protocol: selected protocol to be used for the given collective + // - nChannels: number of channels (hence SMs) to be used. + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + int *algorithm, int *protocol, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + ncclResult_t (*destroy)(); +} ncclTuner_v1_t; + +typedef ncclTuner_v1_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" + +#endif diff --git a/nvls/net.h b/nvls/net.h new file mode 100644 index 000000000..b5df58968 --- /dev/null +++ b/nvls/net.h @@ -0,0 +1,27 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_NET_H_ +#define NCCL_INT_NET_H_ + +#include "nccl.h" +#include "nccl_net.h" +#include "comm.h" +#include "checks.h" + +typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; + +ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetInit(struct ncclComm* comm); +int ncclNetVersion(struct ncclComm* comm); + +// Test whether the current GPU support GPU Direct RDMA. +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); + +extern ncclNet_t ncclNetIb; +extern ncclNet_t ncclNetSocket; + +#endif diff --git a/nvls/net_device.h b/nvls/net_device.h new file mode 100644 index 000000000..8f7c0d6e1 --- /dev/null +++ b/nvls/net_device.h @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_DEVICE_H_ +#define NCCL_NET_DEVICE_H_ + +#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 +#define NCCL_NET_MTU_SIZE 4096 + +// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin +// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
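+// (For example, a plugin offering the unpack offload would report
+// props.netDeviceType = NCCL_NET_DEVICE_UNPACK and props.netDeviceVersion =
+// NCCL_NET_DEVICE_UNPACK_VERSION from getProperties(); with any other version
+// NCCL cannot use the device offload path for that plugin.)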
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 + +typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; + +typedef struct { + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + void* handle; + size_t size; + int needsProxyProgress; +} ncclNetDeviceHandle_v7_t; + +typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; + +#endif diff --git a/nvls/nvmlwrap.h b/nvls/nvmlwrap.h new file mode 100644 index 000000000..2ab8e3a2b --- /dev/null +++ b/nvls/nvmlwrap.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NVMLWRAP_H_ +#define NCCL_NVMLWRAP_H_ + +#include "nccl.h" + +//#define NCCL_NVML_DIRECT 1 +#ifndef NCCL_NVML_DIRECT +#define NCCL_NVML_DIRECT 0 +#endif + +#if NCCL_NVML_DIRECT +#include "nvml.h" +#else +// Dynamically handle dependencies on NVML + +/* Extracted from nvml.h */ +typedef struct nvmlDevice_st* nvmlDevice_t; +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 + +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +typedef enum nvmlReturn_enum +{ + NVML_SUCCESS = 0, //!< The operation was successful + NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() + NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid + NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device + NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation + NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting + NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful + NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough + NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached + NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded + NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed + NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU + NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded + NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function + NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted + NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible + NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again + NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, 
//!< RM detects a driver/library version mismatch + NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use + NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred +} nvmlReturn_t; + +typedef struct nvmlPciInfo_st +{ + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + // NVIDIA reserved for internal use only + unsigned int reserved0; + unsigned int reserved1; + unsigned int reserved2; + unsigned int reserved3; +} nvmlPciInfo_t; + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +} nvmlGpuP2PCapsIndex_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Field Identifiers. + * + * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. + */ + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links +#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device + +/** + * Remote device NVLink ID + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. 
+ */ +#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID + +/** + * NVSwitch: connected NVLink count + */ +#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch + +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 +#define NVML_FI_DEV_NVLINK_GET_STATE 165 +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 + +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above + +/** + * Information for a Field Value Sample + */ +typedef struct nvmlFieldValue_st +{ + unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. + unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. + long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 + long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. + nvmlValueType_t valueType; //!< Type of the value stored in value + nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS + nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS +} nvmlFieldValue_t; + +/* End of nvml.h */ +#endif // NCCL_NVML_DIRECT + +constexpr int ncclNvmlMaxDevices = 32; +struct ncclNvmlDeviceInfo { + nvmlDevice_t handle; + int computeCapabilityMajor, computeCapabilityMinor; +}; +struct ncclNvmlDevicePairInfo { + nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; +}; +extern int ncclNvmlDeviceCount; +extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; + +// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. +// Outsiders need only call it if they want to inspect the ncclNvml global +// tables above. 
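+//
+// For example (illustrative only):
+//
+//   NCCLCHECK(ncclNvmlEnsureInitialized());
+//   for (int i = 0; i < ncclNvmlDeviceCount; i++) {
+//     INFO(NCCL_INIT, "dev %d: sm_%d%d", i,
+//          ncclNvmlDevices[i].computeCapabilityMajor,
+//          ncclNvmlDevices[i].computeCapabilityMinor);
+//   }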
+ncclResult_t ncclNvmlEnsureInitialized(); + +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); +ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); +ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + +#endif // End include guard diff --git a/nvls/nvtx.h b/nvls/nvtx.h new file mode 100644 index 000000000..ab32ef27f --- /dev/null +++ b/nvls/nvtx.h @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NVTX_H_ +#define NCCL_NVTX_H_ + +#include "nvtx3/nvtx3.hpp" + +#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) +#define NVTX3_CONSTEXPR_IF_CPP14 constexpr +#else +#define NVTX3_CONSTEXPR_IF_CPP14 +#endif + +// Define all NCCL-provided static schema IDs here (avoid duplicates). +#define NVTX_SID_CommInitRank 0 +#define NVTX_SID_CommInitAll 1 +#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_AllGather 4 +#define NVTX_SID_AllReduce 5 +#define NVTX_SID_Broadcast 6 +#define NVTX_SID_ReduceScatter 7 +#define NVTX_SID_Reduce 8 +#define NVTX_SID_Send 9 +#define NVTX_SID_Recv 10 + +// Define static schema ID for the reduction operation. 
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + +extern const nvtxDomainHandle_t ncclNvtxDomainHandle; + +struct nccl_domain{static constexpr char const* name{"NCCL"};}; + +class payload_schema { + public: + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + { + schema_attr.name = schemaName; + schema_attr.entries = entries; + schema_attr.numEntries = numEntries; + schema_attr.schemaId = schemaId; + nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); + } + + payload_schema() = delete; + ~payload_schema() = default; + payload_schema(payload_schema const&) = default; + payload_schema& operator=(payload_schema const&) = default; + payload_schema(payload_schema&&) = default; + payload_schema& operator=(payload_schema&&) = default; + + private: + nvtxPayloadSchemaAttr_t schema_attr{ + NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | + NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + nullptr, + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + nullptr, 0, 0, 0}; +}; + +// Create NVTX push/pop range with parameters +// @param name of the operation (see `NVTX_SID_*`) +// @param N schema name +// @param S schema (entries) +// @param P payload (struct) +#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ + static const payload_schema schema{S, std::extent::value, \ + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + nvtxPayloadData_t nvtx3_bpl__[] = { \ + {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ + ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + +extern void initNvtxRegisteredEnums(); + +#endif diff --git a/nvls/p2p.h b/nvls/p2p.h new file mode 100644 index 000000000..6ffba4b0e --- /dev/null +++ b/nvls/p2p.h @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include + +#ifndef NCCL_P2P_H_ +#define NCCL_P2P_H_ + +#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + +typedef struct { + uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support +} ncclCuDesc; + +typedef union { + // Legacy CUDA IPC + cudaIpcMemHandle_t devIpc; + // cuMem API support + ncclCuDesc cuDesc; +} ncclIpcDesc; + +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); +ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); +ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); + +#endif diff --git a/nvls/param.h b/nvls/param.h new file mode 100644 index 000000000..963da9d17 --- /dev/null +++ b/nvls/param.h @@ -0,0 +1,30 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PARAM_H_ +#define NCCL_PARAM_H_ + +#include + +const char* userHomeDir(); +void setEnvFile(const char* fileName); +void initEnv(); +const char *ncclGetEnv(const char *name); + +void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); + +#define NCCL_PARAM(name, env, deftVal) \ + int64_t ncclParam##name() { \ + constexpr int64_t uninitialized = INT64_MIN; \ + static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ + static int64_t cache = uninitialized; \ + if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ + ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ + } \ + return cache; \ + } + +#endif diff --git a/nvls/profiler.h b/nvls/profiler.h new file mode 100644 index 000000000..103af99ad --- /dev/null +++ b/nvls/profiler.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include "proxy.h" + +enum ncclProxyProfileState { + ncclProxyProfileBegin = 0, + + ncclProxyProfileSendGPUWait = 1, + ncclProxyProfileSendWait = 2, + + ncclProxyProfileRecvWait = 1, + ncclProxyProfileRecvFlushWait = 2, + ncclProxyProfileRecvGPUWait = 3, + + ncclProxyProfileEnd = 4, + + ncclProxyProfileSleep = 8, + ncclProxyProfileWakeup = 9, + + ncclProxyProfileIdle = 16, + ncclProxyProfileActive = 17, + + ncclProxyProfileAppend = 24, + ncclProxyProfileAppendEnd = 25 +}; + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); +void ncclProfilingDump(); + +#endif diff --git a/nvls/proxy.h b/nvls/proxy.h new file mode 100644 index 000000000..8093c0ce6 --- /dev/null +++ b/nvls/proxy.h @@ -0,0 +1,296 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROXY_H_ +#define NCCL_PROXY_H_ + +#include "device.h" +#include "info.h" +#include "socket.h" +#include "ipcsocket.h" +#include "nccl_net.h" +#include +#include "shm.h" +#include "p2p.h" + +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); + +#define NCCL_PROXY_MAX_SUBS MAXCHANNELS +static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); + +struct ncclProxyOp { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int root; + int next; + + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; + uint8_t protocol; + + union { + uint64_t unused; + // For use by enqueue.cc + struct ncclProxyOp *enqNext; + }; +}; +static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); + +struct ncclProxySubArgs { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int peer; + + int groupSize; // Number of consecutive sub operations sharing the same recvComm + uint64_t base; + uint64_t posted; + uint64_t received; + uint64_t flushed; + uint64_t transmitted; + uint64_t done; + uint64_t end; + void* requests[NCCL_STEPS]; + void* profilingEvents[NCCL_STEPS]; + void* recvRequestsCache[NCCL_STEPS]; + int recvRequestsSubCount; +}; + +struct ncclProxyArgs { + struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; + proxyProgressFunc_t progress; + int nsubs; + int done; + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; + uint8_t protocol; + int state; + char* sharedBuff[NCCL_STEPS]; + int sharedSize[NCCL_STEPS]; + + int idle; + + // Element linking + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; + struct ncclProxyArgs** proxyAppendPtr; +}; +#define NCCL_MAX_NETDEVS 128 + +// ProxyOps are used to communicate between main thread and service thread +// Make sure we have enough to store two full rounds of operations on all channels. +// Otherwise we'd be unable to post half of them to free new elements. 
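+// For a rough sense of scale: taking MAXCHANNELS = 32 and NCCL_MAX_WORK_ELEMENTS_P2P = 8
+// as illustrative values (the real definitions live in device.h), MAX_OPS_PER_PEER below
+// works out to 2*32*8 = 512, so the per-node ops[] pool holds 512 * NCCL_MAX_LOCAL_RANKS(64)
+// = 32768 entries; at the 64-byte ncclProxyOp size enforced by the static_assert above,
+// that is a 2 MiB shared-memory pool.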
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +#define NCCL_MAX_LOCAL_RANKS 64 +struct ncclProxyOpsPool { + struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; + volatile int nextOps; + volatile int nextOpsEnd; + volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +struct ncclProxyOps { + ncclProxyOpsPool* pool; + ncclShmHandle_t handle; + int count; + int freeOp; + int nextOps; + int nextOpsEnd; +}; + +struct ncclProxySharedP2p { + int refcount; + int size; + char* cudaBuff; + char* hostBuff; + // CUDA IPC + ncclIpcDesc ipcDesc; + struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv +}; + +struct ncclProxyPeer { + struct ncclProxySharedP2p send; + struct ncclProxySharedP2p recv; +}; + +struct ncclSharedNetComms { + void* sendComm[MAXCHANNELS]; + void* recvComm[MAXCHANNELS]; + int sendRefCount[MAXCHANNELS]; + int recvRefCount[MAXCHANNELS]; +}; + +struct ncclProxyPool; +struct ncclProxyProgressState { + // Used by main threads to send work to progress thread + struct ncclProxyOpsPool* opsPool; + ncclShmHandle_t handle; + char opsPoolShmSuffix[6]; + + pthread_t thread; + volatile int stop; + struct ncclProxyPeer** localPeers; + struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; + struct ncclProxyArgs* active; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; + int nextOps; +}; + +// Expected proxy response fifo +struct ncclExpectedProxyResponse { + void* opId; + int respSize; + bool done; + void* respBuff; + ncclResult_t res; + struct ncclExpectedProxyResponse* next; +}; + +struct ncclProxyAsyncOp { + int type; + struct ncclProxyConnection* connection; + int reqSize, respSize; + char *reqBuff, *respBuff; + void* opId; + ncclProxyAsyncOp* next; +}; + +struct ncclProxyLocalPeer { + struct ncclSocket sock; + int tpRank; + int tpLocalRank; + ncclProxyAsyncOp* asyncOps; + int asyncOpCounter; +}; + +// Common response header for all proxyOps +// We pack this into a struct to reduce the number of blocking send and recv calls +struct ncclProxyRpcResponseHeader { + void* opId; + ncclResult_t res; + int respSize; +}; + +struct ncclProxyState { + int refCount; + int tpRank; + int tpnRanks; + int tpLocalnRanks; + int cudaDev; + int p2pnChannels; + int p2pChunkSize; + int nChannels; + int buffSizes[NCCL_NUM_PROTOCOLS]; + bool allocP2pNetLLBuffers; + bool dmaBufSupport; + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; + volatile uint32_t* abortFlag; + // Service thread + pthread_t thread; + struct ncclSocket* listenSock; + int stop; + CUcontext cudaCtx; + ncclResult_t asyncResult; + + // Used by main thread + union ncclSocketAddress* peerAddresses; + struct ncclSocket* peerSocks; + struct ncclProxyOps* proxyOps; + void** sharedDevMems; + struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) + + // Progress thread + struct ncclProxyProgressState progressState; + + // Queue of expected responses from the proxy + struct ncclExpectedProxyResponse* expectedResponses; +}; + +enum proxyConnectState { + connUninitialized = 0, + connInitialized = 1, + connSharedInitialized = 2, + connSetupDone = 3, + connConnected = 4, + numConnStates = 5 +}; + +struct ncclProxyConnection { + int send, transport, shared; + int tpLocalRank, sameProcess; + struct ncclSocket* sock; + struct ncclTransportComm* tcomm; + struct ncclProxyArgs *proxyAppend; + struct ncclProxyArgs **proxyAppendPtr; + void* transportResources; + ncclNetDeviceHandle_t* netDeviceHandle; + void* mhandles[NCCL_NUM_PROTOCOLS]; + 
proxyConnectState state; + struct ncclCollNetSharedRes* collNet; + int needsProxyProgress; +}; + +typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); + +enum proxyMode { + proxyRing = 0, + proxyFrom = 1, + proxyTo = 2 +}; + +ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); +ncclResult_t ncclProxyStart(struct ncclComm* comm); +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); +ncclResult_t ncclProxyCreate(struct ncclComm* comm); +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); +enum ncclProxyMsgType { + ncclProxyMsgInit = 1, + ncclProxyMsgSharedInit = 2, + ncclProxyMsgSetup = 3, + ncclProxyMsgConnect = 4, + ncclProxyMsgStart = 5, + ncclProxyMsgClose = 6, + ncclProxyMsgAbort = 7, + ncclProxyMsgStop = 8, + ncclProxyMsgGetFd = 9, // cuMem API support (UDS) +}; + +// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types +// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of +// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed +ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); + +// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received +ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); +ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); + +ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd); + +ncclResult_t ncclProxyStop(struct ncclComm* comm); +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); +ncclResult_t ncclProxyDestroy(struct ncclComm* comm); +#endif diff --git a/nvls/shm.h b/nvls/shm.h new file mode 100644 index 000000000..e75caa6a6 --- /dev/null +++ b/nvls/shm.h @@ -0,0 +1,25 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SHM_H_ +#define NCCL_SHM_H_ + +#include "nccl.h" + +typedef void* ncclShmHandle_t; +ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); +ncclResult_t ncclShmClose(ncclShmHandle_t handle); +ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); + +struct ncclShmemCollBuff { + volatile size_t *cnt[2]; + volatile void *ptr[2]; + int round; +}; + +ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); + +#endif diff --git a/nvls/socket.h b/nvls/socket.h new file mode 100644 index 000000000..9e5137289 --- /dev/null +++ b/nvls/socket.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SOCKET_H_ +#define NCCL_SOCKET_H_ + +#include "nccl.h" +#include +#include +#include +#include +#include +#include + +#define MAX_IFS 16 +#define MAX_IF_NAME_SIZE 16 +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) +#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) +#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL + +/* Common socket address storage structure for IPv4/IPv6 */ +union ncclSocketAddress { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +enum ncclSocketState { + ncclSocketStateNone = 0, + ncclSocketStateInitialized = 1, + ncclSocketStateAccepting = 2, + ncclSocketStateAccepted = 3, + ncclSocketStateConnecting = 4, + ncclSocketStateConnectPolling = 5, + ncclSocketStateConnected = 6, + ncclSocketStateReady = 7, + ncclSocketStateClosed = 8, + ncclSocketStateError = 9, + ncclSocketStateNum = 10 +}; + +enum ncclSocketType { + ncclSocketTypeUnknown = 0, + ncclSocketTypeBootstrap = 1, + ncclSocketTypeProxy = 2, + ncclSocketTypeNetSocket = 3, + ncclSocketTypeNetIb = 4 +}; + +struct ncclSocket { + int fd; + int acceptFd; + int timedOutRetries; + int refusedRetries; + union ncclSocketAddress addr; + volatile uint32_t* abortFlag; + int asyncFlag; + enum ncclSocketState state; + int salen; + uint64_t magic; + enum ncclSocketType type; +}; + +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); +ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); +int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); + +// Initialize a socket +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); +// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call +ncclResult_t ncclSocketListen(struct ncclSocket* sock); +ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); +// Connect to sock->addr. sock->fd is set after a successful call. +ncclResult_t ncclSocketConnect(struct ncclSocket* sock); +// Return socket connection state. +ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); +// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
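+// Putting these declarations together, a minimal usage sketch (illustrative only; error
+// handling, abort flags and address setup are omitted, and the variable names are invented):
+//   struct ncclSocket listenSock, sock, peerSock;
+//   ncclSocketInit(&listenSock, &ifAddr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap);
+//   ncclSocketListen(&listenSock);                 // server side: listenSock.fd set on success
+//   ncclSocketInit(&sock, &listenSock.addr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap);
+//   ncclSocketConnect(&sock);                      // client side: connect to the listen address
+//   ncclSocketAccept(&peerSock, &listenSock);      // server side, declared just below
+//   ncclSocketSend(&sock, buf, size);              // client -> server
+//   ncclSocketRecv(&peerSock, buf, size);          // server <- client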
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); +ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); +ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); + +#define NCCL_SOCKET_SEND 0 +#define NCCL_SOCKET_RECV 1 + +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); +ncclResult_t ncclSocketClose(struct ncclSocket* sock); +#endif diff --git a/nvls/strongstream.h b/nvls/strongstream.h new file mode 100644 index 000000000..0984dfe57 --- /dev/null +++ b/nvls/strongstream.h @@ -0,0 +1,140 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_STRONGSTREAM_H_ +#define NCCL_STRONGSTREAM_H_ + +#include "nccl.h" +#include "checks.h" + +#include + +/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes + * easily. + */ +struct ncclCudaGraph { +#if CUDART_VERSION >= 11030 + cudaGraph_t graph; + unsigned long long graphId; +#endif +}; + +inline struct ncclCudaGraph ncclCudaGraphNone() { + struct ncclCudaGraph tmp; + #if CUDART_VERSION >= 11030 + tmp.graph = nullptr; + tmp.graphId = ULLONG_MAX; + #endif + return tmp; +} + +inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { + #if CUDART_VERSION >= 11030 + return graph.graph != nullptr; + #else + return false; + #endif +} + +inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { + #if CUDART_VERSION >= 11030 + return a.graphId == b.graphId; + #else + return true; + #endif +} + +ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); + +/* ncclStrongStream: An abstraction over CUDA streams that do not lose their + * identity while being captured. Regular streams have the deficiency that the + * captured form of a stream in one graph launch has no relation to the + * uncaptured stream or to the captured form in other graph launches. This makes + * streams unfit for the use of serializing access to a persistent resource. + * Strong streams have been introduced to address this need. + * + * - All updates to a strong stream must be enclosed by a Acquire/Release pair. + * + * - The Acquire, Release, and all updates take a ncclCudaGraph parameter + * indicating the currently capturing graph (or none). This parameter must be + * the same for the entire sequence of {Acquire; ...; Release}. + * + * - An {Acquire; ...; Release} sequence must not be concurrent with any + * other operations against the strong stream including graph launches which + * reference this stream. + */ +struct ncclStrongStream; + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); +ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); + +// Acquire-fence the strong stream. 
+ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss +); + +// Acquire-fence the strong stream assuming no graph is capturing. This permits +// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA +// calls. Strong stream still must be released via: +// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); + +// Release-fence of the strong stream. +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); + +// Add a host launch to the stream. +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + cudaHostFn_t fn, void* arg +); +// Add a kernel launch to the stream. +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +); + +// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. +// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus +// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the +// implementation to induce few graph dependencies. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false +); +// `b` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false +); +// `a` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false +); + +// Synchrnoization does not need the strong stream to be acquired. +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclStrongStreamGraph; // internal to ncclStrongStream + +struct ncclStrongStream { + // Used when not graph capturing. + cudaStream_t cudaStream; +#if CUDART_VERSION >= 11030 + // The event used to establish order between graphs and streams. During acquire + // this event is waited on, during release it is recorded to. + cudaEvent_t serialEvent; + // This stream ever appeared in a graph capture. + bool everCaptured; + // Tracks whether serialEvent needs to be recorded to upon Release(). + bool serialEventNeedsRecord; + struct ncclStrongStreamGraph* graphHead; +#else + cudaEvent_t scratchEvent; +#endif +}; + +#endif diff --git a/nvls/test.cu b/nvls/test.cu new file mode 100644 index 000000000..6e4d39bc8 --- /dev/null +++ b/nvls/test.cu @@ -0,0 +1,172 @@ +#include +#include +#include +#include +// #include +#include +#include +#include "ipcsocket.cc" + +#define CUCHECK(cmd) do { \ + auto err = cmd; \ + if( err != 0 ) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + } \ +} while(false) + +//AR kernel snippet for sm_90 only + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ + : "memory"); +//specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + +__global__ void testing2(float* uc_ptr){ + uc_ptr[0] = 1.0; + printf("ptr -> %f\n", uc_ptr[0]); +} + + +__global__ void testing(float* mc_ptr, int numlines, int myrank, int RANKS){ + //for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + //line is assumed to be 16B 4 ints of 8 halves + const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); + const int end_elem = max(start_elem, numlines); + const int lineoffset = (blockIdx.x * blockDim.x + threadIdx.x) * 4; + const int loop_step0 = (blockDim.x * gridDim.x) * 4; + __syncthreads(); + printf("start %d, end %d step %d\n", start_elem, end_elem, loop_step0); + for (int line = start_elem; line < end_elem; line += loop_step0) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); + printf("val %f\n", *(float*)&(val.x)); + MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); + } + __syncthreads(); +} + +int main(){ + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + CUresult res; + + size_t size = 1024*1024*512*3; + CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + gran = 0; + minGran = 0; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + // printf("gran = %lld, minGrad = %lld\n", gran, minGran); + size_t mcSize = ((size+gran-1)/gran)*gran; + mcProp.size = mcSize; + + CUmemGenericAllocationHandle handle; + //only one rank creates the multicast object + if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd, peerfd; + fd = 0; + peerfd = 0; + if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + //some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + volatile uint32_t abortFlag = 0; + struct ncclIpcSocket ipcSock = { 0 }; + uint64_t opId=0xdeadcafebeef; + // ncclResult_t ret = ncclSuccess; + + ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); + MPI_Barrier(MPI_COMM_WORLD); + if(!myrank) { + for(int p=1;p>>((float*)mc_va); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<<1, 1>>>((float*)mc_va, 1, myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +} +//........ 
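+// A hedged sketch of the CUDA 12.4 alternative mentioned in the comments above: with the
+// fabric handle type, the exported handle is plain bytes, so the UDS exchange can collapse
+// into a single MPI broadcast. CU_MEM_HANDLE_TYPE_FABRIC and CUmemFabricHandle are
+// assumptions taken from the 12.4 driver API and are not exercised anywhere in this patch:
+//   CUmemFabricHandle fh = {};
+//   mcProp.handleTypes = CU_MEM_HANDLE_TYPE_FABRIC;  // set before cuMulticastCreate
+//   if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+//   MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, 0, MPI_COMM_WORLD);
+//   if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, &fh, CU_MEM_HANDLE_TYPE_FABRIC));
+//   // every rank then calls cuMulticastAddDevice()/cuMulticastBindMem() exactly as above.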
+ + diff --git a/nvls/test2.cpp b/nvls/test2.cpp new file mode 100644 index 000000000..400d566ae --- /dev/null +++ b/nvls/test2.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CUCHECK(cmd) do { \ + auto err = cmd; \ + if( err != 0 ) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + } \ +} while(false) + +int main(){ + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + CUresult res; + + +CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + size_t mcSize = ((size+gran-1)/gran)*gran; + mcProp.size = mcSize; + + //only one rank creates the multicast object + if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd, peerfd; + if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + //some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + volatile uint32_t abortFlag = 0; + struct ncclIpcSocket ipcSock = { 0 }; + uint64_t opId=0xdeadcafebeef; + ncclResult_t ret = ncclSuccess; + + NCCLCHECK(ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag)); + MPI_Barrier(MPI_COMM_WORLD); + if(!myrank) + for(int p=1;p= 900 +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ + : "memory"); +//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); +#endif + +//for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction +//line is assumed to be 16B 4 ints of 8 halves +const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); +const int end_elem = max(start_elem, numlines); +__syncthreads(); + for (int line = start_elem; line < end_elem; line += loop_step0) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)) + MULTIMEM_ST(val, mc_ptr + (lineoffset + line)) + } +__syncthreads(); + +*/ diff --git a/nvls/timer.h b/nvls/timer.h new file mode 100644 index 000000000..284fec6e0 --- /dev/null +++ b/nvls/timer.h @@ -0,0 +1,60 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TIMER_H_ +#define NCCL_TIMER_H_ +#if ENABLE_TIMER +#include +#include +#include +static double freq = -1; +static void calibrate() { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t timeCycles = __rdtsc(); + double time = - tv.tv_sec*1E6 - tv.tv_usec; + uint64_t total = 0ULL; + for (int i=0; i<10000; i++) total += __rdtsc(); + gettimeofday(&tv, NULL); + timeCycles = __rdtsc() - timeCycles; + time += tv.tv_sec*1E6 + tv.tv_usec; + freq = timeCycles/time; +} +static inline double gettime() { + if (freq == -1) calibrate(); + return __rdtsc()/freq; +} +static uint64_t counts[8]; +static double times[8]; +static double startTimes[8]; +#define TIME_START(index) do { \ + counts[index]++; \ + startTimes[index] = gettime(); \ +} while (0); + +#define TIME_STOP(index) do { \ + times[index] += gettime() - startTimes[index]; \ +} while (0); + +#define TIME_CANCEL(index) do { \ + counts[index]--; \ +} while (0); + +#define TIME_PRINT(name) do { \ + printf("%s stats", name); \ + for (int i=0; i<8; i++) { \ + if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ + counts[i] = 0; \ + } \ + printf("\n"); \ +} while (0); +#else +#define TIME_START(index) while(0); +#define TIME_STOP(index) while(0); +#define TIME_CANCEL(index) while(0); +#define TIME_PRINT(name) +#endif +#endif diff --git a/nvls/transport.h b/nvls/transport.h new file mode 100644 index 000000000..27529df5e --- /dev/null +++ b/nvls/transport.h @@ -0,0 +1,128 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TRANSPORT_H_ +#define NCCL_TRANSPORT_H_ + +#include "device.h" +#include "graph.h" +#include "nvmlwrap.h" +#include "core.h" + +#define NTRANSPORTS 4 +#define TRANSPORT_P2P 0 +#define TRANSPORT_SHM 1 +#define TRANSPORT_NET 2 +#define TRANSPORT_COLLNET 3 + +#include "proxy.h" + +extern struct ncclTransport p2pTransport; +extern struct ncclTransport shmTransport; +extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; + +extern struct ncclTransport* ncclTransports[]; + +// Forward declarations +struct ncclRing; +struct ncclConnector; +struct ncclComm; + +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + int gdrSupport; + uint64_t hostHash; + uint64_t pidHash; + dev_t shmDev; + int64_t busId; + struct ncclComm* comm; + int cudaCompCap; +}; + +#define CONNECT_SIZE 128 +struct ncclConnect { + char data[CONNECT_SIZE]; +}; + +#if CUDART_VERSION >= 12010 + +#define NVLS_HANDLE_SIZE 64 +struct ncclNvlsSharedRes { + int refCount; + CUmulticastObjectProp properties; + CUmemAccessDesc accessDesc; + int dev; + size_t size; + size_t granularity; + CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer + char* mcBuff; // Multicast NVLS buffer address + CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer + char* ucBuff; // Unicast NVLS buffer address + char shareableHandle[NVLS_HANDLE_SIZE]; + size_t ucGran; + int nChannels; + struct ncclShmemCollBuff nvlsShmem; + void *nvlsShmemHandle; +}; + +#endif /* CUDART_VERSION >= 12010 */ + +struct ncclCollNetSharedRes { + int refCount; + int size; + char* cudaBuff; + char* hostBuff; + struct ncclProxyArgs* 
proxyAppend[2*NCCL_MAX_NETDEVS]; + void* resources; + int nChannels; + size_t buffSize; +}; + +struct ncclTransportComm { + ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); + ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); + ncclResult_t (*free)(struct ncclConnector*); + ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); + ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); +}; + +struct ncclTransport { + const char name[8]; + ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); + struct ncclTransportComm send; + struct ncclTransportComm recv; +}; + +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); + +// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange +#define USE_POSIX_FD 1 + +#if USE_POSIX_FD +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +#else +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE +#endif + +ncclResult_t ncclNvlsInit(struct ncclComm* comm); +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsFree(struct ncclComm* comm); + +enum { collNetRecv=0, collNetSend=1 }; +int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); +ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); +ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); +#endif diff --git a/nvls/trees.h b/nvls/trees.h new file mode 100644 index 000000000..ded84a667 --- /dev/null +++ b/nvls/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); + +#endif diff --git a/nvls/tuner.h b/nvls/tuner.h new file mode 100644 index 000000000..d8b275017 --- /dev/null +++ b/nvls/tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_TUNER_H_ +#define NCCL_INT_TUNER_H_ + +#include "nccl_tuner.h" + +// Tuning plugin to override NCCL's default algorithm/protocol tuning. + +// Attempts to load NCCL tuner from environmental variable. +// Returns ncclSuccess if the correct tuner symbol has been found and +// successully loaded. Otherwise returns an error and also logs the error. +ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); + +// Cleans up NCCL tuner plugin. +ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); +#endif diff --git a/nvls/utils.h b/nvls/utils.h new file mode 100644 index 000000000..60f6efb5f --- /dev/null +++ b/nvls/utils.h @@ -0,0 +1,524 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_UTILS_H_ +#define NCCL_UTILS_H_ + +#include "nccl.h" +#include "alloc.h" +#include "checks.h" +#include +#include +#include +#include +#include + +int ncclCudaCompCap(); + +// PCI Bus ID <-> int64 conversion functions +ncclResult_t int64ToBusId(int64_t id, char* busId); +ncclResult_t busIdToInt64(const char* busId, int64_t* id); + +ncclResult_t getBusId(int cudaDev, int64_t *busId); + +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); +uint64_t getHash(const char* string, int n); +uint64_t getHostHash(); +uint64_t getPidHash(); +ncclResult_t getRandomData(void* buffer, size_t bytes); + +struct netIf { + char prefix[64]; + int port; +}; + +int parseStringList(const char* string, struct netIf* ifList, int maxList); +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); + +static long log2i(long n) { + long l = 0; + while (n>>=1) l++; + return l; +} + +inline uint64_t clockNano() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; +} + +/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else + * return -1 */ +inline ncclResult_t getRandomData(void* buffer, size_t bytes) { + ncclResult_t ret = ncclSuccess; + if (bytes > 0) { + const size_t one = 1UL; + FILE* fp = fopen("/dev/urandom", "r"); + if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; + if (fp) fclose(fp); + } + return ret; +} + +//////////////////////////////////////////////////////////////////////////////// + +template +inline void ncclAtomicRefCountIncrement(Int* refs) { + __atomic_fetch_add(refs, 1, 
__ATOMIC_RELAXED); +} + +template +inline Int ncclAtomicRefCountDecrement(Int* refs) { + return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); +} + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that + * granularity of LIFO is not per object, instead frames containing many objects + * are pushed and popped. Therefor deallocation is extremely cheap since its + * done at the frame granularity. + * + * The initial state of the stack is with one frame, the "nil" frame, which + * cannot be popped. Therefor objects allocated in the nil frame cannot be + * deallocated sooner than stack destruction. + */ +struct ncclMemoryStack; + +void ncclMemoryStackConstruct(struct ncclMemoryStack* me); +void ncclMemoryStackDestruct(struct ncclMemoryStack* me); +void ncclMemoryStackPush(struct ncclMemoryStack* me); +void ncclMemoryStackPop(struct ncclMemoryStack* me); +template +T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for + * a pool instance to ever hold objects whose type have differing + * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by + * a backing `ncclMemoryStack` passed during Alloc(). If memory + * backing any currently held object is deallocated then it is an error to do + * anything other than reconstruct it, after which it is a valid empty pool. + */ +struct ncclMemoryPool; + +// Equivalent to zero-initialization +void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); +template +T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); +template +void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); +void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer + * field is given via the `next` template argument. + * + * Example: + * struct Foo { + * struct Foo *next1, *next2; // can be a member of two lists at once + * }; + * ncclIntruQueue list1; + * ncclIntruQueue list2; + */ +template +struct ncclIntruQueue; + +template +void ncclIntruQueueConstruct(ncclIntruQueue *me); +template +bool ncclIntruQueueEmpty(ncclIntruQueue *me); +template +T* ncclIntruQueueHead(ncclIntruQueue *me); +template +void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); +template +T* ncclIntruQueueDequeue(ncclIntruQueue *me); +template +T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" + * and "cond" fields are part of the public interface. + */ +struct ncclThreadSignal { + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); + +void ncclThreadSignalConstruct(struct ncclThreadSignal* me); +void ncclThreadSignalDestruct(struct ncclThreadSignal* me); + +// A convenience instance per-thread. 
+extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc; + +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); +// Enqueue element. Returns true if queue is not abandoned. Even if queue is +// abandoned the element enqueued, so the caller needs to make arrangements for +// the queue to be tended. +template +bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); +// Dequeue all elements at a glance. If there aren't any and `waitSome` is +// true then this call will wait until it can return a non empty list. +template +T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); +// Dequeue all elements and set queue to abandoned state. +template +T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryStack { + struct Hunk { + struct Hunk* above; // reverse stack pointer + size_t size; // size of this allocation (including this header struct) + }; + struct Unhunk { // proxy header for objects allocated out-of-hunk + struct Unhunk* next; + void* obj; + }; + struct Frame { + struct Hunk* hunk; // top of non-empty hunks + uintptr_t bumper, end; // points into top hunk + struct Unhunk* unhunks; + struct Frame* below; + }; + + static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); + static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); + + struct Hunk stub; + struct Frame topFrame; +}; + +inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { + me->stub.above = nullptr; + me->stub.size = 0; + me->topFrame.hunk = &me->stub; + me->topFrame.bumper = 0; + me->topFrame.end = 0; + me->topFrame.unhunks = nullptr; + me->topFrame.below = nullptr; +} + +inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { + uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); + void* obj; + if (__builtin_expect(o + size <= me->topFrame.end, true)) { + me->topFrame.bumper = o + size; + obj = reinterpret_cast(o); + } else { + obj = allocateSpilled(me, size, align); + } + return obj; +} + +template +inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { + void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); + memset(obj, 0, n*sizeof(T)); + return (T*)obj; +} + +inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { + using Frame = ncclMemoryStack::Frame; + Frame tmp = me->topFrame; + Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); + *snapshot = tmp; // C++ struct assignment + me->topFrame.unhunks = nullptr; + me->topFrame.below = snapshot; +} + +inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { + ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; + while (un != nullptr) { + free(un->obj); + un = un->next; + } + me->topFrame = *me->topFrame.below; // C++ struct assignment +} + + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryPool { + struct Cell { + Cell *next; + }; + struct Cell* head; + struct Cell* tail; // meaningful only when head != nullptr +}; + +inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { + me->head = nullptr; +} + +template +inline T* ncclMemoryPoolAlloc(struct 
ncclMemoryPool* me, struct ncclMemoryStack* backing) { + using Cell = ncclMemoryPool::Cell; + Cell* cell; + if (__builtin_expect(me->head != nullptr, true)) { + cell = me->head; + me->head = cell->next; + } else { + // Use the internal allocate() since it doesn't memset to 0 yet. + size_t cellSize = std::max(sizeof(Cell), sizeof(T)); + size_t cellAlign = std::max(alignof(Cell), alignof(T)); + cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); + } + memset(cell, 0, sizeof(T)); + return reinterpret_cast(cell); +} + +template +inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { + using Cell = ncclMemoryPool::Cell; + Cell* cell = reinterpret_cast(obj); + cell->next = me->head; + if (me->head == nullptr) me->tail = cell; + me->head = cell; +} + +inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { + if (from->head != nullptr) { + from->tail->next = me->head; + if (me->head == nullptr) me->tail = from->tail; + me->head = from->head; + from->head = nullptr; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueue { + T *head, *tail; +}; + +template +inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template +inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { + return me->head == nullptr; +} + +template +inline T* ncclIntruQueueHead(ncclIntruQueue *me) { + return me->head; +} + +template +inline T* ncclIntruQueueTail(ncclIntruQueue *me) { + return me->tail; +} + +template +inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { + x->*next = nullptr; + (me->head ? me->tail->*next : me->head) = x; + me->tail = x; +} + +template +inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +template +inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { + T *prev = nullptr; + T *cur = me->head; + bool found = false; + + while (cur) { + if (cur == x) { + found = true; + break; + } + prev = cur; + cur = cur->*next; + } + + if (found) { + if (prev == nullptr) + me->head = cur->*next; + else + prev->*next = cur->*next; + if (cur == me->tail) + me->tail = prev; + } + return found; +} + +template +inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { + T *ans = me->head; + if (ans != nullptr) { + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + } + return ans; +} + +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { + T *head = me->head; + me->head = nullptr; + me->tail = nullptr; + while (head != nullptr) { + T *tmp = head->*next; + ncclMemoryPoolFree(pool, tmp); + head = tmp; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { + return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; +} + +inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { + pthread_mutex_init(&me->mutex, nullptr); + pthread_cond_init(&me->cond, nullptr); +} + +inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { + pthread_mutex_destroy(&me->mutex); + pthread_cond_destroy(&me->cond); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc { + T* head; + uintptr_t tail; + struct ncclThreadSignal* waiting; +}; + +template +void 
ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { + me->head = nullptr; + me->tail = 0x0; + me->waiting = nullptr; +} + +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { + return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; +} + +template +bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { + __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); + T* prev = reinterpret_cast(utail); + T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); + __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); + if (utail == 0x1) { // waiting + __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting + // This lock/unlock is essential to ensure we don't race ahead of the consumer + // and signal the cond before they begin waiting on it. + struct ncclThreadSignal* waiting = me->waiting; + pthread_mutex_lock(&waiting->mutex); + pthread_mutex_unlock(&waiting->mutex); + pthread_cond_broadcast(&waiting->cond); + } + return utail != 0x2; // not abandoned +} + +template +T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { + T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head == nullptr) { + if (!waitSome) return nullptr; + uint64_t t0 = clockNano(); + bool sleeping = false; + do { + if (clockNano()-t0 >= 10*1000) { // spin for first 10us + struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; + pthread_mutex_lock(&waitSignal->mutex); + uintptr_t expected = sleeping ? 0x1 : 0x0; + uintptr_t desired = 0x1; + me->waiting = waitSignal; // release done by successful compare exchange + if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { + sleeping = true; + pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); + } + pthread_mutex_unlock(&waitSignal->mutex); + } + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + } while (head == nullptr); + } + + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + int spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; +} + +template +T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { + uintptr_t expected = 0x0; + if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + return nullptr; + } else { + int spins = 0; + T* head; + while (true) { + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? 
nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; + } +} +#endif From a7b627a7165747cf092d428ddf89790c3e9b9a35 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 22:47:53 +0000 Subject: [PATCH 02/67] a bit cleaned up --- nvls/test.cu | 198 +++++++++++++++++++++++++++------------------------ 1 file changed, 103 insertions(+), 95 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 6e4d39bc8..08a156dfc 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -1,172 +1,180 @@ -#include #include +#include #include #include // #include -#include #include +#include + #include "ipcsocket.cc" -#define CUCHECK(cmd) do { \ - auto err = cmd; \ - if( err != 0 ) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - } \ -} while(false) +#define CUCHECK(cmd) \ + do { \ + auto err = cmd; \ + if (err != 0) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + exit(-1); \ + } \ + } while (false) -//AR kernel snippet for sm_90 only +// AR kernel snippet for sm_90 only -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ - "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ : "memory"); -//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ : "memory"); -__global__ void testing2(float* uc_ptr){ - uc_ptr[0] = 1.0; - printf("ptr -> %f\n", uc_ptr[0]); -} +__global__ void testing2(float* uc_ptr) { uc_ptr[0] = 1.0; } +#define UNROLL 8 +__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { + // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + // line is assumed to be 16B 4 ints of 8 halves + int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; + int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; -__global__ void testing(float* mc_ptr, int numlines, int myrank, int RANKS){ - //for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction - //line is assumed to be 16B 4 ints of 8 halves - const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); - const int end_elem = max(start_elem, numlines); - const int lineoffset = (blockIdx.x * blockDim.x + threadIdx.x) * 4; - const int loop_step0 = (blockDim.x * gridDim.x) * 4; - __syncthreads(); - printf("start %d, end %d step %d\n", start_elem, end_elem, loop_step0); - for (int line = start_elem; line < end_elem; line += loop_step0) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); - printf("val %f\n", *(float*)&(val.x)); - MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); - } - __syncthreads(); + int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } } -int main(){ +int main() { int myrank, nranks; MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); - + cudaSetDevice(myrank); CUresult res; - size_t size = 1024*1024*512*3; + size_t size = 1024 * 1024 * 512; CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - + CUmulticastObjectProp mcProp = {}; mcProp.numDevices = nranks; mcProp.size = size; mcProp.handleTypes = handleType; - + size_t minGran, gran; gran = 0; minGran = 0; CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - // printf("gran = %lld, minGrad = %lld\n", gran, minGran); - size_t mcSize = ((size+gran-1)/gran)*gran; + if (!myrank) printf("gran = %lu, minGrad = %lu\n", gran, minGran); + size_t mcSize = ((size + gran - 1) / gran) * gran; mcProp.size = mcSize; CUmemGenericAllocationHandle handle; - //only one rank creates the multicast object - if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - + // only one rank creates the multicast object + if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + int fd, peerfd; fd = 0; peerfd = 0; - if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - //some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - + if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + // some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the + // exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = { 0 }; - uint64_t opId=0xdeadcafebeef; + struct ncclIpcSocket ipcSock = {0}; + uint64_t opId = 0xdeadcafebeef; // ncclResult_t ret = ncclSuccess; ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); MPI_Barrier(MPI_COMM_WORLD); - if(!myrank) { - for(int p=1;p>>((float*)mc_va); cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - testing<<<1, 1>>>((float*)mc_va, 1, myrank, nranks); + int rept = 10; + int nblocks = 16; + int blocksize = 1024; + // warmup + for (int 
i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + } cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) printf("Time = %f, bw = %f\n", time, size / 1024. / 1024. / 1024. / time); + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); -} +} //........ - - From 4ff8a8954cb36db089222717e327b273cc6c25a1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 23:13:02 +0000 Subject: [PATCH 03/67] clean up --- nvls/test.cu | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 08a156dfc..effd7653b 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -158,22 +158,24 @@ int main() { cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); int rept = 10; - int nblocks = 16; - int blocksize = 1024; - // warmup - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - double st = MPI_Wtime(); - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + for (int input_size = 1024*1024*8; input_size <= size; input_size *= 2){ + int block_size = 1024; + int nblocks = 16; + // warmup + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) printf("input_size %d | Time = %f, alg_bw = %f\n", input_size, time, input_size / 1024. / 1024. / 1024. / time); } - cudaDeviceSynchronize(); - double en = MPI_Wtime(); - double time = (en - st) / rept; - if (!myrank) printf("Time = %f, bw = %f\n", time, size / 1024. / 1024. / 1024. / time); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); } From dfab9fe9e7c42a6baf2450c82c894f37b357d5a1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 3 Jan 2024 03:53:32 +0000 Subject: [PATCH 04/67] nvls connection wip --- include/mscclpp/core.hpp | 6 ++++-- nvls/test.cu | 42 ++++++++++++++++++++++++++++++-------- src/connection.cc | 35 +++++++++++++++++++++++++++++++ src/include/connection.hpp | 15 ++++++++++++++ 4 files changed, 87 insertions(+), 11 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 60494a099..24b54fd33 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -125,6 +125,7 @@ class TcpBootstrap : public Bootstrap { enum class Transport { Unknown, // Unknown transport type. CudaIpc, // CUDA IPC transport type. + Nvls, // NVLS transport type. IB0, // InfiniBand device 0 transport type. IB1, // InfiniBand device 1 transport type. IB2, // InfiniBand device 2 transport type. @@ -136,10 +137,11 @@ enum class Transport { NumTransports // The number of transports. 
}; -const std::string TransportNames[] = {"UNK", "IPC", "IB0", "IB1", "IB2", "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; +const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 10; +const size_t TransportFlagsSize = 11; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. diff --git a/nvls/test.cu b/nvls/test.cu index effd7653b..7bf12a699 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -30,9 +30,22 @@ : "l"(ptr) \ : "memory"); -__global__ void testing2(float* uc_ptr) { uc_ptr[0] = 1.0; } +__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ + uc_ptr[idx] = myrank + idx; + } +} + +__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ + float expected = (float)((nranks * (nranks-1)) / 2 + nranks * idx); + if (abs(uc_ptr[idx] - expected) > 0.01 * expected){ + printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); + } + } +} + -#define UNROLL 8 __global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction // line is assumed to be 16B 4 ints of 8 halves @@ -72,7 +85,7 @@ int main() { CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - if (!myrank) printf("gran = %lu, minGrad = %lu\n", gran, minGran); + if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); size_t mcSize = ((size + gran - 1) / gran) * gran; mcProp.size = mcSize; @@ -154,13 +167,24 @@ int main() { CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); // set access on MC address CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); - testing2<<<1, 1>>>((float*)mc_va); + + int rept = 10; + int block_size = 1024; + int nblocks = 16; + cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - int rept = 10; - for (int input_size = 1024*1024*8; input_size <= size; input_size *= 2){ - int block_size = 1024; - int nblocks = 16; + init_kernel<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + check_correctness<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + + for (int input_size = 1024*3; input_size <= size; input_size *= 2){ // warmup for (int i = 0; i < rept; i++) { testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); @@ -174,7 +198,7 @@ int main() { cudaDeviceSynchronize(); double en = MPI_Wtime(); double time = (en - st) / rept; - if (!myrank) printf("input_size %d | Time = %f, alg_bw = %f\n", input_size, time, input_size / 1024. / 1024. / 1024. 
/ time); + if (!myrank) printf("input_size %d | Time = %f us, alg_bw = %f (GBps)\n", input_size, time*1e6, input_size / 1e9 / time); } MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); diff --git a/src/connection.cc b/src/connection.cc index 834a1456c..f1331e679 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -92,6 +92,41 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { INFO(MSCCLPP_P2P, "CudaIpcConnection flushing connection"); } +// NVLS + +NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints) { + if (localEndpoint.transport() != Transport::Nvls) { + throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); + } + for (auto remoteEndpoint : remoteEndpoints) { + if (remoteEndpoint.transport() != Transport::Nvls) { + throw mscclpp::Error("NVLS connection can only be made to a NVLS endpoint", ErrorCode::InvalidUsage); + } + // sanity check: make sure the IPC connection is being made within a node + if (getImpl(remoteEndpoint)->hostHash_ != getImpl(localEndpoint)->hostHash_) { + std::stringstream ss; + ss << "NVLS connection can only be made within a node: " << std::hex << getImpl(remoteEndpoint)->hostHash_ + << " != " << std::hex << getImpl(localEndpoint)->hostHash_; + throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); + } + } + INFO(MSCCLPP_P2P, "NVLS connection created"); +} + +Transport NvlsConnection::transport() { return Transport::Nvls; } + +Transport NvlsConnection::remoteTransport() { return Transport::Nvls; } + +void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { + throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); +} + +void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64_t) { + throw Error("NVLS does not have a CPU updateAndSync API", ErrorCode::InvalidUsage); +} + +void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 47b154758..5f9108753 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -31,6 +31,21 @@ class CudaIpcConnection : public Connection { void flush(int64_t timeoutUsec) override; }; +class NvlsConnection : public Connection { + public: + NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints); + + Transport transport() override; + + Transport remoteTransport() override; + + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) override; + void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override; + + void flush(int64_t timeoutUsec) override; +}; + class IBConnection : public Connection { Transport transport_; Transport remoteTransport_; From 643f124e4b648f48b2cd9ff9e8a67cbcf1ca48ef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 5 Jan 2024 00:55:39 +0000 Subject: [PATCH 05/67] wip --- src/connection.cc | 47 +++++++++++++++++++++++++++++++++++++- src/include/connection.hpp | 10 +++++++- src/registered_memory.cc | 7 ++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index f1331e679..a2a2f12f9 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,7 +94,9 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS 
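// The rework of this constructor (below) follows the same CUDA multicast recipe
// exercised in nvls/test.cu: query the multicast granularity, have the root rank
// create the multicast object with cuMulticastCreate, back it with a pinned
// physical allocation from cuMemCreate, and map that allocation to a per-rank
// unicast VA via cuMemAddressReserve/cuMemMap/cuMemSetAccess. Exporting the
// multicast handle to the non-root ranks and binding each rank's allocation to
// it (e.g. with cuMulticastBindMem) are not wired up in this patch and are
// assumed to come in later changes.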
-NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints) { +NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, + bool isRoot) + : isRoot_(isRoot) { if (localEndpoint.transport() != Transport::Nvls) { throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); } @@ -110,6 +112,47 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector rem throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); } } + int nDevices = 1 + remoteEndpoints.size(); + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId_)); + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nDevices; + mcProp.size = bufferSize; + mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + size_t minGran = 0; + size_t gran = 0; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + // only root needs to create the multicast handle + if (isRoot_) { + size_t mcSize = ((bufferSize + gran - 1) / gran) * gran; + mcProp.size = mcSize; + + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp)); + } + + // Allocate physical memory + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = cudaDeviceId_; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + // allocate physical memory (data buffer) + MSCCLPP_CUTHROW(cuMemCreate(&memHandle_, bufferSize, &prop, 0 /*flags*/)); + + // usual VA business: map both MC and PA to two different VA addresses + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = cudaDeviceId_; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + // Map a VA to UC space + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, minGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); + // set access on UC address + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); + INFO(MSCCLPP_P2P, "NVLS connection created"); } @@ -127,6 +170,8 @@ void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64 void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } +void* NvlsConnection::getDevicePointer() { return deviceBuffer_; } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 5f9108753..f15283b28 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,8 +32,14 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { + int cudaDeviceId_; + bool isRoot_; + CUmemGenericAllocationHandle mcHandle_; + CUmemGenericAllocationHandle memHandle_; + void* deviceBuffer_; + public: - NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints); + NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); Transport transport() override; @@ -44,6 +50,8 @@ class NvlsConnection : public Connection { void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override; void flush(int64_t timeoutUsec) override; + + 
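// Returns the per-rank unicast VA (deviceBuffer_) that the constructor mapped
// over the physical allocation; the multicast mapping itself is not exposed
// through this accessor in this patch.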
void* getDevicePointer(); }; class IBConnection : public Connection { diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 6d5fd79f5..043268821 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -20,6 +20,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, hostHash(getHostHash()), pidHash(getPidHash()), transports(transports) { + // CUDA IPC if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; @@ -34,6 +35,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr; this->transportInfos.push_back(transportInfo); } + + // IB if ((transports & AllIBTransports).any()) { auto addIb = [&](Transport ibTransport) { TransportInfo transportInfo; @@ -54,6 +57,10 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, if (transports.has(Transport::IB6)) addIb(Transport::IB6); if (transports.has(Transport::IB7)) addIb(Transport::IB7); } + + // NVLS + // if ((transports.has(Transport::NVLS))) { + // } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} From be04f9e72c665013f600f654d63a617fc767146d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 5 Jan 2024 02:27:45 +0000 Subject: [PATCH 06/67] wip --- src/include/registered_memory.hpp | 3 +++ src/registered_memory.cc | 30 ++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 11cd30231..62594a855 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -27,6 +27,9 @@ struct TransportInfo { const IbMr* ibMr; IbMrInfo ibMrInfo; }; + struct { + int fileDesciptor; + }; }; }; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 043268821..f41cfda0a 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -59,8 +59,19 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, } // NVLS - // if ((transports.has(Transport::NVLS))) { - // } + if ((transports.has(Transport::Nvls))) { + if (size != sizeof(CUmemGenericAllocationHandle)) { + throw mscclpp::Error("data must be an element of type CUmemGenericAllocationHandle", ErrorCode::InvalidUsage); + } + if ((transports & AllIBTransports).any() || (transports.has(Transport::CudaIpc))) { + throw mscclpp::Error("NVLS transport can only be used by itself", ErrorCode::InvalidUsage); + } + TransportInfo transportInfo; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesciptor, + *reinterpret_cast(data), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + this->transportInfos.push_back(transportInfo); + } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} @@ -95,6 +106,9 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); + } else if (entry.transport == Transport::Nvls) { + std::copy_n(reinterpret_cast(&entry.fileDesciptor), sizeof(entry.fileDesciptor), + std::back_inserter(result)); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -136,6 +150,9 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { 
std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; + } else if (transportInfo.transport == Transport::Nvls) { + std::copy_n(it, sizeof(transportInfo.fileDesciptor), reinterpret_cast(&transportInfo.fileDesciptor)); + it += sizeof(transportInfo.fileDesciptor); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -156,6 +173,12 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); this->data = static_cast(base) + entry.cudaIpcOffsetFromBase; INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data); + } else if (transports.has(Transport::Nvls) && getHostHash() == this->hostHash) { + auto entry = getTransportInfo(Transport::Nvls); + this->data = new CUmemGenericAllocationHandle; + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(reinterpret_cast(this->data), + reinterpret_cast(entry.fileDesciptor), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); } else { // No valid data pointer can be set this->data = nullptr; @@ -174,6 +197,9 @@ RegisteredMemory::Impl::~Impl() { } data = nullptr; } + if (data && transports.has(Transport::Nvls)) { + delete reinterpret_cast(this->data); + } } const TransportInfo& RegisteredMemory::Impl::getTransportInfo(Transport transport) const { From 2211a1456ffac05d3dd13176fec1a1d853a236bf Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 5 Jan 2024 11:44:13 -0800 Subject: [PATCH 07/67] restoring registered memory changes --- src/include/registered_memory.hpp | 3 --- src/registered_memory.cc | 33 ------------------------------- 2 files changed, 36 deletions(-) diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 62594a855..11cd30231 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -27,9 +27,6 @@ struct TransportInfo { const IbMr* ibMr; IbMrInfo ibMrInfo; }; - struct { - int fileDesciptor; - }; }; }; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index f41cfda0a..6d5fd79f5 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -20,7 +20,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, hostHash(getHostHash()), pidHash(getPidHash()), transports(transports) { - // CUDA IPC if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; @@ -35,8 +34,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr; this->transportInfos.push_back(transportInfo); } - - // IB if ((transports & AllIBTransports).any()) { auto addIb = [&](Transport ibTransport) { TransportInfo transportInfo; @@ -57,21 +54,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, if (transports.has(Transport::IB6)) addIb(Transport::IB6); if (transports.has(Transport::IB7)) addIb(Transport::IB7); } - - // NVLS - if ((transports.has(Transport::Nvls))) { - if (size != sizeof(CUmemGenericAllocationHandle)) { - throw mscclpp::Error("data must be an element of type CUmemGenericAllocationHandle", ErrorCode::InvalidUsage); - } - if ((transports & AllIBTransports).any() || (transports.has(Transport::CudaIpc))) { - throw mscclpp::Error("NVLS transport can only be used by itself", ErrorCode::InvalidUsage); - } - TransportInfo 
transportInfo; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesciptor, - *reinterpret_cast(data), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - this->transportInfos.push_back(transportInfo); - } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} @@ -106,9 +88,6 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); - } else if (entry.transport == Transport::Nvls) { - std::copy_n(reinterpret_cast(&entry.fileDesciptor), sizeof(entry.fileDesciptor), - std::back_inserter(result)); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -150,9 +129,6 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; - } else if (transportInfo.transport == Transport::Nvls) { - std::copy_n(it, sizeof(transportInfo.fileDesciptor), reinterpret_cast(&transportInfo.fileDesciptor)); - it += sizeof(transportInfo.fileDesciptor); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -173,12 +149,6 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); this->data = static_cast(base) + entry.cudaIpcOffsetFromBase; INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data); - } else if (transports.has(Transport::Nvls) && getHostHash() == this->hostHash) { - auto entry = getTransportInfo(Transport::Nvls); - this->data = new CUmemGenericAllocationHandle; - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(reinterpret_cast(this->data), - reinterpret_cast(entry.fileDesciptor), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); } else { // No valid data pointer can be set this->data = nullptr; @@ -197,9 +167,6 @@ RegisteredMemory::Impl::~Impl() { } data = nullptr; } - if (data && transports.has(Transport::Nvls)) { - delete reinterpret_cast(this->data); - } } const TransportInfo& RegisteredMemory::Impl::getTransportInfo(Transport transport) const { From f5acce87f81b32c09f8611ee9809dada3598f27b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 10 Jan 2024 20:15:03 -0800 Subject: [PATCH 08/67] wip --- include/mscclpp/core.hpp | 21 ++++++++++++++++++--- src/core.cc | 2 ++ src/endpoint.cc | 27 +++++++++++++++++++++++++++ src/include/endpoint.hpp | 7 +++++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 24b54fd33..1d12a4083 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -125,7 +125,8 @@ class TcpBootstrap : public Bootstrap { enum class Transport { Unknown, // Unknown transport type. CudaIpc, // CUDA IPC transport type. - Nvls, // NVLS transport type. + NvlsRoot, // NVLS for root transport type. + NvlsNonRoot, // NVLS for non-root transport type. IB0, // InfiniBand device 0 transport type. IB1, // InfiniBand device 1 transport type. IB2, // InfiniBand device 2 transport type. @@ -137,11 +138,11 @@ enum class Transport { NumTransports // The number of transports. 
}; -const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", +const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", "IB0", "IB1", "IB2", "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 11; +const size_t TransportFlagsSize = 13; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. @@ -460,6 +461,8 @@ struct EndpointConfig { int ibMaxSendWr = DefaultMaxSendWr; int ibMaxWrPerSend = DefaultMaxWrPerSend; + size_t nvlsBufferSize; + /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -467,6 +470,15 @@ struct EndpointConfig { /// /// @param transport The transport to use. EndpointConfig(Transport transport) : transport(transport) {} + + /// Constructor for NVLS explicitly + /// @param transport must be either NvlsRoot or NvlsNonRoot + /// @param nvlsBufferSize is the buffer to be alloced on each device + EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) { + if (!AllNvlsTransports.has(transport)) { + throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); + } + } }; /// Represents a context for communication. This provides a low-level interface for forming connections in use-cases @@ -688,6 +700,9 @@ extern const TransportFlags NoTransports; /// A constant TransportFlags object representing all InfiniBand transports. extern const TransportFlags AllIBTransports; +/// A constant TransportFlags object representing all NVLS transports. +extern const TransportFlags AllNvlsTransports; + /// A constant TransportFlags object representing all transports. 
extern const TransportFlags AllTransports; diff --git a/src/core.cc b/src/core.cc index 4d89250d0..84faf4783 100644 --- a/src/core.cc +++ b/src/core.cc @@ -87,6 +87,8 @@ const TransportFlags NoTransports = TransportFlags(); const TransportFlags AllIBTransports = Transport::IB0 | Transport::IB1 | Transport::IB2 | Transport::IB3 | Transport::IB4 | Transport::IB5 | Transport::IB6 | Transport::IB7; +const TransportFlags AllNvlsTransports = Transport::NvlsNonRoot | Transport::NvlsRoot; + const TransportFlags AllTransports = AllIBTransports | Transport::CudaIpc; void Setuppable::beginSetup(std::shared_ptr) {} diff --git a/src/endpoint.cc b/src/endpoint.cc index dbc773898..350cba07e 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -16,6 +16,23 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) ->createQp(config.ibMaxCqSize, config.ibMaxCqPollNum, config.ibMaxSendWr, 0, config.ibMaxWrPerSend); ibQpInfo_ = ibQp_->getInfo(); } + + if (AllNvlsTransports.has(transport_)) { + minMcGran_ = 0; + mcGran_ = 0; + mcProp_.size = config.nvlsBufferSize; + mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + // create the mc handle now only on the root + if (transport_ == Transport::NvlsRoot){ + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); + + fileDesc_ = 0; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&fileDesc_, handle, handleType, 0 /*flags*/)); + } + } } MSCCLPP_API_CPP Transport Endpoint::transport() { return pimpl_->transport_; } @@ -27,6 +44,10 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { if (AllIBTransports.has(pimpl_->transport_)) { std::copy_n(reinterpret_cast(&pimpl_->ibQpInfo_), sizeof(pimpl_->ibQpInfo_), std::back_inserter(data)); } + + if (transport_ == Transport::NvlsRoot) { + std::copy_n(reinterpret_cast(&pimpl_->fileDesc_), sizeof(pimpl_->fileDesc_), std::back_inserter(data)); + } return data; } @@ -45,6 +66,12 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(ibQpInfo_), reinterpret_cast(&ibQpInfo_)); it += sizeof(ibQpInfo_); } + if (transport_ == Transport::NvlsNonRoot) { + fileDesc_ = 0; + std::copy_n(it, sizeof(fileDesc_), reinterpret_cast(&fileDesc_)); + it += sizeof(fileDesc_); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)fileDesc_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + } } MSCCLPP_API_CPP Endpoint::Endpoint(std::shared_ptr pimpl) : pimpl_(pimpl) {} diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 311fa9982..00322674e 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -22,6 +22,13 @@ struct Endpoint::Impl { bool ibLocal_; IbQp* ibQp_; IbQpInfo ibQpInfo_; + + // These are only defined for multicast (NVLS) capability + CUmulticastObjectProp mcProp_; + CUmemGenericAllocationHandle mcHandle_; + size_t minMcGran_; + size_t mcGran_; + int fileDesc_; }; } // namespace mscclpp From 985d8c78aa024655f2cf47492a94956c08cd765b Mon Sep 17 00:00:00 2001 From: "Saeed Maleki (saemal)" Date: Sun, 14 Jan 2024 17:20:53 -0800 Subject: [PATCH 09/67] testing pid_getfd --- nvls/test.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nvls/test.cu b/nvls/test.cu index 7bf12a699..48da87eef 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -120,9 +120,16 @@ 
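// The hunk below experiments with sharing the exported POSIX fd without a Unix
// domain socket: rank 0 broadcasts its pid and fd over MPI and the other ranks
// duplicate that fd into their own process before importing the multicast
// handle. Note that pidfd_getfd() expects a pidfd obtained from pidfd_open(),
// not a raw pid, so the intended sequence (assuming Linux >= 5.6 and ptrace
// permission over rank 0's process) is roughly:
//   int pidFd  = syscall(SYS_pidfd_open, currentPid, 0);
//   int peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0);
//   cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType);
// which is what the follow-up "removing ipc dep" patch switches to.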
int main() { } ncclIpcSocketClose(&ipcSock); + pid_t currentPid = getpid(); + MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); // everyone else would now have same multicast object if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerfd, handleType)); + int peerFd = 0; + if (myrank) peerFd = pidfd_getfd(currendPid, fd, 0); + printf("peerFd = %d\n", peerFd); // if(myrank) // close(peerfd); From 5cc805ba980acf0661cf397e1b20e93108c464e1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 15 Jan 2024 02:12:21 +0000 Subject: [PATCH 10/67] removing ipc dep --- nvls/test.cu | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 48da87eef..bbbc3e391 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -2,11 +2,10 @@ #include #include #include -// #include #include #include - -#include "ipcsocket.cc" +#include +#include #define CUCHECK(cmd) \ do { \ @@ -104,32 +103,17 @@ int main() { // exported handles // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = {0}; - uint64_t opId = 0xdeadcafebeef; - // ncclResult_t ret = ncclSuccess; - - ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); - MPI_Barrier(MPI_COMM_WORLD); - if (!myrank) { - for (int p = 1; p < nranks; p++) { - ncclIpcSocketSendFd(&ipcSock, fd, p, (uint64_t)opId); - } - } else { - ncclIpcSocketRecvFd(&ipcSock, &peerfd); - } - ncclIpcSocketClose(&ipcSock); - pid_t currentPid = getpid(); MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + int pidFd = syscall(SYS_pidfd_open, currentPid, 0); // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); // everyone else would now have same multicast object - if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerfd, handleType)); int peerFd = 0; - if (myrank) peerFd = pidfd_getfd(currendPid, fd, 0); - printf("peerFd = %d\n", peerFd); + peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); + if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType)); + MPI_Barrier(MPI_COMM_WORLD); // if(myrank) // close(peerfd); From abfba07519b4df93414b154ac384c8ec1a28f8b0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 15 Jan 2024 02:12:48 +0000 Subject: [PATCH 11/67] clean up --- nvls/align.h | 47 -- nvls/alloc.h | 270 ----------- nvls/argcheck.h | 16 - nvls/bootstrap.h | 32 -- nvls/channel.h | 48 -- nvls/checks.h | 160 ------- nvls/coll_net.h | 35 -- nvls/collectives.h | 48 -- nvls/comm.h | 473 ------------------- nvls/core.h | 41 -- nvls/cpuset.h | 61 --- nvls/cudawrap.h | 129 ------ nvls/debug.h | 48 -- nvls/device.h | 463 ------------------- nvls/enqueue.h | 26 -- nvls/gdrwrap.h | 252 ----------- nvls/graph.h | 116 ----- nvls/group.h | 137 ------ nvls/ibvcore.h | 1058 ------------------------------------------- nvls/ibvsymbols.h | 46 -- nvls/ibvwrap.h | 92 ---- nvls/info.h | 134 ------ nvls/ipcsocket.cc | 232 ---------- nvls/ipcsocket.h | 38 -- nvls/nccl_common.h | 33 -- nvls/nccl_net.h | 333 -------------- nvls/nccl_tuner.h | 55 --- nvls/net.h | 27 -- nvls/net_device.h | 29 -- nvls/nvmlwrap.h | 214 --------- nvls/nvtx.h | 85 ---- nvls/p2p.h | 29 -- nvls/param.h | 30 -- nvls/profiler.h | 37 -- nvls/proxy.h | 296 ------------ 
nvls/shm.h | 25 - nvls/socket.h | 97 ---- nvls/strongstream.h | 140 ------ nvls/timer.h | 60 --- nvls/transport.h | 128 ------ nvls/trees.h | 13 - nvls/tuner.h | 22 - nvls/utils.h | 524 --------------------- 43 files changed, 6179 deletions(-) delete mode 100644 nvls/align.h delete mode 100644 nvls/alloc.h delete mode 100644 nvls/argcheck.h delete mode 100644 nvls/bootstrap.h delete mode 100644 nvls/channel.h delete mode 100644 nvls/checks.h delete mode 100644 nvls/coll_net.h delete mode 100644 nvls/collectives.h delete mode 100644 nvls/comm.h delete mode 100644 nvls/core.h delete mode 100644 nvls/cpuset.h delete mode 100644 nvls/cudawrap.h delete mode 100644 nvls/debug.h delete mode 100644 nvls/device.h delete mode 100644 nvls/enqueue.h delete mode 100644 nvls/gdrwrap.h delete mode 100644 nvls/graph.h delete mode 100644 nvls/group.h delete mode 100644 nvls/ibvcore.h delete mode 100644 nvls/ibvsymbols.h delete mode 100644 nvls/ibvwrap.h delete mode 100644 nvls/info.h delete mode 100644 nvls/ipcsocket.cc delete mode 100644 nvls/ipcsocket.h delete mode 100644 nvls/nccl_common.h delete mode 100644 nvls/nccl_net.h delete mode 100644 nvls/nccl_tuner.h delete mode 100644 nvls/net.h delete mode 100644 nvls/net_device.h delete mode 100644 nvls/nvmlwrap.h delete mode 100644 nvls/nvtx.h delete mode 100644 nvls/p2p.h delete mode 100644 nvls/param.h delete mode 100644 nvls/profiler.h delete mode 100644 nvls/proxy.h delete mode 100644 nvls/shm.h delete mode 100644 nvls/socket.h delete mode 100644 nvls/strongstream.h delete mode 100644 nvls/timer.h delete mode 100644 nvls/transport.h delete mode 100644 nvls/trees.h delete mode 100644 nvls/tuner.h delete mode 100644 nvls/utils.h diff --git a/nvls/align.h b/nvls/align.h deleted file mode 100644 index 2a71dd1bc..000000000 --- a/nvls/align.h +++ /dev/null @@ -1,47 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALIGN_H_ -#define NCCL_ALIGN_H_ - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) - -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_POWER(x, y) \ - ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -#if !__CUDA_ARCH__ - #ifndef __host__ - #define __host__ - #endif - #ifndef __device__ - #define __device__ - #endif -#endif - -template -__host__ __device__ constexpr Z divUp(X x, Y y) { - return (x+y-1)/y; -} - -template -__host__ __device__ constexpr Z roundUp(X x, Y y) { - return (x+y-1) - (x+y-1)%y; -} - -// assumes second argument is a power of 2 -template -__host__ __device__ constexpr Z alignUp(X x, int a) { - return (x+a-1) & Z(-a); -} - -#endif diff --git a/nvls/alloc.h b/nvls/alloc.h deleted file mode 100644 index f8d954469..000000000 --- a/nvls/alloc.h +++ /dev/null @@ -1,270 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALLOC_H_ -#define NCCL_ALLOC_H_ - -#include "nccl.h" -#include "checks.h" -#include "align.h" -#include "utils.h" -#include "p2p.h" -#include -#include -#include -#include - -uint64_t clockNano(); // from utils.h with which we have a circular dependency - -template -ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - memset(*ptr, 0, nelem*sizeof(T)); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} - -template -ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} -#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { - if (nelem < oldNelem) return ncclInternalError; - if (nelem == oldNelem) return ncclSuccess; - - T* oldp = *ptr; - T* p = (T*)malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memcpy(p, oldp, oldNelem*sizeof(T)); - free(oldp); - memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); - *ptr = (T*)p; - INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); - return ncclSuccess; -} - -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { - ncclResult_t result = ncclSuccess; - size_t granularity = 0; - CUdevice currentDev; - CUmemAllocationProp prop = {}; - CUmemAccessDesc accessDesc = {}; - CUmemGenericAllocationHandle handle; - int cudaDev; - int flag = 0; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported - prop.location.id = currentDev; - // Query device to see if RDMA support is available - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); - if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; - CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - ALIGN_SIZE(size, granularity); - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &prop, 0)); - /* Reserve a virtual address range */ - 
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - /* Now allow RW access to the newly mapped memory */ - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = currentDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); - if (handlep) *handlep = handle; - TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); - return result; -} - -static inline ncclResult_t ncclCuMemFree(void *ptr) { - if (ptr == NULL) return ncclSuccess; - ncclResult_t result = ncclSuccess; - CUmemGenericAllocationHandle handle; - size_t size = 0; - CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CUCHECK(cuMemRelease(handle)); - CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); - CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CUCHECK(cuMemRelease(handle)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - return result; -} - -#else - -extern int ncclCuMemEnable(); - -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { - WARN("CUMEM not supported prior to CUDA 11.3"); - return ncclInternalError; -} -static inline ncclResult_t ncclCuMemFree(void *ptr) { - WARN("CUMEM not supported prior to CUDA 11.3"); - return ncclInternalError; -} - -#endif - -template -ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaCalloc(...) 
ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -ncclResult_t ncclCudaFree(T* ptr) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); - } else { - CUDACHECKGOTO(cudaFree(ptr), result, finish); - } -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -// Allocate memory to be potentially ibv_reg_mr'd. This needs to be -// allocated on separate pages as those pages will be marked DONTFORK -// and if they are shared, that could cause a crash in a child process -inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return ncclSystemError; - memset(p, 0, size); - *ptr = p; - INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); - return ncclSuccess; -} -#define ncclIbMalloc(...) 
ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -#endif diff --git a/nvls/argcheck.h b/nvls/argcheck.h deleted file mode 100644 index 8d8b74e8e..000000000 --- a/nvls/argcheck.h +++ /dev/null @@ -1,16 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ARGCHECK_H_ -#define NCCL_ARGCHECK_H_ - -#include "core.h" -#include "info.h" - -ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); -ncclResult_t ArgsCheck(struct ncclInfo* info); - -#endif diff --git a/nvls/bootstrap.h b/nvls/bootstrap.h deleted file mode 100644 index 400a479fb..000000000 --- a/nvls/bootstrap.h +++ /dev/null @@ -1,32 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_BOOTSTRAP_H_ -#define NCCL_BOOTSTRAP_H_ - -#include "nccl.h" -#include "comm.h" - -struct ncclBootstrapHandle { - uint64_t magic; - union ncclSocketAddress addr; -}; -static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); - -ncclResult_t bootstrapNetInit(); -ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); -ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); -ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); -ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); -ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); -ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); -ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); -ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); -ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); -ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); -ncclResult_t bootstrapClose(void* commState); -ncclResult_t bootstrapAbort(void* commState); -#endif diff --git a/nvls/channel.h b/nvls/channel.h deleted file mode 100644 index adc38749a..000000000 --- a/nvls/channel.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CHANNEL_H_ -#define NCCL_CHANNEL_H_ -#include "comm.h" - -ncclResult_t initChannel(struct ncclComm* comm, int channelid); -ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); -ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); -ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); -static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { - int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; - int peerNode = comm->rankToNode[peer]; - int peerIndex = comm->rankToLocalRank[peer]; - int nsteps = comm->maxLocalRanks; - int rankIndex = comm->rankToLocalRank[comm->rank]; - int step, delta; - if (coll == ncclFuncSend) { - step = (nsteps + peerIndex - rankIndex)%nsteps; - delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; - } else if (coll == ncclFuncRecv) { - step = (nsteps + rankIndex - peerIndex)%nsteps; - delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; - } else { - return ncclInternalError; - } - *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; - return ncclSuccess; -} - -static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; - *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; - return ncclSuccess; -} - -static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { - int base; - NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); - NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); - return ncclSuccess; -} - -#endif diff --git a/nvls/checks.h b/nvls/checks.h deleted file mode 100644 index c9fd16176..000000000 --- a/nvls/checks.h +++ /dev/null @@ -1,160 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CHECKS_H_ -#define NCCL_CHECKS_H_ - -#include "debug.h" - -// Check CUDA RT calls -#define CUDACHECK(cmd) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, RES, label) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - RES = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -// Report failure but clear error and continue -#define CUDACHECKIGNORE(cmd) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ - (void) cudaGetLastError(); \ - } \ -} while(false) - -#include -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -#define SYSCHECKGOTO(statement, RES, label) do { \ - if ((statement) == -1) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -#define NEQCHECK(statement, value) do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (0); - -#define NEQCHECKGOTO(statement, value, RES, label) do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -#define EQCHECK(statement, value) do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (0); - -#define EQCHECKGOTO(statement, value, RES, label) do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - return RES; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, RES, label) do { \ - RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - goto label; \ - } \ -} while (0); - -#define NCCLWAIT(call, cond, abortFlagPtr) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - 
ncclResult_t RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - return ncclInternalError; \ - } \ - if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ -} while (!(cond)); - -#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - goto label; \ - } \ - if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ -} while (!(cond)); - -#define NCCLCHECKTHREAD(a, args) do { \ - if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ - return args; \ - } \ -} while(0) - -#define CUDACHECKTHREAD(a) do { \ - if ((a) != cudaSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - -#endif diff --git a/nvls/coll_net.h b/nvls/coll_net.h deleted file mode 100644 index f4b540866..000000000 --- a/nvls/coll_net.h +++ /dev/null @@ -1,35 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COLL_NET_H_ -#define COLL_NET_H_ - -#include "nccl.h" -#include "nccl_net.h" - -typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; - -// Translation to external API -static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } -static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } -static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } -static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } -/* DMA-BUF support */ -static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } -static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } -static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* 
collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } -static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } -static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } - -static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } - -#endif diff --git a/nvls/collectives.h b/nvls/collectives.h deleted file mode 100644 index 0f965276a..000000000 --- a/nvls/collectives.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_COLLECTIVES_H_ -#define NCCL_COLLECTIVES_H_ - -#include "nccl.h" - -// CHUNKSIZE must be a multiple of SLICESIZE -#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) -#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) -#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) -#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) -#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) -#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) -#define BROADCAST_SLICESTEPS 1 -#define BROADCAST_CHUNKSTEPS 1 -#define REDUCE_SLICESTEPS 1 -#define REDUCE_CHUNKSTEPS 1 -#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above - -inline int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - #endif - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -#endif diff --git a/nvls/comm.h b/nvls/comm.h deleted file mode 100644 index 328ffef3b..000000000 --- a/nvls/comm.h +++ /dev/null @@ -1,473 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_COMM_H_ -#define NCCL_COMM_H_ - -#include "transport.h" -#include "p2p.h" -#include "collectives.h" -#include "nccl_tuner.h" -#include "proxy.h" -#include "strongstream.h" -#include "nccl_net.h" - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL - -// Channels / LL tuning -#define NCCL_LL_THREAD_THRESHOLD 8 -#define NCCL_LL128_THREAD_THRESHOLD 8 -#define NCCL_SIMPLE_THREAD_THRESHOLD 64 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - uint64_t redOpArgExchange[2]; - char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; - int offsFifo[NCCL_STEPS]; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - int offsFifo[NCCL_STEPS]; - int flush; // For GDRCopy-based flush - }; - char pad4[MEM_ALIGN]; - }; -}; - -enum helperThreadState {ThreadStart, ThreadStop}; - -#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) - -struct ncclGraphHelperResources { - ncclComm* comm; - pthread_mutex_t threadLock; - pthread_cond_t threadCond; - enum helperThreadState threadState; - void* ipcBases[NCCL_IPC_POOL_SIZE]; - int ipcTail; - int ipcHead; -}; - -struct ncclUserRedOp { - int freeNext; // -1=allocated, otherwise index of next free entry in array - ncclDataType_t datatype; - ncclDevRedOpFull opFull; -}; - -struct ncclNodeRanks { - int localRanks; - int* localRankToRank; -}; - -struct ncclDestructor { - struct ncclDestructor* next; - void* obj; - ncclResult_t(*fn)(struct ncclDestructor* me); -}; - -struct ncclCommCallback { - struct ncclCommCallback* next; - ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); -}; - -struct ncclSharedResources { - int refCount; - struct ncclComm* owner; /* comm which creates this shared res. 
*/ - struct ncclChannelPeer* peers[MAXCHANNELS]; - struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; - /* P2P operation counter, one per channel */ - uint64_t p2pOpCount[MAXCHANNELS]; - /* Collective operation counter */ - uint64_t collOpCount; - int tpNRanks; - int tpNLocalRanks; - int tpNChannels; - int tpP2pNChannels; - int tpP2pChunkSize; - uint64_t magic; - - // top parent rank to localRank translation table - int* tpRankToLocalRank; - // Internal streams - struct ncclStrongStream deviceStream, hostStream; - - /* proxy related shared res */ - struct ncclProxyState* proxyState; -}; - -struct ncclChannel { - struct ncclChannelPeer** peers; - struct ncclDevChannelPeer** devPeers; - /* devPeer pointer array used for host side access */ - struct ncclDevChannelPeer** devPeersHostPtr; - struct ncclRing ring; - int* devRingUserRanks; - struct ncclTree tree; - - struct ncclTree collnetChain; - struct ncclDirect collnetDirect; - - struct ncclNvls nvls; - - int id; // index of this channel - uint32_t workFifoSent; // last used work index+1 - - /* comm split sharable resources */ - struct ncclChannelPeer* collnetPeers; - struct ncclDevChannelPeer* collnetDevPeers; - struct ncclChannelPeer* nvlsPeers; - struct ncclDevChannelPeer* nvlsDevPeers; -}; - -struct ncclWorkList { - struct ncclWorkList* next; - struct ncclWork work; -}; - -struct ncclPointerList { - struct ncclPointerList* next; - void *ptr; -}; - -struct ncclNvlsMcHandleList { - struct ncclNvlsMcHandleList *next; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; -}; - -struct ncclKernelPlan { - // A kernel plan is also a callback that reclaims itself. Hence this must - // be the first member. - struct ncclCommCallback reclaimer; - struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup - - struct ncclComm* comm; - struct ncclKernelPlan* next; - - bool persistent; // aka captured in a graph - bool kernelSpecialized; - void *kernelFn; - int channelUbound; // only channels c < channelUbound are present - int channelCount; // number of channels present - uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) - bool hasProxyOps; // does any channel have a non-empty proxyOpQueue - int threadPerBlock; - // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() - struct ncclWork* workHead; - - int collOpCount; // zero based for this plan - - struct ncclIntruQueue ipcMemQueue; - struct ncclIntruQueue nvlsMcHandleQueue; - - struct Channel { - int nWork; - union { - int nWorkElem; // used for coll and reg coll - int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 - }; - size_t collBytes; - struct ncclIntruQueue workQueue; - struct ncclIntruQueue proxyOpQueue; - } channels[MAXCHANNELS]; -}; - -struct ncclRegRequest { - uintptr_t buff; - size_t size; - struct ncclRegRequest *next; -}; - -struct ncclRegRecord { - uintptr_t buff; - size_t size; - CUdeviceptr regAddr; - size_t regSize; - int dev; - CUmemGenericAllocationHandle mcHandle; - uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */ - struct ncclRegRecord *next; -}; - -struct ncclComm { - struct ncclMemoryStack memPermanent, memScoped; - // List of destructors to run when comm is destructed - struct ncclDestructor* destructorHead; - - struct ncclSharedResources* sharedRes; - /* map to top parent ranks. 
*/ - int* topParentRanks; - int* topParentLocalRanks; - struct ncclChannel channels[MAXCHANNELS]; - struct ncclPeerInfo* peerInfo; - struct ncclTopoSystem* topo; - - ncclNet_t* ncclNet; - ncclCollNet_t* ncclCollNet; - void* bootstrap; - // Bitmasks for ncclTransportP2pSetup - uint64_t* connectSend; - uint64_t* connectRecv; - - uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. - - uint64_t commHash; - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my nvml device index - int compCap; // compute capability of the GPU - int minCompCap, maxCompCap; // min/max compute capability in the communicator - int64_t busId; // my PCI bus ID in int format - cpu_set_t cpuAffinity; // CPU affinity of the GPU - int cudaArch; // matches __CUDA_ARCH__ of device - - int node; - int nNodes; - int localRank; - int localRanks; - int maxLocalRanks; - int* rankToNode; - int* rankToLocalRank; - int* localRankToRank; - // localRanks and localRanktoRank for all nodes - struct ncclNodeRanks* nodeRanks; - - bool checkPointers; - bool dmaBufSupport; - - // Counter for tracking CUDA launches (P2P and collectives included) - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nvlsChannels; - int collNetChannels; - // Channels (per peer) for p2p - int p2pnChannels; - int p2pnChannelsPerPeer; - int p2pChannels[MAXCHANNELS]; - - // Should this comm allocate LL buffers for network P2P connections? - bool allocP2pNetLLBuffers; - - // Buffer sizes - int buffSizes[NCCL_NUM_PROTOCOLS]; - int p2pChunkSize; - - // Algorithm/Protocols thresholds - ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; - int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - - /* This attribute can indicate the states of communicators and return code of - * asynchronous NCCL operations. */ - ncclResult_t asyncResult; - - // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; - volatile uint32_t *childAbortFlag; - uint32_t *abortFlagRefCount; - - // Device side of the communicator (for cudaFree's) - struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - - // Operation pool. - int workFifoDepth; // size of workFifoHeap[], power of 2 - struct ncclWork* workFifoHeap; - struct ncclWork* devWorkFifoHeap; - void* workFifoHeapGdrHandle; - - // Work completion notificaion - uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory - uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. - uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
- - // Intra-process sync - struct ncclComm* intraComm0; // leader of intra-process comms (self possible) - struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head - int intraRank; - int intraRanks; - uint32_t intraBarrierPhase; - char intraPad1[64 - sizeof(uint64_t)]; - uint64_t intraBarrierCounter; // only used if this is intraComm0 - char intraPad2[64 - sizeof(uint64_t)]; - uint64_t intraBarrierGate; // only used if this is intraComm0 - - struct ncclProxyState* proxyState; - int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ - // Whether this communicator uses collNet - int collNetSupport; - uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; - int intraHighestTransportType; - int* collNetHeads; - int collNetHeadsNum; - /* sharable collNet proxy progress resource. */ - struct ncclCollNetSharedRes* collNetSharedRes; - - // NVLink SHARP (NVLS) support - int nvlsSupport; - int nvlsRegSupport; - /* sharable NVLS resource. */ - struct ncclNvlsSharedRes* nvlsResources; - - ssize_t channelSize; // User requested work size (bytes) for channel partitions - - // pools backed by comm->memPermanent - struct ncclMemoryPool memPool_ncclProxyOp; - struct ncclMemoryPool memPool_ncclKernelPlan; - struct ncclMemoryPool memPool_ncclPointerList; - struct ncclMemoryPool memPool_ncclNvlsHandleList; - // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when - // this comm is not yet in a group. - struct ncclComm* groupNext; - // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. - struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - struct ncclTasks tasks; - - // user-created reduction ops - int userRedOpCapacity, userRedOpFreeHead; - ncclUserRedOp *userRedOps; - - // Queue of things for the main thread to do - struct ncclIntruQueueMpsc callbackQueue; - - // List of kernel plans built form tasks. - struct ncclIntruQueue planQueue; - // First of the unlaunched kernels in `planQueue` - struct ncclKernelPlan* unlaunchedPlansHead; - - ncclConfig_t config; - // initState is to more conveniently reclaim resources when errors happen. 
- ncclResult_t initState; - // flag to indicate if ncclCommFinalize() is called - bool finalizeCalled; - // shared structures for finalization - int finalizeRankCnt; - // group job to support multi-thread FT - struct ncclGroupJob *groupJob; - - /* store to buffer register request */ - struct ncclIntruQueue regRequestQueue; - /* store registered buffer */ - struct ncclIntruQueue regRecordQueue; - - // Tuning plugin - ncclTuner_t* tuner; -}; - -enum ncclLaunchMode { - ncclLaunchModeInvalid=0, - ncclLaunchModeParallel, - ncclLaunchModeGroup -}; -extern enum ncclLaunchMode ncclParamLaunchMode; - -void ncclCommPushFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); - -inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { - ncclResult_t result = ncclSuccess; - struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); - while (cb != nullptr) { - struct ncclCommCallback* next = cb->next; - ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb - if (res1 != ncclSuccess) result = res1; - cb = next; - } - NCCLCHECK(result); - return ncclSuccess; -} - -inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { - int phase = comm->intraBarrierPhase; - if (comm->intraRanks == 1) { - // Release everyone (just me). - comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); - } else { - struct ncclComm* comm0 = comm->intraComm0; - uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); - if (uint32_t(count) == uint32_t(comm->intraRanks)) { - // Reset. - __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); - // Release everyone. - __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); - } - } -} - -// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) -inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { - struct ncclComm* comm0 = comm->intraComm0; - comm->intraBarrierPhase ^= 1; - uint32_t phase = comm->intraBarrierPhase; - uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); - if ((gate & 1) != phase) { - uint64_t t0 = clockNano(); - do { - // Spin vigorously for first 5us. - if (clockNano()-t0 >= 5*1000) sched_yield(); - gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); - } while ((gate & 1) != phase); - } - if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); - return gate>>32; -} - -// Scrambles the bits of non-builtin values of ncclRedOp_t according to the -// communicator memory address. Used to catch bugs so that integer handles -// associated with this communicator won't collide with handles of other -// communicatrs. This function is its own inverse. -static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { - // Preserve the built-in values. - if(int(op) < int(ncclNumOps)) - return op; - uint64_t h = reinterpret_cast(comm); - h ^= h >> 32; - h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant - h >>= 32; // h is now an excellent 32-bit hash of the comm pointer - h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 - int op1 = int(h) ^ int(op); - // Since builtin values are preserved, we also have to preserve their preimage. - return op1 < int(ncclNumOps) ? 
op : ncclRedOp_t(op1); -} - -ncclResult_t ncclCommEnsureReady(ncclComm_t comm); -ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); - -#endif diff --git a/nvls/core.h b/nvls/core.h deleted file mode 100644 index a1754beeb..000000000 --- a/nvls/core.h +++ /dev/null @@ -1,41 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CORE_H_ -#define NCCL_CORE_H_ - -#include -#include -#include -#include -#include // For std::min/std::max -#include "nccl.h" - -#ifdef PROFAPI -#define NCCL_API(ret, func, args...) \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((alias(#func))) \ - ret p##func (args); \ - extern "C" \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((weak)) \ - ret func(args) -#else -#define NCCL_API(ret, func, args...) \ - extern "C" \ - __attribute__ ((visibility("default"))) \ - ret func(args) -#endif // end PROFAPI - -#include "debug.h" -#include "checks.h" -#include "cudawrap.h" -#include "alloc.h" -#include "utils.h" -#include "param.h" -#include "nvtx.h" - -#endif // end include guard diff --git a/nvls/cpuset.h b/nvls/cpuset.h deleted file mode 100644 index ec55cbc54..000000000 --- a/nvls/cpuset.h +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CPUSET_H_ -#define NCCL_CPUSET_H_ - -// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t - -static int hexToInt(char c) { - int v = c - '0'; - if (v < 0) return -1; - if (v > 9) v = 10 + c - 'a'; - if ((v < 0) || (v > 15)) return -1; - return v; -} - -#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) - -static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { - uint32_t cpumasks[CPU_SET_N_U32]; - int m = CPU_SET_N_U32-1; - cpumasks[m] = 0; - for (int o=0; o<strlen(str); o++) { - char c = str[o]; - if (c == ',') { - m--; - cpumasks[m] = 0; - } else { - int v = hexToInt(c); - if (v == -1) break; - cpumasks[m] <<= 4; - cpumasks[m] += v; - } - } - // Copy cpumasks to mask - for (int a=0; m<CPU_SET_N_U32; a++,m++) { - memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t)); - } - return ncclSuccess; -} - -static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { - int c = 0; - uint8_t* m8 = (uint8_t*)mask; - for (int o=sizeof(cpu_set_t)-1; o>=0; o--) { - if (c == 0 && m8[o] == 0) continue; - sprintf(str+c, "%02x", m8[o]); - c+=2; - if (o && o%4 == 0) { - sprintf(str+c, ","); - c++; - } - } - str[c] = '\0'; - return ncclSuccess; -} - -#endif diff --git a/nvls/cudawrap.h b/nvls/cudawrap.h deleted file mode 100644 index cc363c1ac..000000000 --- a/nvls/cudawrap.h +++ /dev/null @@ -1,129 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
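For reference, a minimal standalone sketch of the hex-mask parsing idea used by ncclStrToCpuset above, assuming a single comma-separated group and a plain uint32_t rather than a full cpu_set_t (hexDigit and the sample string are illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

// Same digit conversion as hexToInt() above (lowercase hex only).
static int hexDigit(char c) {
  int v = c - '0';
  if (v < 0) return -1;
  if (v > 9) v = 10 + c - 'a';
  return (v < 0 || v > 15) ? -1 : v;
}

int main(void) {
  const char* str = "0003ff";   // one comma-separated group of local_cpus
  uint32_t mask = 0;
  for (int o = 0; str[o] != '\0'; o++) {
    int v = hexDigit(str[o]);
    if (v == -1) break;
    mask = (mask << 4) + v;     // shift in one nibble per hex character
  }
  printf("0x%x\n", mask);       // prints 0x3ff: CPUs 0..9 allowed
  return 0;
}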
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CUDAWRAP_H_ -#define NCCL_CUDAWRAP_H_ - -#include -#include -#include "checks.h" - -// Is cuMem API usage enabled -extern int ncclCuMemEnable(); - -#if CUDART_VERSION >= 11030 -#include -#else -typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); -typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); -typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); -#endif - -#define CUPFN(symbol) pfn_##symbol - -// Check CUDA PFN driver calls -#define CUCHECK(cmd) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure %d '%s'", err, errStr); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUCHECKGOTO(cmd, res, label) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure %d '%s'", err, errStr); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -// Report failure but clear error and continue -#define CUCHECKIGNORE(cmd) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ - } \ -} while(false) - -#define CUCHECKTHREAD(cmd, args) do { \ - CUresult err = pfn_##cmd; \ - if (err != CUDA_SUCCESS) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - -#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol - -#if CUDART_VERSION >= 11030 -/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); -DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); -// cuMem API support -DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); -DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); -#if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support -#endif -#if CUDA_VERSION >= 12010 -/* NVSwitch Multicast support */ -DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); 
-DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); -#endif -#endif - -/* CUDA Driver functions loaded with dlsym() */ -DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); -DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); -DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); - - -ncclResult_t ncclCudaLibraryInit(void); - -extern int ncclCudaDriverVersionCache; -extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() - -inline ncclResult_t ncclCudaDriverVersion(int* driver) { - int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); - if (version == -1) { - CUDACHECK(cudaDriverGetVersion(&version)); - __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); - } - *driver = version; - return ncclSuccess; -} -#endif diff --git a/nvls/debug.h b/nvls/debug.h deleted file mode 100644 index d10217856..000000000 --- a/nvls/debug.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_DEBUG_H_ -#define NCCL_INT_DEBUG_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include -#include -#include - -#include -#include -#include - -// Conform to pthread and NVTX standard -#define NCCL_THREAD_NAMELEN 16 - -extern int ncclDebugLevel; -extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugLock; -extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); - -void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); - -// Let code temporarily downgrade WARN into INFO -extern thread_local int ncclDebugNoWarn; -extern char ncclLastError[]; - -#define WARN(...) printf(__VA_ARGS__) -#define INFO(FLAGS, ...) printf(__VA_ARGS__) -#define TRACE_CALL(...) printf(__VA_ARGS__) - -#ifdef ENABLE_TRACE -#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::steady_clock::time_point ncclEpoch; -#else -#define TRACE(...) -#endif - -void ncclSetThreadName(pthread_t thread, const char *fmt, ...); - -#endif diff --git a/nvls/device.h b/nvls/device.h deleted file mode 100644 index 56f8039f3..000000000 --- a/nvls/device.h +++ /dev/null @@ -1,463 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_DEVICE_H_ -#define NCCL_DEVICE_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "align.h" -#include - -extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; - -extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; - -extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; - -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - -#include "net_device.h" - -enum ncclDevRedOp_t { - ncclDevSum, ncclDevProd, ncclDevMinMax, - ncclDevPreMulSum, ncclDevSumPostDiv, - ncclNumDevRedOps -}; -struct ncclDevRedOpFull { - ncclDevRedOp_t op; - ncclRedOp_t proxyOp; - bool scalarArgIsPtr; - uint64_t scalarArg; -}; - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -#define WARP_SIZE 32 -#define MAXCHANNELS 32 -#define NCCL_MAX_NTHREADS 640 -#define NCCL_SIMPLE_MAX_NTHREADS 512 -#define NCCL_LL_MAX_NTHREADS 512 -#define NCCL_LL_LINES_PER_THREAD 8 -#ifdef TEST_LL_CLEANUP -#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup -#define NCCL_LL_FLAG_MAX 0x100 -#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) -#else -#define NCCL_LL_CLEAN_MASK 0x7ffffff8 -#define NCCL_LL_FLAG(a) ((uint32_t)(a)) -#endif -// Make sure the clean mask will last for at least NCCL_NSTEPS -static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); - -#define NCCL_LL128_LINESIZE 128 -#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) -#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) - -#define NCCL_LL128_MAX_NTHREADS 640 -#define NCCL_LL128_ELEMS_PER_THREAD 120 - -#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 -#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) - -#define NCCL_DIRECT_WRITE 0x01 -#define NCCL_DIRECT_READ 0x02 -#define NCCL_DIRECT_NIC 0x04 -#define NCCL_IPC_WRITE 0x08 -#define NCCL_IPC_READ 0x10 -#define NCCL_NVLS_MIN_POLL 0x20 - -struct ncclConnInfo { - // Regular comm mechanism - char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send - void* mhandles[NCCL_NUM_PROTOCOLS]; - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - - int flags; // Direct communication / other flags - int shared; // Buffers are shared - void **ptrExchange; // Pointer exchange for direct communication - uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case - - int *sizesFifo; // Sizes fifo from GPU to proxy - int *offsFifo; // Buffer fifo from proxy to GPU - - uint64_t step; // Keep where we are - uint64_t llLastCleaning; - ncclNetDeviceHandle_t netDeviceHandle; -}; - -struct ncclProxyConnector { - int tpRank; - int tpLocalRank; - int sameProcess; - struct ncclProxyConnection* connection; - ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary -}; - -struct ncclConnector { - int connected; - struct ncclProxyConnector proxyConn; - struct ncclTransportComm* transportComm; - void* transportResources; - struct ncclConnInfo conn; -}; - -struct ncclRing { - // Shortcuts for 
userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - - int index; // This rank's index in the ring -}; - - -// The root of each tree only has one node down (+1 intra-node). -#define NCCL_MAX_TREE_ARITY_TOP 2 -// Nodes inside the binary tree can have to two nodes down (+1 intra-node). -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -#define NCCL_MAX_DIRECT_ARITY 7 -struct ncclDirect { - int depth; - int out; - int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down - int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) - int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads - int up[NCCL_MAX_DIRECT_ARITY]; - int down[NCCL_MAX_DIRECT_ARITY]; -}; - -#define NCCL_MAX_NVLS_ARITY 8 -#define NCCL_MAX_NVLS_TREE_ARITY 3 -struct ncclNvls { - int out; - int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down - int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) - int up[NCCL_MAX_NVLS_ARITY]; - int down; - int treeUp; - int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; -}; - -#define NCCL_MAX_CONNS 2 -struct ncclChannelPeer { - struct ncclConnector send[NCCL_MAX_CONNS]; - struct ncclConnector recv[NCCL_MAX_CONNS]; - int refCount; -}; - -struct ncclDevComm; - -/* ncclWork is to be a power of two, currently 8x64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclWorkElem. */ -#define NCCL_WORK_SIZE 512 - -enum ncclWorkType : uint8_t { - ncclWorkTypeUnused=0, - ncclWorkTypeColl=1, - ncclWorkTypeP2p=2, - ncclWorkTypeRegColl=3 -}; -enum ncclWorkP2PType : uint8_t { - ncclWorkP2pTypeUnused=0, - ncclWorkP2pTypeSend, - ncclWorkP2pTypeRecv -}; - -struct ncclWorkHeader { - union { - int32_t workNext; // when isLast=0: Offset from kernel argument workHead - uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. - }; - uint16_t funcIndex; - uint8_t isLast:1; // last work for this kernel - uint8_t inFifo:1; // is this work in the fifo - enum ncclWorkType type; -}; - -struct ncclWorkElem { - union { - uint8_t flagBits; - struct { - uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; - }; - }; - uint8_t nWarps; - uint8_t direct; - - const void * sendbuff; - void * recvbuff; - - size_t count; - size_t lastChunkSize; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint64_t redOpArg; -}; - -#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) -static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); - -struct ncclWorkElemP2p { - int peer : 30; - int proto : 2; - - enum ncclWorkP2PType p2pType; - uint8_t nWarps; - uint8_t warpStart; - uint8_t ngroups; - // Important not to use any fields with greater than 4-byte alignment since - // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if - // there were 8-byte fields. 
- //void* buff; - uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; - //size_t count; - uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; - int chunkSize; -}; - -static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); -#define NCCL_MAX_WORK_ELEMENTS_P2P 16 - -struct ncclWorkElemReg { - struct ncclWorkElem elem; - void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; - void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; - void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; -}; - -#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) -static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); - -// Number of named barriers supported by CUDA -#define NCCL_MAX_GROUPS 16 - -struct ncclWork { - struct ncclWorkHeader header; - union { - char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; - struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; - struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; - }; -}; -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); -static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); - -struct ncclDevChannelPeer { - // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo - // instead of the full ncclConnector. - struct ncclConnInfo send[NCCL_MAX_CONNS]; - struct ncclConnInfo recv[NCCL_MAX_CONNS]; -}; - -struct alignas(16) ncclDevChannel { - struct ncclDevChannelPeer** peers; - struct ncclRing ring; - struct ncclTree tree; - struct ncclTree collnetChain; - struct ncclDirect collnetDirect; - struct ncclNvls nvls; - uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed -}; - -struct ncclDevComm { - int rank; - int nRanks; - int buffSizes[NCCL_NUM_PROTOCOLS]; - int p2pChunkSize; - - // Operation list for aggregation - int workFifoDepth; - struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory - - // Flag to ask NCCL kernels to abort - volatile uint32_t* abortFlag; - - // Channels, device side - struct ncclDevChannel* channels/*[MAXCHANNELS]*/; -}; - -struct alignas(16) ncclDevCommAndChannels { - struct ncclDevComm comm; - struct ncclDevChannel channels[MAXCHANNELS]; -}; - -#ifdef __CUDA_ARCH__ - #define NCCL_CUDA_ARCH __CUDA_ARCH__ -#else - #define NCCL_CUDA_ARCH 0 -#endif - -template -__host__ __device__ constexpr T min_constexpr(T a) { return a; } -template -__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { - return min_constexpr((a < b ? a : b), c...); -} - -template -__host__ __device__ constexpr T max_constexpr(T a) { return a; } -template -__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { - return max_constexpr((a > b ? 
a : b), c...); -} - -// Calculate the unroll factor given: -// * bytePerPack: number of bytes accessed per instruction -// * insns: max permissible unroll value -// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) -__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { - return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); -} - -// Note that all unroll value logic should depend on a given cudaArch argument -// and not __CUDA_ARCH__ since these need to be host-side executable where the -// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device -// side code can elide passing the arch for brevity. - -__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { - // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; -} - -__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } -__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } - -__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { - return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); -} - -// The amount of dynamic shmem per warp -__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { - return (max_constexpr( - /*LL */0, - /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), - /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, - // NVLS needs an extra 16B to read unaligned data. - /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 - ) + 15) & -16; // pad to 16 bytes -} - -// The amount of dynamic shmem per block -__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { - return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); -} - -// Host-side table of kernel function pointers. -extern int const ncclDevKernelCount; -extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; - -// Table of most specialized kernel function to run given func index. -extern int const ncclDevFuncRowToId[]; -extern void* const ncclDevKernelForFunc[/*funcIndex*/]; -extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; - -// Launch a one-rank reduction on stream. 
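To make the scratch-size formula above concrete, here is a standalone re-derivation using the same constants the header defines (WARP_SIZE 32, 8 LL128 shmem elems per thread, unroll 8 on sm_80 and newer, 64-byte NVLS unroll on sm_90); the helper names and printed values are illustrative arithmetic, not figures taken from the patch:

#include <cstdio>

constexpr int kWarpSize = 32;
constexpr int kLL128ElemsPerThread = 8;            // NCCL_LL128_SHMEM_ELEMS_PER_THREAD
constexpr int kNvlsUnrollBytes = 4 * 16;
constexpr int collUnroll(int arch) { return arch >= 800 ? 8 : 4; }

// Mirrors ncclShmemScratchWarpSize(): max of the per-protocol needs, padded to 16 bytes.
constexpr int scratchWarpSize(int arch) {
  int ll128  = kLL128ElemsPerThread * kWarpSize * 8;                  // 2048 (uint64_t elems)
  int simple = (collUnroll(arch) * kWarpSize + 1) * 16;               // 4112 on sm_80+, 2064 below
  int nvls   = kWarpSize * (arch >= 900 ? kNvlsUnrollBytes : 0) + 16; // 2064 on sm_90, else 16
  int m = ll128 > simple ? ll128 : simple;
  if (nvls > m) m = nvls;
  return (m + 15) & -16;
}

int main() {
  std::printf("sm_90: %d bytes/warp\n", scratchWarpSize(900)); // 4112
  std::printf("sm_70: %d bytes/warp\n", scratchWarpSize(700)); // 2064
}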
-ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); - -// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" -inline bool ncclNvlsSupported(int devRedOp, int type) { - switch (type) { - case ncclInt32: - case ncclUint32: - case ncclInt64: - case ncclUint64: - case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - #endif - return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; - case ncclFloat: - case ncclDouble: - return devRedOp == ncclDevSum; - default: - return false; - } -} - -// `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" -inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { - #if defined(__CUDA_BF16_TYPES_EXIST__) - constexpr int NumTypes = ncclNumTypes; - #else - constexpr int NumTypes = ncclNumTypes + 1; - #endif - - int row = 0; // ncclDevFuncIndex_P2p - if (coll == ncclFuncSendRecv) goto have_row; - row += 1; - - if (coll == ncclFuncAllGather) { - int algo1 = algo == NCCL_ALGO_RING ? 0 : - /*algo == NCCL_ALGO_NVLS*/ 1; - row += algo1*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncBroadcast) { - row += proto; - goto have_row; - } - row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncAllReduce) { - row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncReduce) { - row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncReduceScatter) { - int algo1 = algo == NCCL_ALGO_RING ? 0 : - /*algo == NCCL_ALGO_NVLS*/ 1; - row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; - -have_row: - return ncclDevFuncRowToId[row]; -} - -inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } - -#endif diff --git a/nvls/enqueue.h b/nvls/enqueue.h deleted file mode 100644 index 634f037cb..000000000 --- a/nvls/enqueue.h +++ /dev/null @@ -1,26 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ENQUEUE_H_ -#define NCCL_ENQUEUE_H_ - -#include "comm.h" -#include "group.h" -#include "collectives.h" -#include "utils.h" - -#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) -#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ - -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); -ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); -ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); -ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchFinish(struct ncclComm* comm); - -#endif // End include guard diff --git a/nvls/gdrwrap.h b/nvls/gdrwrap.h deleted file mode 100644 index a64674cc5..000000000 --- a/nvls/gdrwrap.h +++ /dev/null @@ -1,252 +0,0 @@ -/************************************************************************* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GDRWRAP_H_ -#define NCCL_GDRWRAP_H_ - -#include "nccl.h" -#include // for standard [u]intX_t types -#include -#include - -// These can be used if the GDR library isn't thread safe -#include -extern pthread_mutex_t gdrLock; -#define GDRLOCK() pthread_mutex_lock(&gdrLock) -#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) -#define GDRLOCKCALL(cmd, ret) do { \ - GDRLOCK(); \ - ret = cmd; \ - GDRUNLOCK(); \ -} while(false) - -#define GDRCHECK(cmd) do { \ - int e; \ - /* GDRLOCKCALL(cmd, e); */ \ - e = cmd; \ - if( e != 0 ) { \ - WARN("GDRCOPY failure %d", e); \ - return ncclSystemError; \ - } \ -} while(false) - -// This is required as the GDR memory is mapped WC -#if !defined(__NVCC__) -#if defined(__PPC__) -static inline void wc_store_fence(void) { asm volatile("sync") ; } -#elif defined(__x86_64__) -#include -static inline void wc_store_fence(void) { _mm_sfence(); } -#elif defined(__aarch64__) -#ifdef __cplusplus -#include -static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } -#else -#include -static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } -#endif -#endif -#endif - -//#define GDR_DIRECT 1 -#ifdef GDR_DIRECT -// Call the GDR API library code directly rather than via -// dlopen() wrappers -#include - -static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } -static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } -static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } -static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { - GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { - GDRCHECK(gdr_unpin_buffer(g, handle)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { - GDRCHECK(gdr_get_info(g, handle, info)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void 
**va, size_t size) { - GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { - GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size)); - return ncclSuccess; -} -static void wrap_gdr_runtime_get_version(int *major, int *minor) { - gdr_runtime_get_version(major, minor); - return ncclSuccess; -} -static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { - gdr_driver_get_version(g, major, minor); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { - GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { - GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size)); - return ncclSuccess; -} - -#else -// Dynamically handle dependency the GDR API library - -/* Extracted from gdrapi.h (v2.1 Nov 2020) */ - -#define GPU_PAGE_SHIFT 16 -#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) -#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) -#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) - -struct gdr; -typedef struct gdr *gdr_t; - -typedef struct gdr_mh_s { - unsigned long h; -} gdr_mh_t; - -struct gdr_info { - uint64_t va; - uint64_t mapped_size; - uint32_t page_size; - uint64_t tm_cycles; - uint32_t cycles_per_ms; - unsigned mapped:1; - unsigned wc_mapping:1; -}; -typedef struct gdr_info gdr_info_t; - -/* End of gdrapi.h */ - -ncclResult_t wrap_gdr_symbols(void); - -gdr_t wrap_gdr_open(void); -ncclResult_t wrap_gdr_close(gdr_t g); -ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); -ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); -ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); -ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size); -ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size); -ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor); -ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor); -ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); -ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); - -#endif // GDR_DIRECT - -// Global GDR driver handle -extern gdr_t ncclGdrCopy; - -#include "alloc.h" - -typedef struct gdr_mem_desc { - void *gdrDevMem; - void *gdrMap; - size_t gdrOffset; - size_t gdrMapSize; - gdr_mh_t gdrMh; -} gdr_mem_desc_t; - -static gdr_t ncclGdrInit() { - int libMajor, libMinor, drvMajor, drvMinor; - gdr_t handle = NULL; - // Dynamically load the GDRAPI library symbols - if (wrap_gdr_symbols() == ncclSuccess) { - handle = wrap_gdr_open(); - - if (handle != NULL) { - ncclResult_t res; - - // Query the version of libgdrapi - NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); - - // Query the version of gdrdrv driver - NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); - - // Only support GDRAPI 2.1 and later - if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { - goto error; - } - else - INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); 
- } - } - return handle; -error: - if (handle != NULL) (void) wrap_gdr_close(handle); - return NULL; -} - -template -static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { - gdr_info_t info; - size_t mapSize; - gdr_mh_t mh; - char *devMem; - void *gdrMap; - - mapSize = sizeof(T)*nelem; - - // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE - ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); - // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too - NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); - uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; - size_t align = alignedAddr - (uint64_t)devMem; - - //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); - NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); - - NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); - //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); - - NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); - - // Will offset ever be non zero ? - ssize_t off = info.va - alignedAddr; - - gdr_mem_desc_t* md; - NCCLCHECK(ncclCalloc(&md, 1)); - md->gdrDevMem = devMem; - md->gdrMap = gdrMap; - md->gdrMapSize = mapSize; - md->gdrOffset = off+align; - md->gdrMh = mh; - *gdrHandle = md; - - *ptr = (T *)((char *)gdrMap+off); - if (devPtr) *devPtr = (T *)(devMem+off+align); - - TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", - md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); - - return ncclSuccess; -} - -template -static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { - gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); - return ncclSuccess; -} - -static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { - gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); - NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); - NCCLCHECK(ncclCudaFree(md->gdrDevMem)); - free(md); - - return ncclSuccess; -} - -#endif // End include guard diff --git a/nvls/graph.h b/nvls/graph.h deleted file mode 100644 index fdd634894..000000000 --- a/nvls/graph.h +++ /dev/null @@ -1,116 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
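The GDRCopy path above pins a region that must start on a 64 KiB GPU page; below is a small standalone sketch of the rounding arithmetic ncclGdrCudaCalloc performs, using the GPU_PAGE constants from gdrwrap.h (the sample device address is made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define GPU_PAGE_SHIFT  16
#define GPU_PAGE_SIZE   (1UL << GPU_PAGE_SHIFT)      /* 64 KiB */
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK   (~GPU_PAGE_OFFSET)

int main(void) {
  uint64_t devMem = 0x7f0000012345UL;   /* pretend allocation start, not page aligned */
  /* Round up to the next 64 KiB boundary, as ncclGdrCudaCalloc does before pinning. */
  uint64_t alignedAddr = (devMem + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
  uint64_t align = alignedAddr - devMem; /* bytes skipped at the front of the buffer */
  printf("aligned=0x%llx skipped=%llu\n",
         (unsigned long long)alignedAddr, (unsigned long long)align);
  /* prints aligned=0x7f0000020000 skipped=56507 */
  return 0;
}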
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GRAPH_H_ -#define NCCL_GRAPH_H_ - -#include "nccl.h" -#include "device.h" -#include -#include -#include -#include -#include - -ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); - -struct ncclTopoSystem; -// Build the topology -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); -ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); -ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); - -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); -void ncclTopoFree(struct ncclTopoSystem* system); -ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); -ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); -ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); - -// Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); -ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); -int ncclPxnDisable(struct ncclComm* comm); -ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); - -// Find CPU affinity -ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); - -#define NCCL_TOPO_CPU_ARCH_X86 1 -#define NCCL_TOPO_CPU_ARCH_POWER 2 -#define NCCL_TOPO_CPU_ARCH_ARM 3 -#define NCCL_TOPO_CPU_VENDOR_INTEL 1 -#define NCCL_TOPO_CPU_VENDOR_AMD 2 -#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 -ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); -ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); -ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); - -#define NCCL_TOPO_MAX_NODES 256 - -// Init search. 
Needs to be done before calling ncclTopoCompute -ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); - -#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) -#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) -#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU -#define NCCL_TOPO_PATTERN_RING 4 // Ring -#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree -struct ncclTopoGraph { - // Input / output - int id; // ring : 0, tree : 1, collnet : 2 - int pattern; - int crossNic; - int collNet; - int minChannels; - int maxChannels; - // Output - int nChannels; - float bwIntra; - float bwInter; - float latencyInter; - int typeIntra; - int typeInter; - int sameChannels; - int nHops; - int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; - int inter[MAXCHANNELS*2]; -}; -ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); - -ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); -ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); - -struct ncclTopoRanks { - int ringRecv[MAXCHANNELS]; - int ringSend[MAXCHANNELS]; - int ringPrev[MAXCHANNELS]; - int ringNext[MAXCHANNELS]; - int treeToParent[MAXCHANNELS]; - int treeToChild0[MAXCHANNELS]; - int treeToChild1[MAXCHANNELS]; - int nvlsHeads[MAXCHANNELS]; -}; - -ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); - -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, - struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs); - -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -#include "info.h" -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); - -#endif diff --git a/nvls/group.h b/nvls/group.h deleted file mode 100644 index 72251147f..000000000 --- a/nvls/group.h +++ /dev/null @@ -1,137 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GROUP_H_ -#define NCCL_GROUP_H_ - -#include "nccl.h" -#include "comm.h" - -ncclResult_t ncclGroupErrCheck(ncclResult_t ret); -void ncclGroupCommJoin(struct ncclComm* comm); -void ncclGroupCommPreconnect(struct ncclComm* comm); -ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); -ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); -ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); - -typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); - -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); - -typedef enum ncclGroupJobState { - ncclGroupJobRunning = 0, - ncclGroupJobDone = 1, - ncclGroupJobJoined = 2, -} ncclGroupJobState_t; - -struct ncclAsyncJob { - struct ncclAsyncJob* next; - pthread_t thread; - ncclResult_t result; - ncclResult_t(*func)(struct ncclAsyncJob*); - void(*undo)(struct ncclAsyncJob*); - void(*destructor)(void*); - ncclGroupJobState_t state; - volatile uint32_t *abortFlag; /* point to comm abortFlag */ - volatile uint32_t *childAbortFlag; /* point to child abortFlag */ - ncclComm_t comm; -}; - -ncclResult_t ncclAsyncLaunch( - struct ncclAsyncJob* job, - ncclResult_t(*func)(struct ncclAsyncJob*), - void(*undo)(struct ncclAsyncJob*), - void(*destructor)(void*), ncclComm_t comm -); - -struct ncclGroupJob { - struct ncclAsyncJob base; - struct ncclComm **groupCommHeadPtr; - struct ncclComm **groupCommPreconnectHeadPtr; - ncclResult_t *groupErrorPtr; - volatile bool *abortFlagPtr; - int *groupBlockingPtr; - struct ncclIntruQueue *asyncJobsPtr; - bool initialized; -}; - -ncclResult_t ncclGroupStartInternal(); -ncclResult_t ncclGroupEndInternal(); -ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); - -//////////////////////////////////////////////////////////////////////////////// - -extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting -extern __thread ncclResult_t ncclGroupError; -extern __thread struct ncclComm* ncclGroupCommHead; -extern __thread struct ncclComm* ncclGroupCommPreconnectHead; -extern __thread int ncclGroupBlocking; -extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; -extern __thread struct ncclGroupJob ncclGroupJobMain; - -static inline void groupResetJobState() { - ncclGroupBlocking = -1; - ncclGroupJobMainPtr = NULL; - memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); - return; -} - -static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { - ncclResult_t ret = ncclSuccess; - if (job) { - ret = ncclAsyncJobComplete(&job->base); - groupResetJobState(); - } - return ret; -} - -inline ncclResult_t ncclGroupStartInternal() { - ncclGroupDepth++; - return ncclSuccess; -} - -inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { - if (ncclGroupDepth > 0) { - if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; - } - return ret; -} - -// Add comm to this thread's group -inline void ncclGroupCommJoin(struct ncclComm* comm) { - if (comm->groupNext == reinterpret_cast(0x1)) { - // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves - // the users program order yet insures siblings occur consecutively. This - // is required by doLaunches() in "group.cc". 
- struct ncclComm** pp = &ncclGroupCommHead; - while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) - pp = &(*pp)->groupNext; - comm->groupNext = *pp; - *pp = comm; - // Comms gets a new memory stack scope upon joining. Each task batched for - // this comm is allocated there. - ncclMemoryStackPush(&comm->memScoped); - } - - ncclGroupBlocking = comm->config.blocking; -} - -// Add comm to this thread's group needing preconnect -inline void ncclGroupCommPreconnect(struct ncclComm* comm) { - if (comm->preconnectNext == reinterpret_cast(0x1)) { - comm->preconnectNext = ncclGroupCommPreconnectHead; - ncclGroupCommPreconnectHead = comm; - } -} - -// Comm has left group -inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { - comm->groupNext = reinterpret_cast(0x1); - ncclMemoryStackPop(&comm->memScoped); - return ncclSuccess; -} - -#endif diff --git a/nvls/ibvcore.h b/nvls/ibvcore.h deleted file mode 100644 index 8d8ecf1ec..000000000 --- a/nvls/ibvcore.h +++ /dev/null @@ -1,1058 +0,0 @@ -#ifndef NCCL_IBV_CORE_H_ -#define NCCL_IBV_CORE_H_ - -/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without - * explicit including of IB verbs header. - */ - -#include -#include -#include -#include - -#if __GNUC__ >= 3 -# define __attribute_const __attribute__((const)) -#else -# define __attribute_const -#endif - -union ibv_gid { - uint8_t raw[16]; - struct { - uint64_t subnet_prefix; - uint64_t interface_id; - } global; -}; - -#ifndef container_of -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. - * - */ -#define container_of(ptr, type, member) \ - ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) -#endif - -#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; - -enum ibv_node_type { - IBV_NODE_UNKNOWN = -1, - IBV_NODE_CA = 1, - IBV_NODE_SWITCH, - IBV_NODE_ROUTER, - IBV_NODE_RNIC, - - /* Leave a gap for future node types before starting with - * experimental node types. - */ - IBV_EXP_NODE_TYPE_START = 32, - IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START -}; - -enum ibv_transport_type { - IBV_TRANSPORT_UNKNOWN = -1, - IBV_TRANSPORT_IB = 0, - IBV_TRANSPORT_IWARP, - - /* Leave a gap for future transport types before starting with - * experimental transport types. 
- */ - IBV_EXP_TRANSPORT_TYPE_START = 32, - IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START -}; - -enum ibv_device_cap_flags { - IBV_DEVICE_RESIZE_MAX_WR = 1, - IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, - IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, - IBV_DEVICE_RAW_MULTI = 1 << 3, - IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, - IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, - IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, - IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, - IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, - IBV_DEVICE_INIT_TYPE = 1 << 9, - IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, - IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, - IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, - IBV_DEVICE_SRQ_RESIZE = 1 << 13, - IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, - IBV_DEVICE_XRC = 1 << 20, - IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 -}; - -enum ibv_atomic_cap { - IBV_ATOMIC_NONE, - IBV_ATOMIC_HCA, - IBV_ATOMIC_GLOB -}; - -struct ibv_device_attr { - char fw_ver[64]; - uint64_t node_guid; - uint64_t sys_image_guid; - uint64_t max_mr_size; - uint64_t page_size_cap; - uint32_t vendor_id; - uint32_t vendor_part_id; - uint32_t hw_ver; - int max_qp; - int max_qp_wr; - int device_cap_flags; - int max_sge; - int max_sge_rd; - int max_cq; - int max_cqe; - int max_mr; - int max_pd; - int max_qp_rd_atom; - int max_ee_rd_atom; - int max_res_rd_atom; - int max_qp_init_rd_atom; - int max_ee_init_rd_atom; - enum ibv_atomic_cap atomic_cap; - int max_ee; - int max_rdd; - int max_mw; - int max_raw_ipv6_qp; - int max_raw_ethy_qp; - int max_mcast_grp; - int max_mcast_qp_attach; - int max_total_mcast_qp_attach; - int max_ah; - int max_fmr; - int max_map_per_fmr; - int max_srq; - int max_srq_wr; - int max_srq_sge; - uint16_t max_pkeys; - uint8_t local_ca_ack_delay; - uint8_t phys_port_cnt; -}; - -enum ibv_mtu { - IBV_MTU_256 = 1, - IBV_MTU_512 = 2, - IBV_MTU_1024 = 3, - IBV_MTU_2048 = 4, - IBV_MTU_4096 = 5 -}; - -enum ibv_port_state { - IBV_PORT_NOP = 0, - IBV_PORT_DOWN = 1, - IBV_PORT_INIT = 2, - IBV_PORT_ARMED = 3, - IBV_PORT_ACTIVE = 4, - IBV_PORT_ACTIVE_DEFER = 5 -}; - -enum { - IBV_LINK_LAYER_UNSPECIFIED, - IBV_LINK_LAYER_INFINIBAND, - IBV_LINK_LAYER_ETHERNET, - - /* Leave a gap for future link layer types before starting with - * experimental link layer. 
- */ - IBV_EXP_LINK_LAYER_START = 32, - IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START -}; - -enum ibv_port_cap_flags { - IBV_PORT_SM = 1 << 1, - IBV_PORT_NOTICE_SUP = 1 << 2, - IBV_PORT_TRAP_SUP = 1 << 3, - IBV_PORT_OPT_IPD_SUP = 1 << 4, - IBV_PORT_AUTO_MIGR_SUP = 1 << 5, - IBV_PORT_SL_MAP_SUP = 1 << 6, - IBV_PORT_MKEY_NVRAM = 1 << 7, - IBV_PORT_PKEY_NVRAM = 1 << 8, - IBV_PORT_LED_INFO_SUP = 1 << 9, - IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, - IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, - IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, - IBV_PORT_CM_SUP = 1 << 16, - IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, - IBV_PORT_REINIT_SUP = 1 << 18, - IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, - IBV_PORT_VENDOR_CLASS = 1 << 24, - IBV_PORT_CLIENT_REG_SUP = 1 << 25, - IBV_PORT_IP_BASED_GIDS = 1 << 26, -}; - -struct ibv_port_attr { - enum ibv_port_state state; - enum ibv_mtu max_mtu; - enum ibv_mtu active_mtu; - int gid_tbl_len; - uint32_t port_cap_flags; - uint32_t max_msg_sz; - uint32_t bad_pkey_cntr; - uint32_t qkey_viol_cntr; - uint16_t pkey_tbl_len; - uint16_t lid; - uint16_t sm_lid; - uint8_t lmc; - uint8_t max_vl_num; - uint8_t sm_sl; - uint8_t subnet_timeout; - uint8_t init_type_reply; - uint8_t active_width; - uint8_t active_speed; - uint8_t phys_state; - uint8_t link_layer; - uint8_t reserved; -}; - -enum ibv_event_type { - IBV_EVENT_CQ_ERR, - IBV_EVENT_QP_FATAL, - IBV_EVENT_QP_REQ_ERR, - IBV_EVENT_QP_ACCESS_ERR, - IBV_EVENT_COMM_EST, - IBV_EVENT_SQ_DRAINED, - IBV_EVENT_PATH_MIG, - IBV_EVENT_PATH_MIG_ERR, - IBV_EVENT_DEVICE_FATAL, - IBV_EVENT_PORT_ACTIVE, - IBV_EVENT_PORT_ERR, - IBV_EVENT_LID_CHANGE, - IBV_EVENT_PKEY_CHANGE, - IBV_EVENT_SM_CHANGE, - IBV_EVENT_SRQ_ERR, - IBV_EVENT_SRQ_LIMIT_REACHED, - IBV_EVENT_QP_LAST_WQE_REACHED, - IBV_EVENT_CLIENT_REREGISTER, - IBV_EVENT_GID_CHANGE, - - /* new experimental events start here leaving enough - * room for 14 events which should be enough - */ - IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, - IBV_EXP_EVENT_DCT_ACCESS_ERR, - IBV_EXP_EVENT_DCT_REQ_ERR, -}; - -struct ibv_async_event { - union { - struct ibv_cq *cq; - struct ibv_qp *qp; - struct ibv_srq *srq; - struct ibv_exp_dct *dct; - int port_num; - /* For source compatible with Legacy API */ - uint32_t xrc_qp_num; - } element; - enum ibv_event_type event_type; -}; - -enum ibv_wc_status { - IBV_WC_SUCCESS, - IBV_WC_LOC_LEN_ERR, - IBV_WC_LOC_QP_OP_ERR, - IBV_WC_LOC_EEC_OP_ERR, - IBV_WC_LOC_PROT_ERR, - IBV_WC_WR_FLUSH_ERR, - IBV_WC_MW_BIND_ERR, - IBV_WC_BAD_RESP_ERR, - IBV_WC_LOC_ACCESS_ERR, - IBV_WC_REM_INV_REQ_ERR, - IBV_WC_REM_ACCESS_ERR, - IBV_WC_REM_OP_ERR, - IBV_WC_RETRY_EXC_ERR, - IBV_WC_RNR_RETRY_EXC_ERR, - IBV_WC_LOC_RDD_VIOL_ERR, - IBV_WC_REM_INV_RD_REQ_ERR, - IBV_WC_REM_ABORT_ERR, - IBV_WC_INV_EECN_ERR, - IBV_WC_INV_EEC_STATE_ERR, - IBV_WC_FATAL_ERR, - IBV_WC_RESP_TIMEOUT_ERR, - IBV_WC_GENERAL_ERR -}; -const char *ibv_wc_status_str(enum ibv_wc_status status); - -enum ibv_wc_opcode { - IBV_WC_SEND, - IBV_WC_RDMA_WRITE, - IBV_WC_RDMA_READ, - IBV_WC_COMP_SWAP, - IBV_WC_FETCH_ADD, - IBV_WC_BIND_MW, -/* - * Set value of IBV_WC_RECV so consumers can test if a completion is a - * receive by testing (opcode & IBV_WC_RECV). 
- */ - IBV_WC_RECV = 1 << 7, - IBV_WC_RECV_RDMA_WITH_IMM -}; - -enum ibv_wc_flags { - IBV_WC_GRH = 1 << 0, - IBV_WC_WITH_IMM = 1 << 1 -}; - -struct ibv_wc { - uint64_t wr_id; - enum ibv_wc_status status; - enum ibv_wc_opcode opcode; - uint32_t vendor_err; - uint32_t byte_len; - uint32_t imm_data; /* in network byte order */ - uint32_t qp_num; - uint32_t src_qp; - int wc_flags; - uint16_t pkey_index; - uint16_t slid; - uint8_t sl; - uint8_t dlid_path_bits; -}; - -enum ibv_access_flags { - IBV_ACCESS_LOCAL_WRITE = 1, - IBV_ACCESS_REMOTE_WRITE = (1<<1), - IBV_ACCESS_REMOTE_READ = (1<<2), - IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4), - IBV_ACCESS_RELAXED_ORDERING = (1<<20), -}; - -struct ibv_pd { - struct ibv_context *context; - uint32_t handle; -}; - -enum ibv_xrcd_init_attr_mask { - IBV_XRCD_INIT_ATTR_FD = 1 << 0, - IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, - IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_xrcd_init_attr { - uint32_t comp_mask; - int fd; - int oflags; -}; - -struct ibv_xrcd { - struct ibv_context *context; -}; - -enum ibv_rereg_mr_flags { - IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), - IBV_REREG_MR_CHANGE_PD = (1 << 1), - IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), - IBV_REREG_MR_KEEP_VALID = (1 << 3) -}; - -struct ibv_mr { - struct ibv_context *context; - struct ibv_pd *pd; - void *addr; - size_t length; - uint32_t handle; - uint32_t lkey; - uint32_t rkey; -}; - -enum ibv_mw_type { - IBV_MW_TYPE_1 = 1, - IBV_MW_TYPE_2 = 2 -}; - -struct ibv_mw { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t rkey; -}; - -struct ibv_global_route { - union ibv_gid dgid; - uint32_t flow_label; - uint8_t sgid_index; - uint8_t hop_limit; - uint8_t traffic_class; -}; - -struct ibv_grh { - uint32_t version_tclass_flow; - uint16_t paylen; - uint8_t next_hdr; - uint8_t hop_limit; - union ibv_gid sgid; - union ibv_gid dgid; -}; - -enum ibv_rate { - IBV_RATE_MAX = 0, - IBV_RATE_2_5_GBPS = 2, - IBV_RATE_5_GBPS = 5, - IBV_RATE_10_GBPS = 3, - IBV_RATE_20_GBPS = 6, - IBV_RATE_30_GBPS = 4, - IBV_RATE_40_GBPS = 7, - IBV_RATE_60_GBPS = 8, - IBV_RATE_80_GBPS = 9, - IBV_RATE_120_GBPS = 10, - IBV_RATE_14_GBPS = 11, - IBV_RATE_56_GBPS = 12, - IBV_RATE_112_GBPS = 13, - IBV_RATE_168_GBPS = 14, - IBV_RATE_25_GBPS = 15, - IBV_RATE_100_GBPS = 16, - IBV_RATE_200_GBPS = 17, - IBV_RATE_300_GBPS = 18 -}; - -/** - * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the - * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be - * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. - * @rate: rate to convert. - */ -int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; - -/** - * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. - * @mult: multiple to convert. - */ -enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; - -/** - * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. - * For example, IBV_RATE_5_GBPS will return the value 5000. - * @rate: rate to convert. - */ -int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; - -/** - * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. - * @mbps: value to convert. 
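As the comment above explains, IBV_WC_RECV is placed at 1 << 7 so that every receive-side completion (including IBV_WC_RECV_RDMA_WITH_IMM) carries that bit, letting a consumer classify a polled completion with a single mask test. A minimal sketch of the intended check:

// Assumes 'wc' was filled in by a prior poll of a completion queue.
static inline bool wcIsReceive(const struct ibv_wc* wc) {
  // Covers both IBV_WC_RECV and IBV_WC_RECV_RDMA_WITH_IMM in one test.
  return (wc->opcode & IBV_WC_RECV) != 0;
}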
- */ -enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; - -struct ibv_ah_attr { - struct ibv_global_route grh; - uint16_t dlid; - uint8_t sl; - uint8_t src_path_bits; - uint8_t static_rate; - uint8_t is_global; - uint8_t port_num; -}; - -enum ibv_srq_attr_mask { - IBV_SRQ_MAX_WR = 1 << 0, - IBV_SRQ_LIMIT = 1 << 1 -}; - -struct ibv_srq_attr { - uint32_t max_wr; - uint32_t max_sge; - uint32_t srq_limit; -}; - -struct ibv_srq_init_attr { - void *srq_context; - struct ibv_srq_attr attr; -}; - -enum ibv_srq_type { - IBV_SRQT_BASIC, - IBV_SRQT_XRC -}; - -enum ibv_srq_init_attr_mask { - IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, - IBV_SRQ_INIT_ATTR_PD = 1 << 1, - IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, - IBV_SRQ_INIT_ATTR_CQ = 1 << 3, - IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_srq_init_attr_ex { - void *srq_context; - struct ibv_srq_attr attr; - - uint32_t comp_mask; - enum ibv_srq_type srq_type; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; - struct ibv_cq *cq; -}; - -enum ibv_qp_type { - IBV_QPT_RC = 2, - IBV_QPT_UC, - IBV_QPT_UD, - /* XRC compatible code */ - IBV_QPT_XRC, - IBV_QPT_RAW_PACKET = 8, - IBV_QPT_RAW_ETH = 8, - IBV_QPT_XRC_SEND = 9, - IBV_QPT_XRC_RECV, - - /* Leave a gap for future qp types before starting with - * experimental qp types. - */ - IBV_EXP_QP_TYPE_START = 32, - IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START -}; - -struct ibv_qp_cap { - uint32_t max_send_wr; - uint32_t max_recv_wr; - uint32_t max_send_sge; - uint32_t max_recv_sge; - uint32_t max_inline_data; -}; - -struct ibv_qp_init_attr { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - /* Below is needed for backwards compatabile */ - struct ibv_xrc_domain *xrc_domain; -}; - -enum ibv_qp_init_attr_mask { - IBV_QP_INIT_ATTR_PD = 1 << 0, - IBV_QP_INIT_ATTR_XRCD = 1 << 1, - IBV_QP_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_qp_init_attr_ex { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - - uint32_t comp_mask; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; -}; - -enum ibv_qp_open_attr_mask { - IBV_QP_OPEN_ATTR_NUM = 1 << 0, - IBV_QP_OPEN_ATTR_XRCD = 1 << 1, - IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, - IBV_QP_OPEN_ATTR_TYPE = 1 << 3, - IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_qp_open_attr { - uint32_t comp_mask; - uint32_t qp_num; - struct ibv_xrcd *xrcd; - void *qp_context; - enum ibv_qp_type qp_type; -}; - -enum ibv_qp_attr_mask { - IBV_QP_STATE = 1 << 0, - IBV_QP_CUR_STATE = 1 << 1, - IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, - IBV_QP_ACCESS_FLAGS = 1 << 3, - IBV_QP_PKEY_INDEX = 1 << 4, - IBV_QP_PORT = 1 << 5, - IBV_QP_QKEY = 1 << 6, - IBV_QP_AV = 1 << 7, - IBV_QP_PATH_MTU = 1 << 8, - IBV_QP_TIMEOUT = 1 << 9, - IBV_QP_RETRY_CNT = 1 << 10, - IBV_QP_RNR_RETRY = 1 << 11, - IBV_QP_RQ_PSN = 1 << 12, - IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, - IBV_QP_ALT_PATH = 1 << 14, - IBV_QP_MIN_RNR_TIMER = 1 << 15, - IBV_QP_SQ_PSN = 1 << 16, - IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, - IBV_QP_PATH_MIG_STATE = 1 << 18, - IBV_QP_CAP = 1 << 19, - IBV_QP_DEST_QPN = 1 << 20 -}; - -enum ibv_qp_state { - IBV_QPS_RESET, - IBV_QPS_INIT, - IBV_QPS_RTR, - IBV_QPS_RTS, - IBV_QPS_SQD, - IBV_QPS_SQE, - IBV_QPS_ERR, - IBV_QPS_UNKNOWN -}; - -enum ibv_mig_state { - IBV_MIG_MIGRATED, - IBV_MIG_REARM, - IBV_MIG_ARMED -}; - -struct ibv_qp_attr { - enum ibv_qp_state qp_state; - enum ibv_qp_state cur_qp_state; - enum ibv_mtu 
path_mtu; - enum ibv_mig_state path_mig_state; - uint32_t qkey; - uint32_t rq_psn; - uint32_t sq_psn; - uint32_t dest_qp_num; - int qp_access_flags; - struct ibv_qp_cap cap; - struct ibv_ah_attr ah_attr; - struct ibv_ah_attr alt_ah_attr; - uint16_t pkey_index; - uint16_t alt_pkey_index; - uint8_t en_sqd_async_notify; - uint8_t sq_draining; - uint8_t max_rd_atomic; - uint8_t max_dest_rd_atomic; - uint8_t min_rnr_timer; - uint8_t port_num; - uint8_t timeout; - uint8_t retry_cnt; - uint8_t rnr_retry; - uint8_t alt_port_num; - uint8_t alt_timeout; -}; - -enum ibv_wr_opcode { - IBV_WR_RDMA_WRITE, - IBV_WR_RDMA_WRITE_WITH_IMM, - IBV_WR_SEND, - IBV_WR_SEND_WITH_IMM, - IBV_WR_RDMA_READ, - IBV_WR_ATOMIC_CMP_AND_SWP, - IBV_WR_ATOMIC_FETCH_AND_ADD -}; - -enum ibv_send_flags { - IBV_SEND_FENCE = 1 << 0, - IBV_SEND_SIGNALED = 1 << 1, - IBV_SEND_SOLICITED = 1 << 2, - IBV_SEND_INLINE = 1 << 3 -}; - -struct ibv_sge { - uint64_t addr; - uint32_t length; - uint32_t lkey; -}; - -struct ibv_send_wr { - uint64_t wr_id; - struct ibv_send_wr *next; - struct ibv_sge *sg_list; - int num_sge; - enum ibv_wr_opcode opcode; - int send_flags; - uint32_t imm_data; /* in network byte order */ - union { - struct { - uint64_t remote_addr; - uint32_t rkey; - } rdma; - struct { - uint64_t remote_addr; - uint64_t compare_add; - uint64_t swap; - uint32_t rkey; - } atomic; - struct { - struct ibv_ah *ah; - uint32_t remote_qpn; - uint32_t remote_qkey; - } ud; - } wr; - union { - union { - struct { - uint32_t remote_srqn; - } xrc; - } qp_type; - - uint32_t xrc_remote_srq_num; - }; -}; - -struct ibv_recv_wr { - uint64_t wr_id; - struct ibv_recv_wr *next; - struct ibv_sge *sg_list; - int num_sge; -}; - -struct ibv_mw_bind { - uint64_t wr_id; - struct ibv_mr *mr; - void *addr; - size_t length; - int send_flags; - int mw_access_flags; -}; - -struct ibv_srq { - struct ibv_context *context; - void *srq_context; - struct ibv_pd *pd; - uint32_t handle; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; - - /* below are for source compatabilty with legacy XRC, - * padding based on ibv_srq_legacy. 
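The ibv_qp_attr, ibv_qp_attr_mask and ibv_qp_state definitions above are what drive the usual RESET -> INIT -> RTR -> RTS bring-up of an RC queue pair. The condensed sketch below is written against the stock libibverbs API (which these declarations mirror); ibv_modify_qp() is not declared in this header, and the port, PSN, MTU and timeout values are placeholders rather than values NCCL necessarily uses:

#include <infiniband/verbs.h>
#include <string.h>

// Hedged sketch of the conventional RC QP state transitions.
static int qpBringUp(struct ibv_qp* qp, uint32_t destQpn, uint16_t destLid) {
  struct ibv_qp_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_INIT;
  attr.pkey_index = 0;
  attr.port_num = 1;
  attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  if (ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) return -1;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTR;
  attr.path_mtu = IBV_MTU_4096;
  attr.dest_qp_num = destQpn;
  attr.rq_psn = 0;
  attr.max_dest_rd_atomic = 1;
  attr.min_rnr_timer = 12;
  attr.ah_attr.dlid = destLid;
  attr.ah_attr.port_num = 1;
  if (ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
      IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) return -1;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTS;
  attr.timeout = 14;
  attr.retry_cnt = 7;
  attr.rnr_retry = 7;
  attr.sq_psn = 0;
  attr.max_rd_atomic = 1;
  return ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
      IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
}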
- */ - uint32_t xrc_srq_num_bin_compat_padding; - struct ibv_xrc_domain *xrc_domain_bin_compat_padding; - struct ibv_cq *xrc_cq_bin_compat_padding; - void *ibv_srq_padding; - - /* legacy fields */ - uint32_t xrc_srq_num; - struct ibv_xrc_domain *xrc_domain; - struct ibv_cq *xrc_cq; -}; - -/* Not in use in new API, needed for compilation as part of source compat layer */ -enum ibv_event_flags { - IBV_XRC_QP_EVENT_FLAG = 0x80000000, -}; - - - -struct ibv_qp { - struct ibv_context *context; - void *qp_context; - struct ibv_pd *pd; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - uint32_t handle; - uint32_t qp_num; - enum ibv_qp_state state; - enum ibv_qp_type qp_type; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; -}; - -struct ibv_comp_channel { - struct ibv_context *context; - int fd; - int refcnt; -}; - -struct ibv_cq { - struct ibv_context *context; - struct ibv_comp_channel *channel; - void *cq_context; - uint32_t handle; - int cqe; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t comp_events_completed; - uint32_t async_events_completed; -}; - -struct ibv_ah { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t handle; -}; - -enum ibv_flow_flags { - IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, - IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, -}; - -enum ibv_flow_attr_type { - /* steering according to rule specifications */ - IBV_FLOW_ATTR_NORMAL = 0x0, - /* default unicast and multicast rule - - * receive all Eth traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, - /* default multicast rule - - * receive all Eth multicast traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_MC_DEFAULT = 0x2, -}; - -enum ibv_flow_spec_type { - IBV_FLOW_SPEC_ETH = 0x20, - IBV_FLOW_SPEC_IPV4 = 0x30, - IBV_FLOW_SPEC_TCP = 0x40, - IBV_FLOW_SPEC_UDP = 0x41, -}; - -struct ibv_flow_eth_filter { - uint8_t dst_mac[6]; - uint8_t src_mac[6]; - uint16_t ether_type; - /* - * same layout as 802.1q: prio 3, cfi 1, vlan id 12 - */ - uint16_t vlan_tag; -}; - -struct ibv_flow_spec_eth { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_eth_filter val; - struct ibv_flow_eth_filter mask; -}; - -struct ibv_flow_ipv4_filter { - uint32_t src_ip; - uint32_t dst_ip; -}; - -struct ibv_flow_spec_ipv4 { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_ipv4_filter val; - struct ibv_flow_ipv4_filter mask; -}; - -struct ibv_flow_tcp_udp_filter { - uint16_t dst_port; - uint16_t src_port; -}; - -struct ibv_flow_spec_tcp_udp { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_tcp_udp_filter val; - struct ibv_flow_tcp_udp_filter mask; -}; - -struct ibv_flow_spec { - union { - struct { - enum ibv_flow_spec_type type; - uint16_t size; - } hdr; - struct ibv_flow_spec_eth eth; - struct ibv_flow_spec_ipv4 ipv4; - struct ibv_flow_spec_tcp_udp tcp_udp; - }; -}; - -struct ibv_flow_attr { - uint32_t comp_mask; - enum ibv_flow_attr_type type; - uint16_t size; - uint16_t priority; - uint8_t num_of_specs; - uint8_t port; - uint32_t flags; - /* Following are the optional layers according to user request - * struct ibv_flow_spec_xxx [L2] - * struct ibv_flow_spec_yyy [L3/L4] - */ -}; - -struct ibv_flow { - uint32_t comp_mask; - struct ibv_context *context; - uint32_t handle; -}; - -struct ibv_device; -struct ibv_context; - -struct ibv_device_ops { - struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); - void (*free_context)(struct ibv_context *context); -}; - -enum { - 
IBV_SYSFS_NAME_MAX = 64, - IBV_SYSFS_PATH_MAX = 256 -}; - -struct ibv_device { - struct ibv_device_ops ops; - enum ibv_node_type node_type; - enum ibv_transport_type transport_type; - /* Name of underlying kernel IB device, eg "mthca0" */ - char name[IBV_SYSFS_NAME_MAX]; - /* Name of uverbs device, eg "uverbs0" */ - char dev_name[IBV_SYSFS_NAME_MAX]; - /* Path to infiniband_verbs class device in sysfs */ - char dev_path[IBV_SYSFS_PATH_MAX]; - /* Path to infiniband class device in sysfs */ - char ibdev_path[IBV_SYSFS_PATH_MAX]; -}; - -struct verbs_device { - struct ibv_device device; /* Must be first */ - size_t sz; - size_t size_of_context; - int (*init_context)(struct verbs_device *device, - struct ibv_context *ctx, int cmd_fd); - void (*uninit_context)(struct verbs_device *device, - struct ibv_context *ctx); - /* future fields added here */ -}; - -struct ibv_context_ops { - int (*query_device)(struct ibv_context *context, - struct ibv_device_attr *device_attr); - int (*query_port)(struct ibv_context *context, uint8_t port_num, - struct ibv_port_attr *port_attr); - struct ibv_pd * (*alloc_pd)(struct ibv_context *context); - int (*dealloc_pd)(struct ibv_pd *pd); - struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, - int access); - struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, - int flags, - struct ibv_pd *pd, void *addr, - size_t length, - int access); - int (*dereg_mr)(struct ibv_mr *mr); - struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); - int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, - struct ibv_mw_bind *mw_bind); - int (*dealloc_mw)(struct ibv_mw *mw); - struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector); - int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); - int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); - void (*cq_event)(struct ibv_cq *cq); - int (*resize_cq)(struct ibv_cq *cq, int cqe); - int (*destroy_cq)(struct ibv_cq *cq); - struct ibv_srq * (*create_srq)(struct ibv_pd *pd, - struct ibv_srq_init_attr *srq_init_attr); - int (*modify_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr, - int srq_attr_mask); - int (*query_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr); - int (*destroy_srq)(struct ibv_srq *srq); - int (*post_srq_recv)(struct ibv_srq *srq, - struct ibv_recv_wr *recv_wr, - struct ibv_recv_wr **bad_recv_wr); - struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); - int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr); - int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask); - int (*destroy_qp)(struct ibv_qp *qp); - int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); - int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad_wr); - struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); - int (*destroy_ah)(struct ibv_ah *ah); - int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - void (*async_event)(struct ibv_async_event *event); -}; - -struct ibv_context { - struct ibv_device *device; - struct ibv_context_ops ops; - int cmd_fd; - int async_fd; - int num_comp_vectors; - pthread_mutex_t mutex; - void *abi_compat; -}; - -enum verbs_context_mask { - VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, 
- VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, - VERBS_CONTEXT_QP = (uint64_t)1 << 2, - VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, - VERBS_CONTEXT_EXP = (uint64_t)1 << 62 -}; - -struct verbs_context { - /* "grows up" - new fields go here */ - int (*_reserved_2) (void); - int (*destroy_flow) (struct ibv_flow *flow); - int (*_reserved_1) (void); - struct ibv_flow * (*create_flow) (struct ibv_qp *qp, - struct ibv_flow_attr *flow_attr); - struct ibv_qp * (*open_qp)(struct ibv_context *context, - struct ibv_qp_open_attr *attr); - struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, - struct ibv_qp_init_attr_ex *qp_init_attr_ex); - int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); - struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, - struct ibv_srq_init_attr_ex *srq_init_attr_ex); - struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, - struct ibv_xrcd_init_attr *xrcd_init_attr); - int (*close_xrcd)(struct ibv_xrcd *xrcd); - uint64_t has_comp_mask; - size_t sz; /* Must be immediately before struct ibv_context */ - struct ibv_context context;/* Must be last field in the struct */ -}; - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) -{ - return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? - NULL : container_of(ctx, struct verbs_context, context); -} - -#define verbs_get_ctx_op(ctx, op) ({ \ - struct verbs_context *_vctx = verbs_get_ctx(ctx); \ - (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ - !_vctx->op) ? NULL : _vctx; })*/ - -#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ - struct verbs_context *vctx = _vctx; \ - if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ - vctx->op = ptr; }) - -static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) -{ - return (dev->ops.alloc_context) ? - NULL : container_of(dev, struct verbs_device, device); -} - -static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - return qp->context->ops.post_send(qp, wr, bad_wr); -} - -struct ibv_ece { - /* - * Unique identifier of the provider vendor on the network. - * The providers will set IEEE OUI here to distinguish - * itself in non-homogenius network. - */ - uint32_t vendor_id; - /* - * Provider specific attributes which are supported or - * needed to be enabled by ECE users. 
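The inline ibv_post_send() defined above simply forwards to the context's ops.post_send entry. A hedged sketch of building a one-element RDMA write for it, using the ibv_sge/ibv_send_wr layouts from this header; the local buffer, keys and remote address are placeholders that would normally come from memory registration and an out-of-band exchange:

#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

// Illustrative only: posts one signaled RDMA_WRITE work request.
static int postRdmaWrite(struct ibv_qp* qp, void* localBuf, uint32_t len,
                         uint32_t lkey, uint64_t remoteAddr, uint32_t rkey) {
  struct ibv_sge sge;
  struct ibv_send_wr wr, *badWr = NULL;

  memset(&sge, 0, sizeof(sge));
  sge.addr = (uint64_t)(uintptr_t)localBuf;
  sge.length = len;
  sge.lkey = lkey;

  memset(&wr, 0, sizeof(wr));
  wr.wr_id = 1;                      // echoed back in the completion's wr_id
  wr.sg_list = &sge;
  wr.num_sge = 1;
  wr.opcode = IBV_WR_RDMA_WRITE;
  wr.send_flags = IBV_SEND_SIGNALED; // request a completion on the send CQ
  wr.wr.rdma.remote_addr = remoteAddr;
  wr.wr.rdma.rkey = rkey;

  // Dispatches through qp->context->ops.post_send, as in the inline above.
  return ibv_post_send(qp, &wr, &badWr);
}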
- */ - uint32_t options; - uint32_t comp_mask; -}; - -#endif // NCCL_IBV_CORE_H_ diff --git a/nvls/ibvsymbols.h b/nvls/ibvsymbols.h deleted file mode 100644 index 906b0df74..000000000 --- a/nvls/ibvsymbols.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef NCCL_IBV_SYMBOLS_H_ -#define NCCL_IBV_SYMBOLS_H_ - -#ifdef NCCL_BUILD_RDMA_CORE -#include -#else -#include "ibvcore.h" -#endif - -#include "nccl.h" - -/* IB Verbs Function Pointers*/ -struct ncclIbvSymbols { - int (*ibv_internal_fork_init)(void); - struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); - void (*ibv_internal_free_device_list)(struct ibv_device **list); - const char * (*ibv_internal_get_device_name)(struct ibv_device *device); - struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); - int (*ibv_internal_close_device)(struct ibv_context *context); - int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); - void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); - int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); - int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); - int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); - int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); - struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); - int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); - struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); - struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); - /* DMA-BUF support */ - struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); - int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); - struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); - int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); - struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); - int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); - int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); - const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); - int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); - int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); -}; - -/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ -ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); - -#endif // NCCL_IBV_SYMBOLS_H_ diff --git a/nvls/ibvwrap.h b/nvls/ibvwrap.h deleted file mode 100644 index c3709584c..000000000 --- a/nvls/ibvwrap.h +++ /dev/null @@ -1,92 +0,0 @@ -/************************************************************************* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. - * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_IBVWRAP_H_ -#define NCCL_IBVWRAP_H_ - -#ifdef NCCL_BUILD_RDMA_CORE -#include -#else -#include "ibvcore.h" -#endif - -#include "core.h" -#include -#include - -typedef enum ibv_return_enum -{ - IBV_SUCCESS = 0, //!< The operation was successful -} ibv_return_t; - -ncclResult_t wrap_ibv_symbols(void); -/* NCCL wrappers of IB verbs functions */ -ncclResult_t wrap_ibv_fork_init(void); -ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); -ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); -const char *wrap_ibv_get_device_name(struct ibv_device *device); -ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); -ncclResult_t wrap_ibv_close_device(struct ibv_context *context); -ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); -ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); -ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); -ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); -ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); -ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); -ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); -ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); -ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); -struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); -ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); -/* DMA-BUF support */ -ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); -ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); -ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); -ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); -ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); -static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { - int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ - if (done < 0) { - WARN("Call to ibv_poll_cq() returned %d", done); - return ncclSystemError; - } - *num_done = done; - return ncclSuccess; -} -ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); -ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); -ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); -ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); -ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, 
struct ibv_ece *ece, int* supported); - -static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - if (ret != IBV_SUCCESS) { - WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); - return ncclSystemError; - } - return ncclSuccess; -} - -static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { - int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - if (ret != IBV_SUCCESS) { - WARN("ibv_post_recv() failed with error %s", strerror(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); - -#endif //End include guard diff --git a/nvls/info.h b/nvls/info.h deleted file mode 100644 index f65ed2e69..000000000 --- a/nvls/info.h +++ /dev/null @@ -1,134 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INFO_H_ -#define NCCL_INFO_H_ - -#include "nccl.h" -#include "device.h" -#include "collectives.h" -#include "core.h" -#include "utils.h" -#include "strongstream.h" - -typedef enum : uint8_t { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown, - ncclPatternCollnetChain, - ncclPatternCollnetDirect, - ncclPatternNvls, - ncclPatternNvlsTree, - ncclPatternSend, - ncclPatternRecv -} ncclPattern_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclFunc_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; // peer for p2p operations - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclDevRedOpFull opFull; - int algorithm; - int protocol; - ncclPattern_t pattern; - int nChannels; - int nThreads; - size_t nBytes; - size_t sendbuffSize; - size_t recvbuffSize; - int nstepsPerLoop; - int nchunksPerLoop; - int chunkSize; - int channelId; -}; - -inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { - info->nBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->nBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank - - /* compute buffer size for NVLS buffer registration */ - if (info->coll == ncclFuncAllGather) { - info->sendbuffSize = info->count * ncclTypeSize(info->datatype); - info->recvbuffSize = info->sendbuffSize * nRanks; - } else if (info->coll == ncclFuncReduceScatter) { - info->recvbuffSize = info->count * ncclTypeSize(info->datatype); - info->sendbuffSize = info->recvbuffSize * nRanks; - } else { - info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype); - } - return ncclSuccess; -} - 
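As a worked example of ncclInfoSetDerived() above: an AllGather of 1<<20 float elements per rank (4 MiB) over nRanks = 8 is first rewritten as 1<<22 ncclInt8 elements, so nBytes becomes 4 MiB * 8 = 32 MiB, with sendbuffSize = 4 MiB and recvbuffSize = 32 MiB; a ReduceScatter with the same parameters mirrors this, giving sendbuffSize = 32 MiB and recvbuffSize = 4 MiB.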
-struct ncclTaskColl { - struct ncclTaskColl* next; - ncclFunc_t func; - void const* sendbuff; - void* recvbuff; - size_t count; - int root; - ncclDataType_t datatype; - ncclDevRedOpFull op; - int chunkSteps, sliceSteps; -}; -struct ncclTaskP2p { - ncclTaskP2p *next; - void *buff; - size_t bytes; - // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track - // of where it left off. - int chunk; -}; - -struct ncclCudaStreamList { - struct ncclCudaStreamList *next; - cudaStream_t stream; -}; -struct ncclTasks { - struct Peer { - bool sendSeen, recvSeen; - struct ncclIntruQueue sendQueue; - struct ncclIntruQueue recvQueue; - }; - struct ncclIntruQueue collQueue; - size_t collBytesTotal; - struct Peer* peers/*[nRanks]*/; - int *p2pSendOrder, *p2pRecvOrder; - int p2pOrderSteps; - int nTasksColl, nTasksP2p; - - // The list of user streams aggregated over all tasks present. - struct ncclCudaStreamList* streams; - // The most recent user stream. Ignored if streams==nullptr - cudaStream_t streamRecent; - // The graph capturing all user streams or invalid if none. Thus we restrict the - // user that all streams must be captured in the same graph or not captured - // at all. Technically we could probably relax this, but that would mean - // collecting a different `ncclTasks` per graph and one for non-graph. - struct ncclCudaGraph capturingGraph; -}; - -#endif diff --git a/nvls/ipcsocket.cc b/nvls/ipcsocket.cc deleted file mode 100644 index 9d66ac719..000000000 --- a/nvls/ipcsocket.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. - * - * See COPYRIGHT for license information - */ - -#include "ipcsocket.h" -#include "utils.h" -#include -#include -#include - -// Enable Linux abstract socket naming -#define USE_ABSTRACT_SOCKET - -#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" - -/* - * Create a Unix Domain Socket - */ -ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { - int fd = -1; - struct sockaddr_un cliaddr; - char temp[NCCL_IPC_SOCKNAME_LEN] = ""; - - if (handle == NULL) { - return ncclInternalError; - } - - handle->fd = -1; - handle->socketName[0] = '\0'; - if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { - WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); - return ncclSystemError; - } - - bzero(&cliaddr, sizeof(cliaddr)); - cliaddr.sun_family = AF_UNIX; - - // Create unique name for the socket. - int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); - if (len > (sizeof(cliaddr.sun_path) - 1)) { - WARN("UDS: Cannot bind provided name to socket. 
Name too large"); - return ncclInternalError; - } -#ifndef USE_ABSTRACT_SOCKET - unlink(temp); -#endif - - TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); - - strncpy(cliaddr.sun_path, temp, len); -#ifdef USE_ABSTRACT_SOCKET - cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick -#endif - if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { - WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); - close(fd); - return ncclSystemError; - } - - handle->fd = fd; - strcpy(handle->socketName, temp); - - handle->abortFlag = abortFlag; - // Mark socket as non-blocking - if (handle->abortFlag) { - int flags; - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { - if (handle == NULL) { - WARN("ncclSocketGetFd: pass NULL socket"); - return ncclInvalidArgument; - } - if (fd) *fd = handle->fd; - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { - if (handle == NULL) { - return ncclInternalError; - } - if (handle->fd <= 0) { - return ncclSuccess; - } -#ifndef USE_ABSTRACT_SOCKET - if (handle->socketName[0] != '\0') { - unlink(handle->socketName); - } -#endif - close(handle->fd); - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { - struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; - struct iovec iov[1]; - - // Union to guarantee alignment requirements for control array - union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; - } control_un; - - struct cmsghdr *cmptr; - char dummy_buffer[1]; - int ret; - - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - if (hdr == NULL) { - iov[0].iov_base = (void *)dummy_buffer; - iov[0].iov_len = sizeof(dummy_buffer); - } else { - iov[0].iov_base = hdr; - iov[0].iov_len = hdrLen; - } - - msg.msg_iov = iov; - msg.msg_iovlen = 1; - - while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - WARN("UDS: Receiving data over socket failed : %d", errno); - return ncclSystemError; - } - if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; - } - - if (recvFd != NULL) { - if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { - if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { - WARN("UDS: Receiving data over socket failed"); - return ncclSystemError; - } - - memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); - } else { - WARN("UDS: Receiving data over socket %s failed", handle->socketName); - return ncclSystemError; - } - TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { - return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); -} - -ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { - struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; - struct iovec iov[1]; - char temp[NCCL_IPC_SOCKNAME_LEN]; - - union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; - } control_un; - - struct cmsghdr *cmptr; - char dummy_buffer[1]; - struct sockaddr_un cliaddr; - - // Construct client address to send this shareable handle to - bzero(&cliaddr, sizeof(cliaddr)); - 
cliaddr.sun_family = AF_UNIX; - - int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); - if (len > (sizeof(cliaddr.sun_path) - 1)) { - WARN("UDS: Cannot connect to provided name for socket. Name too large"); - return ncclInternalError; - } - (void) strncpy(cliaddr.sun_path, temp, len); - -#ifdef USE_ABSTRACT_SOCKET - cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick -#endif - - TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); - - if (sendFd != -1) { - TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); - - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); - } - - msg.msg_name = (void *)&cliaddr; - msg.msg_namelen = sizeof(struct sockaddr_un); - - if (hdr == NULL) { - iov[0].iov_base = (void *)dummy_buffer; - iov[0].iov_len = sizeof(dummy_buffer); - } else { - iov[0].iov_base = hdr; - iov[0].iov_len = hdrLen; - } - msg.msg_iov = iov; - msg.msg_iovlen = 1; - msg.msg_flags = 0; - - ssize_t sendResult; - while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); - return ncclSystemError; - } - if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { - return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); -} diff --git a/nvls/ipcsocket.h b/nvls/ipcsocket.h deleted file mode 100644 index ccecde84c..000000000 --- a/nvls/ipcsocket.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. - * - * See COPYRIGHT for license information - */ - -#ifndef NCCL_IPCSOCKET_H -#define NCCL_IPCSOCKET_H - -#include "nccl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NCCL_IPC_SOCKNAME_LEN 64 - -struct ncclIpcSocket { - int fd; - char socketName[NCCL_IPC_SOCKNAME_LEN]; - volatile uint32_t* abortFlag; -}; - -ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); -ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); -ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); - -ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); -ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); - -#endif /* NCCL_IPCSOCKET_H */ diff --git a/nvls/nccl_common.h b/nvls/nccl_common.h deleted file mode 100644 index a37ac203e..000000000 --- a/nvls/nccl_common.h +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
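The ipcsocket helpers above wrap SCM_RIGHTS descriptor passing over an (optionally abstract) Unix domain socket. A hedged usage sketch, assuming the NCCLCHECK error-checking macro from checks.h and that both processes agree on the same hash so the generated socket names match:

// Hypothetical two-process helper: rank 0 donates 'fdToShare', the peer receives a duplicate.
ncclResult_t shareFd(int myRank, int peerRank, uint64_t hash, int fdToShare, int* fdReceived) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, hash, /*abortFlag=*/NULL)); // NULL keeps blocking behaviour
  if (myRank == 0) {
    NCCLCHECK(ncclIpcSocketSendFd(&sock, fdToShare, peerRank, hash));    // fd travels via SCM_RIGHTS
  } else {
    NCCLCHECK(ncclIpcSocketRecvFd(&sock, fdReceived));                   // kernel installs a new fd here
  }
  return ncclIpcSocketClose(&sock);
}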
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_DEBUG_H_ -#define NCCL_DEBUG_H_ - -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - -#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; - -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* -#define NCCL_ALGO_UNDEF -1 -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET_DIRECT 2 -#define NCCL_ALGO_COLLNET_CHAIN 3 -#define NCCL_ALGO_NVLS 4 -#define NCCL_ALGO_NVLS_TREE 5 - -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_UNDEF -1 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 - -#endif diff --git a/nvls/nccl_net.h b/nvls/nccl_net.h deleted file mode 100644 index 9b3e6719f..000000000 --- a/nvls/nccl_net.h +++ /dev/null @@ -1,333 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef ncclNetProperties_v7_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef ncclNet_v7_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7 - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7 - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. 
-} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. 
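Note the connect()/accept() contract spelled out above: a successful return with a NULL comm only means "not ready yet", so the caller is expected to retry. A hedged sketch of that polling pattern against the v7 interface, again assuming the NCCLCHECK macro and that 'handle' was produced by the peer's listen() and exchanged out of band:

// Illustrative caller-side loop; 'net' is a loaded ncclNet_v7_t plugin.
static ncclResult_t connectBlocking(ncclNet_v7_t* net, int dev, void* handle, void** sendComm) {
  ncclNetDeviceHandle_v7_t* devHandle = NULL;  // no device offload requested
  *sendComm = NULL;
  while (*sendComm == NULL) {
    // A success with *sendComm == NULL means the connection is not established yet.
    NCCLCHECK(net->connect(dev, handle, sendComm, &devHandle));
  }
  return ncclSuccess;
}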
- ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -typedef ncclCollNet_v7_t ncclCollNet_t; - -// v6 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. 
- // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/nvls/nccl_tuner.h b/nvls/nccl_tuner.h deleted file mode 100644 index b4a696e38..000000000 --- a/nvls/nccl_tuner.h +++ /dev/null @@ -1,55 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // nNodes: number of nodes in current communicator. 
- // logFunction: a logFunction can be useful to integrate logging together with NCCL core. - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - collType: collective type, e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this type - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - ncclResult_t (*destroy)(); -} ncclTuner_v1_t; - -typedef ncclTuner_v1_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" - -#endif diff --git a/nvls/net.h b/nvls/net.h deleted file mode 100644 index b5df58968..000000000 --- a/nvls/net.h +++ /dev/null @@ -1,27 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_NET_H_ -#define NCCL_INT_NET_H_ - -#include "nccl.h" -#include "nccl_net.h" -#include "comm.h" -#include "checks.h" - -typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; - -ncclResult_t ncclNetPluginInit(); -ncclResult_t ncclNetInit(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); - -// Test whether the current GPU supports GPU Direct RDMA. -ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); - -extern ncclNet_t ncclNetIb; -extern ncclNet_t ncclNetSocket; - -#endif diff --git a/nvls/net_device.h b/nvls/net_device.h deleted file mode 100644 index 8f7c0d6e1..000000000 --- a/nvls/net_device.h +++ /dev/null @@ -1,29 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_DEVICE_H_ -#define NCCL_NET_DEVICE_H_ - -#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 -#define NCCL_NET_MTU_SIZE 4096 - -// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin -// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
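For illustration only, a minimal external tuner implementing the ncclTuner_v1_t interface above could look like the sketch below. It caps nChannels for small allreduces and deliberately leaves algorithm/protocol unset so NCCL falls back to its defaults, as the comments permit; the exported symbol must match NCCL_TUNER_PLUGIN_SYMBOL. The ncclFuncAllReduce enumerator is assumed to come from nccl_common.h, and the tuner name and size threshold are made up.

static ncclResult_t exampleTunerInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
  (void)nRanks; (void)nNodes; (void)logFunction;
  return ncclSuccess;
}
static ncclResult_t exampleTunerGetCollInfo(ncclFunc_t collType, size_t nBytes,
    int collNetSupport, int nvlsSupport, int numPipeOps,
    int* algorithm, int* protocol, int* nChannels) {
  (void)collNetSupport; (void)nvlsSupport; (void)numPipeOps; (void)algorithm; (void)protocol;
  // Use fewer channels (SMs) for small allreduces; everything else keeps NCCL's defaults.
  if (collType == ncclFuncAllReduce && nBytes < (1<<20)) *nChannels = 4;
  return ncclSuccess;
}
static ncclResult_t exampleTunerDestroy() { return ncclSuccess; }

extern "C" ncclTuner_v1_t ncclTunerPlugin_v1 = {
  "example-tuner", exampleTunerInit, exampleTunerGetCollInfo, exampleTunerDestroy
};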
-#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 - -typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; - -typedef struct { - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - void* handle; - size_t size; - int needsProxyProgress; -} ncclNetDeviceHandle_v7_t; - -typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; - -#endif diff --git a/nvls/nvmlwrap.h b/nvls/nvmlwrap.h deleted file mode 100644 index 2ab8e3a2b..000000000 --- a/nvls/nvmlwrap.h +++ /dev/null @@ -1,214 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVMLWRAP_H_ -#define NCCL_NVMLWRAP_H_ - -#include "nccl.h" - -//#define NCCL_NVML_DIRECT 1 -#ifndef NCCL_NVML_DIRECT -#define NCCL_NVML_DIRECT 0 -#endif - -#if NCCL_NVML_DIRECT -#include "nvml.h" -#else -// Dynamically handle dependencies on NVML - -/* Extracted from nvml.h */ -typedef struct nvmlDevice_st* nvmlDevice_t; -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 - -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -typedef enum nvmlReturn_enum -{ - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 
18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred -} nvmlReturn_t; - -typedef struct nvmlPciInfo_st -{ - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - // NVIDIA reserved for internal use only - unsigned int reserved0; - unsigned int reserved1; - unsigned int reserved2; - unsigned int reserved3; -} nvmlPciInfo_t; - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -} nvmlGpuP2PCapsIndex_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Field Identifiers. - * - * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. - */ - -/* NVLink Speed */ -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links -#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device - -/** - * Remote device NVLink ID - * - * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. 
- */ -#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID - -/** - * NVSwitch: connected NVLink count - */ -#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch - -#define NVML_FI_DEV_NVLINK_GET_SPEED 164 -#define NVML_FI_DEV_NVLINK_GET_STATE 165 -#define NVML_FI_DEV_NVLINK_GET_VERSION 166 - -#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device -#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE -#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links - -#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above - -/** - * Information for a Field Value Sample - */ -typedef struct nvmlFieldValue_st -{ - unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. - unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. - long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 - long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. - nvmlValueType_t valueType; //!< Type of the value stored in value - nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS - nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS -} nvmlFieldValue_t; - -/* End of nvml.h */ -#endif // NCCL_NVML_DIRECT - -constexpr int ncclNvmlMaxDevices = 32; -struct ncclNvmlDeviceInfo { - nvmlDevice_t handle; - int computeCapabilityMajor, computeCapabilityMinor; -}; -struct ncclNvmlDevicePairInfo { - nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; -}; -extern int ncclNvmlDeviceCount; -extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; -extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; - -// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. -// Outsiders need only call it if they want to inspect the ncclNvml global -// tables above. 
-ncclResult_t ncclNvmlEnsureInitialized(); - -ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); -ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); -ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); -ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); -ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); -ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - -#endif // End include guard diff --git a/nvls/nvtx.h b/nvls/nvtx.h deleted file mode 100644 index ab32ef27f..000000000 --- a/nvls/nvtx.h +++ /dev/null @@ -1,85 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVTX_H_ -#define NCCL_NVTX_H_ - -#include "nvtx3/nvtx3.hpp" - -#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) -#define NVTX3_CONSTEXPR_IF_CPP14 constexpr -#else -#define NVTX3_CONSTEXPR_IF_CPP14 -#endif - -// Define all NCCL-provided static schema IDs here (avoid duplicates). -#define NVTX_SID_CommInitRank 0 -#define NVTX_SID_CommInitAll 1 -#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_AllGather 4 -#define NVTX_SID_AllReduce 5 -#define NVTX_SID_Broadcast 6 -#define NVTX_SID_ReduceScatter 7 -#define NVTX_SID_Reduce 8 -#define NVTX_SID_Send 9 -#define NVTX_SID_Recv 10 - -// Define static schema ID for the reduction operation. 
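As a small usage sketch of the NVML wrappers declared above (not part of the original header), the snippet below counts a GPU's active NVLink links by first querying NVML_FI_DEV_NVLINK_LINK_COUNT and then probing each link's state. NCCLCHECK is assumed from checks.h, and the value-type handling is simplified.

static ncclResult_t exampleCountActiveNvlinks(unsigned gpuIndex, unsigned* nActive) {
  nvmlDevice_t dev;
  NCCLCHECK(ncclNvmlDeviceGetHandleByIndex(gpuIndex, &dev));
  // Ask NVML how many NVLink links this device exposes.
  nvmlFieldValue_t fv;
  fv.fieldId = NVML_FI_DEV_NVLINK_LINK_COUNT;
  fv.scopeId = 0;
  NCCLCHECK(ncclNvmlDeviceGetFieldValues(dev, 1, &fv));
  unsigned nLinks = (fv.nvmlReturn == NVML_SUCCESS) ? fv.value.uiVal : 0;
  *nActive = 0;
  for (unsigned link = 0; link < nLinks; link++) {
    nvmlEnableState_t isActive;
    NCCLCHECK(ncclNvmlDeviceGetNvLinkState(dev, link, &isActive));
    if (isActive == NVML_FEATURE_ENABLED) (*nActive)++;
  }
  return ncclSuccess;
}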
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START - -extern const nvtxDomainHandle_t ncclNvtxDomainHandle; - -struct nccl_domain{static constexpr char const* name{"NCCL"};}; - -class payload_schema { - public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept - { - schema_attr.name = schemaName; - schema_attr.entries = entries; - schema_attr.numEntries = numEntries; - schema_attr.schemaId = schemaId; - nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); - } - - payload_schema() = delete; - ~payload_schema() = default; - payload_schema(payload_schema const&) = default; - payload_schema& operator=(payload_schema const&) = default; - payload_schema(payload_schema&&) = default; - payload_schema& operator=(payload_schema&&) = default; - - private: - nvtxPayloadSchemaAttr_t schema_attr{ - NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | - NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | - NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | - NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | - NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, - NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, - NVTX_PAYLOAD_SCHEMA_FLAG_NONE, - nullptr, 0, 0, 0}; -}; - -// Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ - ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; - -extern void initNvtxRegisteredEnums(); - -#endif diff --git a/nvls/p2p.h b/nvls/p2p.h deleted file mode 100644 index 6ffba4b0e..000000000 --- a/nvls/p2p.h +++ /dev/null @@ -1,29 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include - -#ifndef NCCL_P2P_H_ -#define NCCL_P2P_H_ - -#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - -typedef struct { - uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support -} ncclCuDesc; - -typedef union { - // Legacy CUDA IPC - cudaIpcMemHandle_t devIpc; - // cuMem API support - ncclCuDesc cuDesc; -} ncclIpcDesc; - -ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); -ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); -ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); - -#endif diff --git a/nvls/param.h b/nvls/param.h deleted file mode 100644 index 963da9d17..000000000 --- a/nvls/param.h +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
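The p2p.h declarations a little further up split buffer sharing into an export half and an import half; the hedged sketch below only shows where each call sits. How the ncclIpcDesc travels between ranks (bootstrap, proxy socket, ...) is out of scope here, and the function names are purely illustrative.

static ncclResult_t exampleExportBuffer(size_t size, ncclIpcDesc* desc, void** localPtr) {
  // Exporting rank: allocate device memory whose handle peers can map later.
  return ncclP2pAllocateShareableBuffer(size, desc, localPtr);
}

static ncclResult_t exampleImportBuffer(struct ncclComm* comm, int tpPeer, size_t size,
                                        ncclIpcDesc* desc, void** mappedPtr) {
  // Importing rank: `desc` is assumed to have been shipped over from the exporter;
  // map the remote allocation into this process. The exporter eventually calls
  // ncclP2pFreeShareableBuffer() on its descriptor.
  return ncclP2pImportShareableBuffer(comm, tpPeer, size, desc, mappedPtr);
}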
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PARAM_H_ -#define NCCL_PARAM_H_ - -#include - -const char* userHomeDir(); -void setEnvFile(const char* fileName); -void initEnv(); -const char *ncclGetEnv(const char *name); - -void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); - -#define NCCL_PARAM(name, env, deftVal) \ - int64_t ncclParam##name() { \ - constexpr int64_t uninitialized = INT64_MIN; \ - static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ - static int64_t cache = uninitialized; \ - if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ - ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ - } \ - return cache; \ - } - -#endif diff --git a/nvls/profiler.h b/nvls/profiler.h deleted file mode 100644 index 103af99ad..000000000 --- a/nvls/profiler.h +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include "proxy.h" - -enum ncclProxyProfileState { - ncclProxyProfileBegin = 0, - - ncclProxyProfileSendGPUWait = 1, - ncclProxyProfileSendWait = 2, - - ncclProxyProfileRecvWait = 1, - ncclProxyProfileRecvFlushWait = 2, - ncclProxyProfileRecvGPUWait = 3, - - ncclProxyProfileEnd = 4, - - ncclProxyProfileSleep = 8, - ncclProxyProfileWakeup = 9, - - ncclProxyProfileIdle = 16, - ncclProxyProfileActive = 17, - - ncclProxyProfileAppend = 24, - ncclProxyProfileAppendEnd = 25 -}; - -ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); -void ncclProfilingDump(); - -#endif diff --git a/nvls/proxy.h b/nvls/proxy.h deleted file mode 100644 index 8093c0ce6..000000000 --- a/nvls/proxy.h +++ /dev/null @@ -1,296 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
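To make the NCCL_PARAM macro above concrete: each expansion defines a cached accessor that reads one environment variable on first use. A hypothetical parameter (the name is made up purely for illustration) would be declared and consumed like this:

NCCL_PARAM(ExampleChunkSize, "EXAMPLE_CHUNK_SIZE", 1 << 20);  // reads NCCL_EXAMPLE_CHUNK_SIZE, default 1 MiB

static void exampleUseParam() {
  int64_t chunkSize = ncclParamExampleChunkSize();  // first call loads and caches the value
  (void)chunkSize;
}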
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROXY_H_ -#define NCCL_PROXY_H_ - -#include "device.h" -#include "info.h" -#include "socket.h" -#include "ipcsocket.h" -#include "nccl_net.h" -#include -#include "shm.h" -#include "p2p.h" - -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; - -struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); - -#define NCCL_PROXY_MAX_SUBS MAXCHANNELS -static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); - -struct ncclProxyOp { - struct ncclProxyConnection* connection; - int channelId; - int nsteps; - ssize_t nbytes; - int root; - int next; - - uint64_t opCount; - int sliceSteps; - int chunkSteps; - int chunkSize; - uint8_t /*ncclDataType_t*/ dtype; - uint8_t /*ncclDevRedOp_t*/ redOp; - uint8_t /*ncclPattern_t*/ pattern; - uint8_t protocol; - - union { - uint64_t unused; - // For use by enqueue.cc - struct ncclProxyOp *enqNext; - }; -}; -static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); - -struct ncclProxySubArgs { - struct ncclProxyConnection* connection; - int channelId; - int nsteps; - ssize_t nbytes; - int peer; - - int groupSize; // Number of consecutive sub operations sharing the same recvComm - uint64_t base; - uint64_t posted; - uint64_t received; - uint64_t flushed; - uint64_t transmitted; - uint64_t done; - uint64_t end; - void* requests[NCCL_STEPS]; - void* profilingEvents[NCCL_STEPS]; - void* recvRequestsCache[NCCL_STEPS]; - int recvRequestsSubCount; -}; - -struct ncclProxyArgs { - struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; - proxyProgressFunc_t progress; - int nsubs; - int done; - uint64_t opCount; - int sliceSteps; - int chunkSteps; - int chunkSize; - uint8_t /*ncclDataType_t*/ dtype; - uint8_t /*ncclDevRedOp_t*/ redOp; - uint8_t /*ncclPattern_t*/ pattern; - uint8_t protocol; - int state; - char* sharedBuff[NCCL_STEPS]; - int sharedSize[NCCL_STEPS]; - - int idle; - - // Element linking - struct ncclProxyArgs* next; - struct ncclProxyArgs* nextPeer; - struct ncclProxyArgs** proxyAppendPtr; -}; -#define NCCL_MAX_NETDEVS 128 - -// ProxyOps are used to communicate between main thread and service thread -// Make sure we have enough to store two full rounds of operations on all channels. -// Otherwise we'd be unable to post half of them to free new elements. 
-#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) -#define NCCL_MAX_LOCAL_RANKS 64 -struct ncclProxyOpsPool { - struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; - volatile int nextOps; - volatile int nextOpsEnd; - volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -struct ncclProxyOps { - ncclProxyOpsPool* pool; - ncclShmHandle_t handle; - int count; - int freeOp; - int nextOps; - int nextOpsEnd; -}; - -struct ncclProxySharedP2p { - int refcount; - int size; - char* cudaBuff; - char* hostBuff; - // CUDA IPC - ncclIpcDesc ipcDesc; - struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv -}; - -struct ncclProxyPeer { - struct ncclProxySharedP2p send; - struct ncclProxySharedP2p recv; -}; - -struct ncclSharedNetComms { - void* sendComm[MAXCHANNELS]; - void* recvComm[MAXCHANNELS]; - int sendRefCount[MAXCHANNELS]; - int recvRefCount[MAXCHANNELS]; -}; - -struct ncclProxyPool; -struct ncclProxyProgressState { - // Used by main threads to send work to progress thread - struct ncclProxyOpsPool* opsPool; - ncclShmHandle_t handle; - char opsPoolShmSuffix[6]; - - pthread_t thread; - volatile int stop; - struct ncclProxyPeer** localPeers; - struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; - struct ncclProxyArgs* active; - struct ncclProxyArgs* pool; - struct ncclProxyPool* pools; - int nextOps; -}; - -// Expected proxy response fifo -struct ncclExpectedProxyResponse { - void* opId; - int respSize; - bool done; - void* respBuff; - ncclResult_t res; - struct ncclExpectedProxyResponse* next; -}; - -struct ncclProxyAsyncOp { - int type; - struct ncclProxyConnection* connection; - int reqSize, respSize; - char *reqBuff, *respBuff; - void* opId; - ncclProxyAsyncOp* next; -}; - -struct ncclProxyLocalPeer { - struct ncclSocket sock; - int tpRank; - int tpLocalRank; - ncclProxyAsyncOp* asyncOps; - int asyncOpCounter; -}; - -// Common response header for all proxyOps -// We pack this into a struct to reduce the number of blocking send and recv calls -struct ncclProxyRpcResponseHeader { - void* opId; - ncclResult_t res; - int respSize; -}; - -struct ncclProxyState { - int refCount; - int tpRank; - int tpnRanks; - int tpLocalnRanks; - int cudaDev; - int p2pnChannels; - int p2pChunkSize; - int nChannels; - int buffSizes[NCCL_NUM_PROTOCOLS]; - bool allocP2pNetLLBuffers; - bool dmaBufSupport; - ncclNet_t* ncclNet; - ncclCollNet_t* ncclCollNet; - volatile uint32_t* abortFlag; - // Service thread - pthread_t thread; - struct ncclSocket* listenSock; - int stop; - CUcontext cudaCtx; - ncclResult_t asyncResult; - - // Used by main thread - union ncclSocketAddress* peerAddresses; - struct ncclSocket* peerSocks; - struct ncclProxyOps* proxyOps; - void** sharedDevMems; - struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) - - // Progress thread - struct ncclProxyProgressState progressState; - - // Queue of expected responses from the proxy - struct ncclExpectedProxyResponse* expectedResponses; -}; - -enum proxyConnectState { - connUninitialized = 0, - connInitialized = 1, - connSharedInitialized = 2, - connSetupDone = 3, - connConnected = 4, - numConnStates = 5 -}; - -struct ncclProxyConnection { - int send, transport, shared; - int tpLocalRank, sameProcess; - struct ncclSocket* sock; - struct ncclTransportComm* tcomm; - struct ncclProxyArgs *proxyAppend; - struct ncclProxyArgs **proxyAppendPtr; - void* transportResources; - ncclNetDeviceHandle_t* netDeviceHandle; - void* mhandles[NCCL_NUM_PROTOCOLS]; - 
proxyConnectState state; - struct ncclCollNetSharedRes* collNet; - int needsProxyProgress; -}; - -typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); - -enum proxyMode { - proxyRing = 0, - proxyFrom = 1, - proxyTo = 2 -}; - -ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); -ncclResult_t ncclProxyStart(struct ncclComm* comm); -ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); -ncclResult_t ncclProxyCreate(struct ncclComm* comm); -ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); -enum ncclProxyMsgType { - ncclProxyMsgInit = 1, - ncclProxyMsgSharedInit = 2, - ncclProxyMsgSetup = 3, - ncclProxyMsgConnect = 4, - ncclProxyMsgStart = 5, - ncclProxyMsgClose = 6, - ncclProxyMsgAbort = 7, - ncclProxyMsgStop = 8, - ncclProxyMsgGetFd = 9, // cuMem API support (UDS) -}; - -// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types -// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of -// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed -ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); - -// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received -ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); -ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); - -ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd); - -ncclResult_t ncclProxyStop(struct ncclComm* comm); -ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); -ncclResult_t ncclProxyDestroy(struct ncclComm* comm); -#endif diff --git a/nvls/shm.h b/nvls/shm.h deleted file mode 100644 index e75caa6a6..000000000 --- a/nvls/shm.h +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_SHM_H_ -#define NCCL_SHM_H_ - -#include "nccl.h" - -typedef void* ncclShmHandle_t; -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); -ncclResult_t ncclShmClose(ncclShmHandle_t handle); -ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); - -struct ncclShmemCollBuff { - volatile size_t *cnt[2]; - volatile void *ptr[2]; - int round; -}; - -ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); - -#endif diff --git a/nvls/socket.h b/nvls/socket.h deleted file mode 100644 index 9e5137289..000000000 --- a/nvls/socket.h +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. 
All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_SOCKET_H_ -#define NCCL_SOCKET_H_ - -#include "nccl.h" -#include -#include -#include -#include -#include -#include - -#define MAX_IFS 16 -#define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // connection retry sleep interval in usec -#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) -#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) -#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) -#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL - -/* Common socket address storage structure for IPv4/IPv6 */ -union ncclSocketAddress { - struct sockaddr sa; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; -}; - -enum ncclSocketState { - ncclSocketStateNone = 0, - ncclSocketStateInitialized = 1, - ncclSocketStateAccepting = 2, - ncclSocketStateAccepted = 3, - ncclSocketStateConnecting = 4, - ncclSocketStateConnectPolling = 5, - ncclSocketStateConnected = 6, - ncclSocketStateReady = 7, - ncclSocketStateClosed = 8, - ncclSocketStateError = 9, - ncclSocketStateNum = 10 -}; - -enum ncclSocketType { - ncclSocketTypeUnknown = 0, - ncclSocketTypeBootstrap = 1, - ncclSocketTypeProxy = 2, - ncclSocketTypeNetSocket = 3, - ncclSocketTypeNetIb = 4 -}; - -struct ncclSocket { - int fd; - int acceptFd; - int timedOutRetries; - int refusedRetries; - union ncclSocketAddress addr; - volatile uint32_t* abortFlag; - int asyncFlag; - enum ncclSocketState state; - int salen; - uint64_t magic; - enum ncclSocketType type; -}; - -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); -ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); - -// Initialize a socket -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); -// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call -ncclResult_t ncclSocketListen(struct ncclSocket* sock); -ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); -// Connect to sock->addr. sock->fd is set after a successful call. -ncclResult_t ncclSocketConnect(struct ncclSocket* sock); -// Return socket connection state. -ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); -// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
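Putting the socket helpers above together, a client-side connection in asynchronous mode might be driven roughly as below; the address and abort flag are assumed to come from the caller, NCCLCHECK is assumed from checks.h, and data exchange plus teardown (ncclSocketSend()/ncclSocketRecv()/ncclSocketClose(), declared just after this point) are left as comments.

static ncclResult_t exampleSocketConnect(union ncclSocketAddress* remoteAddr, volatile uint32_t* abortFlag) {
  struct ncclSocket sock;
  NCCLCHECK(ncclSocketInit(&sock, remoteAddr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap, abortFlag, /*asyncFlag=*/1));
  NCCLCHECK(ncclSocketConnect(&sock));                        // non-blocking in async mode
  int ready = 0;
  while (!ready) NCCLCHECK(ncclSocketReady(&sock, &ready));   // poll until the connection is usable
  // ... ncclSocketSend()/ncclSocketRecv() would move data here, then ncclSocketClose(&sock) ...
  return ncclSuccess;
}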
-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); -ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); -ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); - -#define NCCL_SOCKET_SEND 0 -#define NCCL_SOCKET_RECV 1 - -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); -ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); -ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); -ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); -ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); -ncclResult_t ncclSocketClose(struct ncclSocket* sock); -#endif diff --git a/nvls/strongstream.h b/nvls/strongstream.h deleted file mode 100644 index 0984dfe57..000000000 --- a/nvls/strongstream.h +++ /dev/null @@ -1,140 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_STRONGSTREAM_H_ -#define NCCL_STRONGSTREAM_H_ - -#include "nccl.h" -#include "checks.h" - -#include - -/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes - * easily. - */ -struct ncclCudaGraph { -#if CUDART_VERSION >= 11030 - cudaGraph_t graph; - unsigned long long graphId; -#endif -}; - -inline struct ncclCudaGraph ncclCudaGraphNone() { - struct ncclCudaGraph tmp; - #if CUDART_VERSION >= 11030 - tmp.graph = nullptr; - tmp.graphId = ULLONG_MAX; - #endif - return tmp; -} - -inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { - #if CUDART_VERSION >= 11030 - return graph.graph != nullptr; - #else - return false; - #endif -} - -inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { - #if CUDART_VERSION >= 11030 - return a.graphId == b.graphId; - #else - return true; - #endif -} - -ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); -ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); - -/* ncclStrongStream: An abstraction over CUDA streams that do not lose their - * identity while being captured. Regular streams have the deficiency that the - * captured form of a stream in one graph launch has no relation to the - * uncaptured stream or to the captured form in other graph launches. This makes - * streams unfit for the use of serializing access to a persistent resource. - * Strong streams have been introduced to address this need. - * - * - All updates to a strong stream must be enclosed by a Acquire/Release pair. - * - * - The Acquire, Release, and all updates take a ncclCudaGraph parameter - * indicating the currently capturing graph (or none). This parameter must be - * the same for the entire sequence of {Acquire; ...; Release}. - * - * - An {Acquire; ...; Release} sequence must not be concurrent with any - * other operations against the strong stream including graph launches which - * reference this stream. - */ -struct ncclStrongStream; - -ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); -ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); - -// Acquire-fence the strong stream. 
-ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss -); - -// Acquire-fence the strong stream assuming no graph is capturing. This permits -// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA -// calls. Strong stream still must be released via: -// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); - -// Release-fence of the strong stream. -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); - -// Add a host launch to the stream. -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - cudaHostFn_t fn, void* arg -); -// Add a kernel launch to the stream. -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes -); - -// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. -// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus -// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the -// implementation to induce few graph dependencies. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false -); -// `b` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false -); -// `a` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false -); - -// Synchrnoization does not need the strong stream to be acquired. -ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclStrongStreamGraph; // internal to ncclStrongStream - -struct ncclStrongStream { - // Used when not graph capturing. - cudaStream_t cudaStream; -#if CUDART_VERSION >= 11030 - // The event used to establish order between graphs and streams. During acquire - // this event is waited on, during release it is recorded to. - cudaEvent_t serialEvent; - // This stream ever appeared in a graph capture. - bool everCaptured; - // Tracks whether serialEvent needs to be recorded to upon Release(). - bool serialEventNeedsRecord; - struct ncclStrongStreamGraph* graphHead; -#else - cudaEvent_t scratchEvent; -#endif -}; - -#endif diff --git a/nvls/timer.h b/nvls/timer.h deleted file mode 100644 index 284fec6e0..000000000 --- a/nvls/timer.h +++ /dev/null @@ -1,60 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
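A condensed sketch of the Acquire/Release discipline described above, loosely modeled on how a collective launch might serialize against a user stream; it is illustrative only, assumes NCCLCHECK from checks.h, and omits the actual kernel/host launches.

static ncclResult_t exampleStrongStreamUse(struct ncclStrongStream* ss, cudaStream_t userStream) {
  struct ncclCudaGraph graph;
  NCCLCHECK(ncclCudaGetCapturingGraph(&graph, userStream));     // is the user stream capturing a graph?
  NCCLCHECK(ncclStrongStreamAcquire(graph, ss));                // begin the {Acquire; ...; Release} sequence
  NCCLCHECK(ncclStrongStreamWaitStream(graph, ss, userStream)); // order our work after the user's stream
  // ... enqueue work with ncclStrongStreamLaunchKernel()/ncclStrongStreamLaunchHost() ...
  NCCLCHECK(ncclStrongStreamWaitStream(graph, userStream, ss)); // hand results back to the user's stream
  NCCLCHECK(ncclStrongStreamRelease(graph, ss));
  return ncclSuccess;
}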
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TIMER_H_ -#define NCCL_TIMER_H_ -#if ENABLE_TIMER -#include -#include -#include -static double freq = -1; -static void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1E6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i=0; i<10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1E6 + tv.tv_usec; - freq = timeCycles/time; -} -static inline double gettime() { - if (freq == -1) calibrate(); - return __rdtsc()/freq; -} -static uint64_t counts[8]; -static double times[8]; -static double startTimes[8]; -#define TIME_START(index) do { \ - counts[index]++; \ - startTimes[index] = gettime(); \ -} while (0); - -#define TIME_STOP(index) do { \ - times[index] += gettime() - startTimes[index]; \ -} while (0); - -#define TIME_CANCEL(index) do { \ - counts[index]--; \ -} while (0); - -#define TIME_PRINT(name) do { \ - printf("%s stats", name); \ - for (int i=0; i<8; i++) { \ - if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ - counts[i] = 0; \ - } \ - printf("\n"); \ -} while (0); -#else -#define TIME_START(index) while(0); -#define TIME_STOP(index) while(0); -#define TIME_CANCEL(index) while(0); -#define TIME_PRINT(name) -#endif -#endif diff --git a/nvls/transport.h b/nvls/transport.h deleted file mode 100644 index 27529df5e..000000000 --- a/nvls/transport.h +++ /dev/null @@ -1,128 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TRANSPORT_H_ -#define NCCL_TRANSPORT_H_ - -#include "device.h" -#include "graph.h" -#include "nvmlwrap.h" -#include "core.h" - -#define NTRANSPORTS 4 -#define TRANSPORT_P2P 0 -#define TRANSPORT_SHM 1 -#define TRANSPORT_NET 2 -#define TRANSPORT_COLLNET 3 - -#include "proxy.h" - -extern struct ncclTransport p2pTransport; -extern struct ncclTransport shmTransport; -extern struct ncclTransport netTransport; -extern struct ncclTransport collNetTransport; - -extern struct ncclTransport* ncclTransports[]; - -// Forward declarations -struct ncclRing; -struct ncclConnector; -struct ncclComm; - -struct ncclPeerInfo { - int rank; - int cudaDev; - int nvmlDev; - int gdrSupport; - uint64_t hostHash; - uint64_t pidHash; - dev_t shmDev; - int64_t busId; - struct ncclComm* comm; - int cudaCompCap; -}; - -#define CONNECT_SIZE 128 -struct ncclConnect { - char data[CONNECT_SIZE]; -}; - -#if CUDART_VERSION >= 12010 - -#define NVLS_HANDLE_SIZE 64 -struct ncclNvlsSharedRes { - int refCount; - CUmulticastObjectProp properties; - CUmemAccessDesc accessDesc; - int dev; - size_t size; - size_t granularity; - CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer - char* mcBuff; // Multicast NVLS buffer address - CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer - char* ucBuff; // Unicast NVLS buffer address - char shareableHandle[NVLS_HANDLE_SIZE]; - size_t ucGran; - int nChannels; - struct ncclShmemCollBuff nvlsShmem; - void *nvlsShmemHandle; -}; - -#endif /* CUDART_VERSION >= 12010 */ - -struct ncclCollNetSharedRes { - int refCount; - int size; - char* cudaBuff; - char* hostBuff; - struct 
ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; - void* resources; - int nChannels; - size_t buffSize; -}; - -struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); - ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); - ncclResult_t (*free)(struct ncclConnector*); - ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); - ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); - ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); -}; - -struct ncclTransport { - const char name[8]; - ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); - struct ncclTransportComm send; - struct ncclTransportComm recv; -}; - -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); - -// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange -#define USE_POSIX_FD 1 - -#if USE_POSIX_FD -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR -#else -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE -#endif - -ncclResult_t ncclNvlsInit(struct ncclComm* comm); -ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); -ncclResult_t ncclNvlsFree(struct ncclComm* comm); - -enum { collNetRecv=0, collNetSend=1 }; -int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); -ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); -ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); -#endif diff --git a/nvls/trees.h b/nvls/trees.h deleted file mode 100644 index ded84a667..000000000 --- a/nvls/trees.h +++ /dev/null @@ -1,13 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TREES_H_ -#define NCCL_TREES_H_ - -ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); -ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); - -#endif diff --git a/nvls/tuner.h b/nvls/tuner.h deleted file mode 100644 index d8b275017..000000000 --- a/nvls/tuner.h +++ /dev/null @@ -1,22 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_TUNER_H_ -#define NCCL_INT_TUNER_H_ - -#include "nccl_tuner.h" - -// Tuning plugin to override NCCL's default algorithm/protocol tuning. - -// Attempts to load NCCL tuner from environmental variable. -// Returns ncclSuccess if the correct tuner symbol has been found and -// successully loaded. Otherwise returns an error and also logs the error. -ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); - -// Cleans up NCCL tuner plugin. -ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); -#endif diff --git a/nvls/utils.h b/nvls/utils.h deleted file mode 100644 index 60f6efb5f..000000000 --- a/nvls/utils.h +++ /dev/null @@ -1,524 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_UTILS_H_ -#define NCCL_UTILS_H_ - -#include "nccl.h" -#include "alloc.h" -#include "checks.h" -#include -#include -#include -#include -#include - -int ncclCudaCompCap(); - -// PCI Bus ID <-> int64 conversion functions -ncclResult_t int64ToBusId(int64_t id, char* busId); -ncclResult_t busIdToInt64(const char* busId, int64_t* id); - -ncclResult_t getBusId(int cudaDev, int64_t *busId); - -ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -uint64_t getHash(const char* string, int n); -uint64_t getHostHash(); -uint64_t getPidHash(); -ncclResult_t getRandomData(void* buffer, size_t bytes); - -struct netIf { - char prefix[64]; - int port; -}; - -int parseStringList(const char* string, struct netIf* ifList, int maxList); -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); - -static long log2i(long n) { - long l = 0; - while (n>>=1) l++; - return l; -} - -inline uint64_t clockNano() { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; -} - -/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else - * return -1 */ -inline ncclResult_t getRandomData(void* buffer, size_t bytes) { - ncclResult_t ret = ncclSuccess; - if (bytes > 0) { - const size_t one = 1UL; - FILE* fp = fopen("/dev/urandom", "r"); - if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; - if (fp) fclose(fp); - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// - -template -inline void ncclAtomicRefCountIncrement(Int* refs) { - __atomic_fetch_add(refs, 1, 
__ATOMIC_RELAXED); -} - -template<typename Int> -inline Int ncclAtomicRefCountDecrement(Int* refs) { - return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); -} - -//////////////////////////////////////////////////////////////////////////////// -/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that - * granularity of LIFO is not per object, instead frames containing many objects - * are pushed and popped. Therefore deallocation is extremely cheap since it's - * done at the frame granularity. - * - * The initial state of the stack is with one frame, the "nil" frame, which - * cannot be popped. Therefore objects allocated in the nil frame cannot be - * deallocated sooner than stack destruction. - */ -struct ncclMemoryStack; - -void ncclMemoryStackConstruct(struct ncclMemoryStack* me); -void ncclMemoryStackDestruct(struct ncclMemoryStack* me); -void ncclMemoryStackPush(struct ncclMemoryStack* me); -void ncclMemoryStackPop(struct ncclMemoryStack* me); -template<typename T> -T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for - * a pool instance to ever hold objects whose types have differing - * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by - * a backing `ncclMemoryStack` passed during Alloc(). If memory - * backing any currently held object is deallocated then it is an error to do - * anything other than reconstruct it, after which it is a valid empty pool. - */ -struct ncclMemoryPool; - -// Equivalent to zero-initialization -void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); -template<typename T> -T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); -template<typename T> -void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); -void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer - * field is given via the `next` template argument. - * - * Example: - * struct Foo { - * struct Foo *next1, *next2; // can be a member of two lists at once - * }; - * ncclIntruQueue<Foo, &Foo::next1> list1; - * ncclIntruQueue<Foo, &Foo::next2> list2; - */ -template<typename T, T *T::*next> -struct ncclIntruQueue; - -template<typename T, T *T::*next> -void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x); -template<typename T, T *T::*next> -T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" - * and "cond" fields are part of the public interface. - */ -struct ncclThreadSignal { - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} -constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); - -void ncclThreadSignalConstruct(struct ncclThreadSignal* me); -void ncclThreadSignalDestruct(struct ncclThreadSignal* me); - -// A convenience instance per-thread.
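Spelling out the ncclIntruQueue usage pattern sketched in the comment above: the element type embeds its own link pointer and the queue is parameterized on that member. The type and values below are made up purely for illustration.

struct ExampleTask {
  int id;
  struct ExampleTask* next;   // intrusive link consumed by the queue below
};

static void exampleQueueUse(struct ExampleTask* a, struct ExampleTask* b) {
  ncclIntruQueue<ExampleTask, &ExampleTask::next> q;
  ncclIntruQueueConstruct(&q);
  ncclIntruQueueEnqueue(&q, a);
  ncclIntruQueueEnqueue(&q, b);
  while (!ncclIntruQueueEmpty(&q)) {
    struct ExampleTask* t = ncclIntruQueueDequeue(&q);  // FIFO: a first, then b
    (void)t;
  }
}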
-extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueueMpsc; - -template -void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); -template -bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); -// Enqueue element. Returns true if queue is not abandoned. Even if queue is -// abandoned the element enqueued, so the caller needs to make arrangements for -// the queue to be tended. -template -bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); -// Dequeue all elements at a glance. If there aren't any and `waitSome` is -// true then this call will wait until it can return a non empty list. -template -T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); -// Dequeue all elements and set queue to abandoned state. -template -T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclMemoryStack { - struct Hunk { - struct Hunk* above; // reverse stack pointer - size_t size; // size of this allocation (including this header struct) - }; - struct Unhunk { // proxy header for objects allocated out-of-hunk - struct Unhunk* next; - void* obj; - }; - struct Frame { - struct Hunk* hunk; // top of non-empty hunks - uintptr_t bumper, end; // points into top hunk - struct Unhunk* unhunks; - struct Frame* below; - }; - - static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); - static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); - - struct Hunk stub; - struct Frame topFrame; -}; - -inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { - me->stub.above = nullptr; - me->stub.size = 0; - me->topFrame.hunk = &me->stub; - me->topFrame.bumper = 0; - me->topFrame.end = 0; - me->topFrame.unhunks = nullptr; - me->topFrame.below = nullptr; -} - -inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { - uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); - void* obj; - if (__builtin_expect(o + size <= me->topFrame.end, true)) { - me->topFrame.bumper = o + size; - obj = reinterpret_cast(o); - } else { - obj = allocateSpilled(me, size, align); - } - return obj; -} - -template -inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { - void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); - memset(obj, 0, n*sizeof(T)); - return (T*)obj; -} - -inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { - using Frame = ncclMemoryStack::Frame; - Frame tmp = me->topFrame; - Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); - *snapshot = tmp; // C++ struct assignment - me->topFrame.unhunks = nullptr; - me->topFrame.below = snapshot; -} - -inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { - ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; - while (un != nullptr) { - free(un->obj); - un = un->next; - } - me->topFrame = *me->topFrame.below; // C++ struct assignment -} - - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclMemoryPool { - struct Cell { - Cell *next; - }; - struct Cell* head; - struct Cell* tail; // meaningful only when head != nullptr -}; - -inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { - me->head = nullptr; -} - -template -inline T* ncclMemoryPoolAlloc(struct 
ncclMemoryPool* me, struct ncclMemoryStack* backing) { - using Cell = ncclMemoryPool::Cell; - Cell* cell; - if (__builtin_expect(me->head != nullptr, true)) { - cell = me->head; - me->head = cell->next; - } else { - // Use the internal allocate() since it doesn't memset to 0 yet. - size_t cellSize = std::max(sizeof(Cell), sizeof(T)); - size_t cellAlign = std::max(alignof(Cell), alignof(T)); - cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); - } - memset(cell, 0, sizeof(T)); - return reinterpret_cast(cell); -} - -template -inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { - using Cell = ncclMemoryPool::Cell; - Cell* cell = reinterpret_cast(obj); - cell->next = me->head; - if (me->head == nullptr) me->tail = cell; - me->head = cell; -} - -inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { - if (from->head != nullptr) { - from->tail->next = me->head; - if (me->head == nullptr) me->tail = from->tail; - me->head = from->head; - from->head = nullptr; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueue { - T *head, *tail; -}; - -template -inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { - me->head = nullptr; - me->tail = nullptr; -} - -template -inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { - return me->head == nullptr; -} - -template -inline T* ncclIntruQueueHead(ncclIntruQueue *me) { - return me->head; -} - -template -inline T* ncclIntruQueueTail(ncclIntruQueue *me) { - return me->tail; -} - -template -inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { - x->*next = nullptr; - (me->head ? me->tail->*next : me->head) = x; - me->tail = x; -} - -template -inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { - T *ans = me->head; - me->head = ans->*next; - if (me->head == nullptr) me->tail = nullptr; - return ans; -} - -template -inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { - T *prev = nullptr; - T *cur = me->head; - bool found = false; - - while (cur) { - if (cur == x) { - found = true; - break; - } - prev = cur; - cur = cur->*next; - } - - if (found) { - if (prev == nullptr) - me->head = cur->*next; - else - prev->*next = cur->*next; - if (cur == me->tail) - me->tail = prev; - } - return found; -} - -template -inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { - T *ans = me->head; - if (ans != nullptr) { - me->head = ans->*next; - if (me->head == nullptr) me->tail = nullptr; - } - return ans; -} - -template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { - T *head = me->head; - me->head = nullptr; - me->tail = nullptr; - while (head != nullptr) { - T *tmp = head->*next; - ncclMemoryPoolFree(pool, tmp); - head = tmp; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { - return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; -} - -inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { - pthread_mutex_init(&me->mutex, nullptr); - pthread_cond_init(&me->cond, nullptr); -} - -inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { - pthread_mutex_destroy(&me->mutex); - pthread_cond_destroy(&me->cond); -} - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueueMpsc { - T* head; - uintptr_t tail; - struct ncclThreadSignal* waiting; -}; - -template -void 
ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { - me->head = nullptr; - me->tail = 0x0; - me->waiting = nullptr; -} - -template -bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { - return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; -} - -template -bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { - __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); - T* prev = reinterpret_cast(utail); - T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); - __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); - if (utail == 0x1) { // waiting - __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting - // This lock/unlock is essential to ensure we don't race ahead of the consumer - // and signal the cond before they begin waiting on it. - struct ncclThreadSignal* waiting = me->waiting; - pthread_mutex_lock(&waiting->mutex); - pthread_mutex_unlock(&waiting->mutex); - pthread_cond_broadcast(&waiting->cond); - } - return utail != 0x2; // not abandoned -} - -template -T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { - T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - if (head == nullptr) { - if (!waitSome) return nullptr; - uint64_t t0 = clockNano(); - bool sleeping = false; - do { - if (clockNano()-t0 >= 10*1000) { // spin for first 10us - struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; - pthread_mutex_lock(&waitSignal->mutex); - uintptr_t expected = sleeping ? 0x1 : 0x0; - uintptr_t desired = 0x1; - me->waiting = waitSignal; // release done by successful compare exchange - if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { - sleeping = true; - pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); - } - pthread_mutex_unlock(&waitSignal->mutex); - } - head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - } while (head == nullptr); - } - - __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); - T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); - T *x = head; - while (x != tail) { - T *x1; - int spins = 0; - while (true) { - x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); - if (x1 != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - x = x1; - } - return head; -} - -template -T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { - uintptr_t expected = 0x0; - if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { - return nullptr; - } else { - int spins = 0; - T* head; - while (true) { - head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - if (head != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); - T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); - T *x = head; - while (x != tail) { - T *x1; - spins = 0; - while (true) { - x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); - if (x1 != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - x = x1; - } - return head; - } -} -#endif From c062afecfa25d63e3491c047c9de704a13963d5b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 14 Jan 2024 22:34:50 -0500 Subject: [PATCH 12/67] possibly compilable? 
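In rough terms, this patch replaces the multi-endpoint NvlsConnection constructor with a pairwise root/non-root model: the root's Endpoint::Impl creates the multicast handle, exports it as a POSIX file descriptor, and serializes the (rootPid, fd) pair so non-root ranks can import it. The descriptor hand-off it relies on reduces to the sketch below (illustrative only; it assumes Linux 5.6+ for pidfd_getfd plus sufficient ptrace permission, and the variable names rootPid/rootFd are made up).

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <cuda.h>

    // On a non-root rank, given the root's pid and the fd number it exported:
    int rootPidFd = syscall(SYS_pidfd_open, rootPid, 0);             // handle to root process
    int localFd = syscall(SYS_pidfd_getfd, rootPidFd, rootFd, 0);    // dup of root's exported fd
    CUmemGenericAllocationHandle mcHandle;
    cuMemImportFromShareableHandle(&mcHandle, (void*)(uintptr_t)localFd,
                                   CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
    close(rootPidFd);

This mirrors the deserializing Endpoint::Impl constructor later in the patch.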
--- include/mscclpp/core.hpp | 3 ++- src/connection.cc | 52 +++++++++++--------------------------- src/context.cc | 2 ++ src/endpoint.cc | 22 +++++++++++----- src/include/connection.hpp | 3 +-- src/include/context.hpp | 1 + src/include/endpoint.hpp | 3 ++- 7 files changed, 38 insertions(+), 48 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 1d12a4083..2b8807221 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,6 +462,7 @@ struct EndpointConfig { int ibMaxWrPerSend = DefaultMaxWrPerSend; size_t nvlsBufferSize; + int nvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -474,7 +475,7 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) { + EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { if (!AllNvlsTransports.has(transport)) { throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); } diff --git a/src/connection.cc b/src/connection.cc index a2a2f12f9..bcab9a829 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,49 +94,27 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, - bool isRoot) - : isRoot_(isRoot) { - if (localEndpoint.transport() != Transport::Nvls) { - throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); +NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) { + if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { + throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } - for (auto remoteEndpoint : remoteEndpoints) { - if (remoteEndpoint.transport() != Transport::Nvls) { - throw mscclpp::Error("NVLS connection can only be made to a NVLS endpoint", ErrorCode::InvalidUsage); - } - // sanity check: make sure the IPC connection is being made within a node - if (getImpl(remoteEndpoint)->hostHash_ != getImpl(localEndpoint)->hostHash_) { - std::stringstream ss; - ss << "NVLS connection can only be made within a node: " << std::hex << getImpl(remoteEndpoint)->hostHash_ - << " != " << std::hex << getImpl(localEndpoint)->hostHash_; - throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); - } - } - int nDevices = 1 + remoteEndpoints.size(); - MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId_)); - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nDevices; - mcProp.size = bufferSize; - mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - size_t minGran = 0; - size_t gran = 0; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - // only root needs to create the multicast handle - if (isRoot_) { - size_t mcSize = ((bufferSize + gran - 1) / gran) * gran; - mcProp.size = mcSize; - - MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp)); + if (localEndpoint.transport() == Transport::NvlsRoot && 
remoteEndpoint.transport() == Transport::NvlsRoot) { + throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); } + mcHandle_ = localEndpoint.pimpl_.mcHandle_; + size_t bufferSize = localEndpoint.pimpl_.mcProp_; + + int cudaDeviceId; + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + + // Allocate physical memory CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = cudaDeviceId_; + prop.location.id = cudaDeviceId; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // allocate physical memory (data buffer) @@ -148,7 +126,7 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector rem accessDesc.location.id = cudaDeviceId_; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, minGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); diff --git a/src/context.cc b/src/context.cc index d04a8e32c..afd7eba6c 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,6 +49,8 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); + } else if (AllNvlsTransports.has(localEndpoint) && AllNvlsTransports.has(remoteEndpoint)) { + conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); } diff --git a/src/endpoint.cc b/src/endpoint.cc index 350cba07e..4a4bfbc02 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -21,6 +21,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) minMcGran_ = 0; mcGran_ = 0; mcProp_.size = config.nvlsBufferSize; + mcProp_.numDevices = config.nvlsNumDevices; mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); @@ -29,8 +30,9 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) if (transport_ == Transport::NvlsRoot){ MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); - fileDesc_ = 0; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&fileDesc_, handle, handleType, 0 /*flags*/)); + mcFileDesc_ = 0; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&mcFileDesc_, handle, handleType, 0 /*flags*/)); + rootPid_ = getpid(); } } } @@ -46,7 +48,8 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { } if (transport_ == Transport::NvlsRoot) { - std::copy_n(reinterpret_cast(&pimpl_->fileDesc_), sizeof(pimpl_->fileDesc_), std::back_inserter(data)); + std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(data)); + std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(data)); } return data; } @@ -67,10 +70,15 @@ Endpoint::Impl::Impl(const 
std::vector& serialization) { it += sizeof(ibQpInfo_); } if (transport_ == Transport::NvlsNonRoot) { - fileDesc_ = 0; - std::copy_n(it, sizeof(fileDesc_), reinterpret_cast(&fileDesc_)); - it += sizeof(fileDesc_); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)fileDesc_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + mcFileDesc_ = 0; + std::copy_n(it, sizeof(mcFileDesc_), reinterpret_cast(&mcFileDesc_)); + it += sizeof(mcFileDesc_); + std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&mcFileDesc_)); + it += sizeof(rootPid_); + int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(rootPidFd); } } diff --git a/src/include/connection.hpp b/src/include/connection.hpp index f15283b28..448fdfe7b 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,11 +32,10 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { - int cudaDeviceId_; - bool isRoot_; CUmemGenericAllocationHandle mcHandle_; CUmemGenericAllocationHandle memHandle_; void* deviceBuffer_; + size_t bufferSize_; public: NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); diff --git a/src/include/context.hpp b/src/include/context.hpp index 39a699560..abb95b27d 100644 --- a/src/include/context.hpp +++ b/src/include/context.hpp @@ -17,6 +17,7 @@ struct Context::Impl { std::vector> connections_; std::unordered_map> ibContexts_; CudaStreamWithFlags ipcStream_; + CUmemGenericAllocationHandle mcHandle_; Impl(); diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 00322674e..0d8f86bcd 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -28,7 +28,8 @@ struct Endpoint::Impl { CUmemGenericAllocationHandle mcHandle_; size_t minMcGran_; size_t mcGran_; - int fileDesc_; + pid_t rootPid_; + int mcFileDesc_; }; } // namespace mscclpp From a6e9af54f16649186bc897995e589060681b6a2b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:11:32 +0000 Subject: [PATCH 13/67] lint --- include/mscclpp/core.hpp | 5 +++-- nvls/test.cu | 2 +- src/connection.cc | 7 ++++--- src/endpoint.cc | 5 +++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 2b8807221..62e772d8e 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -139,7 +139,7 @@ enum class Transport { }; const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", "IB0", "IB1", "IB2", - "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { const size_t TransportFlagsSize = 13; @@ -475,7 +475,8 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { + EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) + : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { if (!AllNvlsTransports.has(transport)) { throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); } diff --git 
a/nvls/test.cu b/nvls/test.cu index bbbc3e391..bfb29c15c 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -175,7 +175,7 @@ int main() { cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - for (int input_size = 1024*3; input_size <= size; input_size *= 2){ + for (int input_size = 1024; input_size <= size; input_size *= 2){ // warmup for (int i = 0; i < rept; i++) { testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); diff --git a/src/connection.cc b/src/connection.cc index bcab9a829..76260051e 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -99,7 +99,8 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } if (localEndpoint.transport() == Transport::NvlsRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { - throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); + throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", + ErrorCode::InvalidUsage); } mcHandle_ = localEndpoint.pimpl_.mcHandle_; @@ -109,7 +110,6 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); - // Allocate physical memory CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; @@ -126,7 +126,8 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) accessDesc.location.id = cudaDeviceId_; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW( + cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); diff --git a/src/endpoint.cc b/src/endpoint.cc index 4a4bfbc02..fbf512778 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -27,7 +27,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; // create the mc handle now only on the root - if (transport_ == Transport::NvlsRoot){ + if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; @@ -77,7 +77,8 @@ Endpoint::Impl::Impl(const std::vector& serialization) { it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW( + cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); } } From d958a31517b80c9f833e002fb8f268a7b2a67ffd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:42:26 +0000 Subject: [PATCH 14/67] compiles --- include/mscclpp/core.hpp | 9 +++------ src/connection.cc | 17 +++++++++-------- 
src/context.cc | 2 +- src/endpoint.cc | 16 ++++++++++------ src/include/connection.hpp | 4 +++- src/include/endpoint.hpp | 2 ++ 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 62e772d8e..36fa9d298 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -142,7 +142,7 @@ const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", " "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 13; +const size_t TransportFlagsSize = 12; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. @@ -399,6 +399,7 @@ class Endpoint { friend class Context; friend class Connection; + friend class NvlsConnection; }; /// Represents a connection between two processes. @@ -476,11 +477,7 @@ struct EndpointConfig { /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) - : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { - if (!AllNvlsTransports.has(transport)) { - throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); - } - } + : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) {} }; /// Represents a context for communication. This provides a low-level interface for forming connections in use-cases diff --git a/src/connection.cc b/src/connection.cc index 76260051e..1fb13ab49 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,7 +94,8 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) { +NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint) + : transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()) { if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } @@ -103,12 +104,12 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) ErrorCode::InvalidUsage); } - mcHandle_ = localEndpoint.pimpl_.mcHandle_; - size_t bufferSize = localEndpoint.pimpl_.mcProp_; + mcHandle_ = localEndpoint.pimpl_->mcHandle_; + size_t bufferSize = localEndpoint.pimpl_->mcProp_.size; int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); // Allocate physical memory CUmemAllocationProp prop = {}; @@ -123,11 +124,11 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) // usual VA business: map both MC and PA to two different VA addresses CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = cudaDeviceId_; + accessDesc.location.id = cudaDeviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space MSCCLPP_CUTHROW( - cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); + cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_->minMcGran_, 0U, 0)); 
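   // (annotation, not part of the patch: the calls around this point follow the usual CUDA
   //  VMM recipe, i.e. reserve a VA range aligned to the multicast minimum granularity,
   //  map the physical allocation (memHandle_) into it, then grant the local device
   //  read/write access. This maps only the unicast view of the buffer; binding the buffer
   //  into the multicast group, e.g. via cuMulticastBindMem, does not appear in this
   //  revision yet.)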
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); @@ -135,9 +136,9 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) INFO(MSCCLPP_P2P, "NVLS connection created"); } -Transport NvlsConnection::transport() { return Transport::Nvls; } +Transport NvlsConnection::transport() { return transport_; } -Transport NvlsConnection::remoteTransport() { return Transport::Nvls; } +Transport NvlsConnection::remoteTransport() { return remoteTransport_; } void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); diff --git a/src/context.cc b/src/context.cc index afd7eba6c..f75473487 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,7 +49,7 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); - } else if (AllNvlsTransports.has(localEndpoint) && AllNvlsTransports.has(remoteEndpoint)) { + } else if (AllNvlsTransports.has(localEndpoint.transport()) && AllNvlsTransports.has(remoteEndpoint.transport())) { conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); diff --git a/src/endpoint.cc b/src/endpoint.cc index fbf512778..3b740dd6a 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -1,5 +1,8 @@ #include "endpoint.hpp" +#include +#include + #include #include "api.h" @@ -22,16 +25,17 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) mcGran_ = 0; mcProp_.size = config.nvlsBufferSize; mcProp_.numDevices = config.nvlsNumDevices; - mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; // create the mc handle now only on the root if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&mcFileDesc_, handle, handleType, 0 /*flags*/)); + MSCCLPP_CUTHROW( + cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); rootPid_ = getpid(); } } @@ -47,7 +51,7 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { std::copy_n(reinterpret_cast(&pimpl_->ibQpInfo_), sizeof(pimpl_->ibQpInfo_), std::back_inserter(data)); } - if (transport_ == Transport::NvlsRoot) { + if (pimpl_->transport_ == Transport::NvlsRoot) { std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(data)); std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(data)); } @@ -76,7 +80,7 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(rootPid_), 
reinterpret_cast(&mcFileDesc_)); it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + size_t mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); MSCCLPP_CUTHROW( cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 448fdfe7b..6c45d2c14 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,13 +32,15 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { + Transport transport_; + Transport remoteTransport_; CUmemGenericAllocationHandle mcHandle_; CUmemGenericAllocationHandle memHandle_; void* deviceBuffer_; size_t bufferSize_; public: - NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); + NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint); Transport transport() override; diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 0d8f86bcd..fc5b00bfa 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -4,6 +4,8 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include + #include #include From 87a293fa39fbe601cbd8d683842461fbd2d7a52f Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:55:55 +0000 Subject: [PATCH 15/67] wip --- python/mscclpp/core_py.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 4e92f8841..ab865bd1d 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -72,6 +72,8 @@ void register_core(nb::module_& m) { nb::enum_(m, "Transport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) + .value("NvlsRoot", Transport::NvlsRoot) + .value("NvlsNonRoot", Transport::NvlsNonRoot) .value("IB0", Transport::IB0) .value("IB1", Transport::IB1) .value("IB2", Transport::IB2) From f1dfc0dd3edf8775f15a65cca8e9a3796f172260 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 20:01:10 +0000 Subject: [PATCH 16/67] wip --- python/test/test_mscclpp.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index f3a7f9dd6..9aef18802 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -118,14 +118,19 @@ def init_target(): def create_and_connect(mpi_group: MpiGroup, transport: str): - if transport == "NVLink" and all_ranks_on_the_same_node(mpi_group) is False: - pytest.skip("cannot use nvlink for cross node") + if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvlink/nvls for cross node") group = mscclpp_comm.CommGroup(mpi_group.comm) remote_nghrs = list(range(mpi_group.comm.size)) remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc + elif tranport == "NVLS": + if group.rank == 0: + tran = Transport.NvlsRoot + else: + tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) else: @@ -522,3 +527,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u proxy_service.stop_proxy() group.barrier() assert cp.array_equal(memory, memory_expected) + +@parametrize_mpi_groups(2, 4, 8, 16) +def test_simple_proxy_channel(mpi_group: 
MpiGroup): + group, connections = create_and_connect(mpi_group, "NVLS") From 76a6dd52573d32c8304c1b579a6b24c3f6579940 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 00:02:16 +0000 Subject: [PATCH 17/67] wip --- include/mscclpp/core.hpp | 6 ++++-- python/mscclpp_benchmark/allreduce_bench.py | 2 +- python/test/test_mscclpp.py | 5 +++-- src/connection.cc | 7 +++++-- src/endpoint.cc | 12 ++++++++---- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 36fa9d298..0245b2b95 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -455,6 +455,8 @@ struct EndpointConfig { static const int DefaultMaxCqPollNum = 1; static const int DefaultMaxSendWr = 8192; static const int DefaultMaxWrPerSend = 64; + static const int DefaultNvlsNumDevices = 8; + static const int DefaultNvlsBufferSize = (1 << 29); Transport transport; int ibMaxCqSize = DefaultMaxCqSize; @@ -462,8 +464,8 @@ struct EndpointConfig { int ibMaxSendWr = DefaultMaxSendWr; int ibMaxWrPerSend = DefaultMaxWrPerSend; - size_t nvlsBufferSize; - int nvlsNumDevices; + size_t nvlsBufferSize = DefaultNvlsBufferSize; + int nvlsNumDevices = DefaultNvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 80aa5e93a..9c5e7ca84 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -247,7 +247,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 29): + for i in range(10, 25): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 9aef18802..a99676603 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -126,10 +126,11 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc - elif tranport == "NVLS": - if group.rank == 0: + elif transport == "NVLS": + if mpi_group.comm.rank == 0: tran = Transport.NvlsRoot else: + remote_nghrs = [0] tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) diff --git a/src/connection.cc b/src/connection.cc index 1fb13ab49..f3d2fe4dc 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -96,20 +96,23 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint) : transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()) { - if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { + if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsNonRoot) { throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } if (localEndpoint.transport() == Transport::NvlsRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); } - + printf("here0\n"); mcHandle_ = localEndpoint.pimpl_->mcHandle_; size_t bufferSize = localEndpoint.pimpl_->mcProp_.size; int cudaDeviceId; + printf("here1\n"); 
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + printf("here1.5 %d %d\n", (int)mcHandle_, cudaDeviceId); MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + printf("here2\n"); // Allocate physical memory CUmemAllocationProp prop = {}; diff --git a/src/endpoint.cc b/src/endpoint.cc index 3b740dd6a..2d20f9a72 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -23,12 +23,14 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) if (AllNvlsTransports.has(transport_)) { minMcGran_ = 0; mcGran_ = 0; + mcProp_ = {}; mcProp_.size = config.nvlsBufferSize; mcProp_.numDevices = config.nvlsNumDevices; mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + printf("---> %ld %ld | %lld %lld\n", mcProp_.size, mcProp_.numDevices, mcGran_, minMcGran_); // create the mc handle now only on the root if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); @@ -37,6 +39,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); rootPid_ = getpid(); + printf("LLLLLLL %lld %lld\n", mcFileDesc_, rootPid_); } } } @@ -73,17 +76,18 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(ibQpInfo_), reinterpret_cast(&ibQpInfo_)); it += sizeof(ibQpInfo_); } - if (transport_ == Transport::NvlsNonRoot) { + if (transport_ == Transport::NvlsRoot) { mcFileDesc_ = 0; std::copy_n(it, sizeof(mcFileDesc_), reinterpret_cast(&mcFileDesc_)); it += sizeof(mcFileDesc_); - std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&mcFileDesc_)); + std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&rootPid_)); it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - size_t mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + printf("==========> %lld %lld %lld\n", rootPidFd, mcRootFileDescFd, mcFileDesc_); MSCCLPP_CUTHROW( cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); - close(rootPidFd); + // close(rootPidFd); } } From 1dc4e8350e6fdcddc2bbe4d69da83a460912978f Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 16:37:16 -0800 Subject: [PATCH 18/67] wip --- include/mscclpp/core.hpp | 3 ++- src/communicator.cc | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 2b8807221..f7e896771 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,7 +462,6 @@ struct EndpointConfig { int ibMaxWrPerSend = DefaultMaxWrPerSend; size_t nvlsBufferSize; - int nvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -663,6 +662,8 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); + std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); + /// Get the remote rank a connection is connected to. /// /// @param connection The connection to get the remote rank for. 
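The connctNvlsCollective declaration above is the collective entry point this series is building toward: every rank in allRanks calls it, the smallest rank acts as the NVLS root, and the root's serialized connection is distributed over the bootstrap network. A later patch in this series fills in the body; its rough shape is sketched here (illustrative; myRank, rootRank, and bootstrap stand in for the communicator's own state, and error handling plus the pre-addDevice synchronization are omitted).

    std::shared_ptr<NvlsConnection> conn;
    if (myRank == rootRank) {
      conn = std::make_shared<NvlsConnection>(config.nvlsBufferSize, allRanks.size());
      auto serialized = conn->serialize();
      for (int r : allRanks)
        if (r != myRank) bootstrap->send(serialized, r, 0 /*tag*/);
    } else {
      std::vector<char> data;
      bootstrap->recv(data, rootRank, 0 /*tag*/);
      conn = std::make_shared<NvlsConnection>(data);
    }
    conn->addDevice();  // each rank attaches its own GPU to the multicast group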
diff --git a/src/communicator.cc b/src/communicator.cc index d2f0e6172..5725a691d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -105,6 +105,11 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con return NonblockingFuture>(connector->connectionPromise_.get_future()); } +MSCCLPP_API_CPP std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config) { + +} + + MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { return pimpl_->connectionInfos_.at(&connection).remoteRank; } From 847f1d8bf98e459f22933ac5aa01b96154a0374d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 02:46:41 +0000 Subject: [PATCH 19/67] lint --- include/mscclpp/core.hpp | 11 ++++------- python/mscclpp_benchmark/allreduce_bench.py | 12 ++++++------ src/communicator.cc | 21 +++++++++------------ src/connection.cc | 7 ++++--- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 7c5a19869..cb824f926 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -138,7 +138,7 @@ enum class Transport { }; const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", - "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { const size_t TransportFlagsSize = 11; @@ -460,17 +460,15 @@ class NvlsConnection { // Everyone needs to synchronize after creating a NVLS connection before adding devices void addDevice(); void addDevice(int cudaDeviceId); - + void* getMultiCastPointer(); -private: + private: struct Impl; std::unique_ptr pimpl_; }; - - /// Used to configure an endpoint. struct EndpointConfig { static const int DefaultMaxCqSize = 1024; @@ -498,8 +496,7 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize) - : transport(transport), nvlsBufferSize(nvlsBufferSize) {} + EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) {} }; /// Represents a context for communication. 
This provides a low-level interface for forming connections in use-cases diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 9c5e7ca84..5a3987cd3 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -147,12 +147,12 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**29: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - proxy_service.start_proxy() + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + # else: + # proxy_service = ProxyService() + # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + # proxy_service.start_proxy() else: if memory.nbytes < 2**22: proxy_service = ProxyService() @@ -247,7 +247,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 25): + for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/src/communicator.cc b/src/communicator.cc index 8b2a4b75a..9e6e9186e 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -105,31 +105,29 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con return NonblockingFuture>(connector->connectionPromise_.get_future()); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, EndpointConfig config) { +MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, + EndpointConfig config) { auto bootstrap = this->bootstrap(); int myRank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; int rootRank = allRanks[0]; - for (auto nvlsRank : allRanks){ - if (nvlsRank == myRank) - amongAllRanks = true; + for (auto nvlsRank : allRanks) { + if (nvlsRank == myRank) amongAllRanks = true; rootRank = std::min(rootRank, nvlsRank); } - if (amongAllRanks == false){ + if (amongAllRanks == false) { throw Error("my rank is not among allRanks", ErrorCode::InvalidUsage); } - if (rootRank == myRank) - isRoot = true; - + if (rootRank == myRank) isRoot = true; + std::shared_ptr conn; - if (isRoot){ + if (isRoot) { conn = std::make_shared(config, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) - bootstrap->send(serialized, nvlsRank, 0); + if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); } } else { std::vector data; @@ -156,7 +154,6 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti return conn; } - MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { return pimpl_->connectionInfos_.at(&connection).remoteRank; } diff --git a/src/connection.cc b/src/connection.cc index 60436b8a3..c165fb5d5 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -130,15 +130,16 @@ struct NvlsConnection::Impl { int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW( + cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); INFO(MSCCLPP_COLL, 
"NVLS handle was imported from root"); } }; -NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_unique(bufferSize, numDevices)) { -} +NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) + : pimpl_(std::make_unique(bufferSize, numDevices)) {} NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); From 32c15b14a24e4f7ce0fcb7ca96fb0b8d1acc8b93 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 02:53:13 +0000 Subject: [PATCH 20/67] wip --- include/mscclpp/core.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index cb824f926..65346bd02 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include From 01d752b068b62db41b36f1b169fe5c0e99a5ed94 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 18:54:13 -0800 Subject: [PATCH 21/67] wip --- include/mscclpp/core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index cb824f926..a0aac1ac5 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -680,7 +680,7 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); - std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); + std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. /// From 2631f990e061d6ea02a9ee96bc3ea0decdab002d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 19:01:52 -0800 Subject: [PATCH 22/67] wip --- src/communicator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/communicator.cc b/src/communicator.cc index 9e6e9186e..6c1849aae 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -124,7 +124,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti std::shared_ptr conn; if (isRoot) { - conn = std::make_shared(config, allRanks.size()); + conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); From 97db00cd9c156e91d181b0aa4dbddbabb615c080 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 03:02:10 +0000 Subject: [PATCH 23/67] wip --- src/context.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/context.cc b/src/context.cc index f75473487..d04a8e32c 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,8 +49,6 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); - } else if (AllNvlsTransports.has(localEndpoint.transport()) && AllNvlsTransports.has(remoteEndpoint.transport())) { - conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); } From caf997a9e785fdc5557b3494f086ed8839879481 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 06:31:34 +0000 Subject: [PATCH 24/67] wip --- include/mscclpp/core.hpp | 2 +- src/connection.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 
67fe9020c..936a580bc 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -467,7 +467,7 @@ class NvlsConnection { private: struct Impl; - std::unique_ptr pimpl_; + std::shared_ptr pimpl_; }; /// Used to configure an endpoint. diff --git a/src/connection.cc b/src/connection.cc index c165fb5d5..15b88fa6c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -139,7 +139,7 @@ struct NvlsConnection::Impl { }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) - : pimpl_(std::make_unique(bufferSize, numDevices)) {} + : pimpl_(std::make_shared(bufferSize, numDevices)) {} NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); @@ -154,7 +154,7 @@ NvlsConnection::addDevice(int cudaDeviceId) { INFO(MSCCLPP_COLL, "NVLS connection created"); } -NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(data) {} +NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} std::vector NvlsConnection::serialize() { std::vector result; From 9d5a2628a32eabdb38e0887a9ec0535333b1375d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 06:44:46 +0000 Subject: [PATCH 25/67] compiles --- include/mscclpp/core.hpp | 2 -- src/connection.cc | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 936a580bc..64520485c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,8 +462,6 @@ class NvlsConnection { void addDevice(); void addDevice(int cudaDeviceId); - void* getMultiCastPointer(); - private: struct Impl; diff --git a/src/connection.cc b/src/connection.cc index 15b88fa6c..399430581 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -3,6 +3,10 @@ #include "connection.hpp" +#include +#include + +#include #include #include @@ -126,12 +130,12 @@ struct NvlsConnection::Impl { Impl(const std::vector& data) { auto it = data.begin(); - std::copy_n(it, sizeof(*this), reinterpret_cast(*this)); + std::copy_n(it, sizeof(*this), reinterpret_cast(this)); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW( - cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); @@ -140,7 +144,8 @@ struct NvlsConnection::Impl { NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_shared(bufferSize, numDevices)) {} -NvlsConnection::addDevice() { + +void NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); @@ -148,7 +153,7 @@ NvlsConnection::addDevice() { INFO(MSCCLPP_COLL, "NVLS connection created"); } -NvlsConnection::addDevice(int cudaDeviceId) { +void NvlsConnection::addDevice(int cudaDeviceId) { MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); INFO(MSCCLPP_COLL, "NVLS connection created"); @@ -158,26 +163,10 @@ NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make std::vector NvlsConnection::serialize() { std::vector result; - std::copy_n(reinterpret_cast(pimpl_), sizeof(*pimpl_), std::back_inserter(result)); + 
std::copy_n(reinterpret_cast(pimpl_.get()), sizeof(*pimpl_), std::back_inserter(result)); return result; } -Transport NvlsConnection::transport() { return transport_; } - -Transport NvlsConnection::remoteTransport() { return remoteTransport_; } - -void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { - throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); -} - -void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64_t) { - throw Error("NVLS does not have a CPU updateAndSync API", ErrorCode::InvalidUsage); -} - -void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } - -void* NvlsConnection::getDevicePointer() { return deviceBuffer_; } - // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 87051274333adf91497b8e1adaf3dfe84d7ff1d9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 23:01:52 -0800 Subject: [PATCH 26/67] wip --- include/mscclpp/core.hpp | 5 +---- python/mscclpp/comm.py | 6 ++++-- python/mscclpp/core_py.cpp | 1 + python/test/test_mscclpp.py | 13 ++++++------- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 64520485c..183d8c815 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -399,7 +399,6 @@ class Endpoint { friend class Context; friend class Connection; - friend class NvlsConnection; }; /// Represents a connection between two processes. @@ -679,6 +678,7 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); + /// TBD std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. @@ -719,9 +719,6 @@ extern const TransportFlags NoTransports; /// A constant TransportFlags object representing all InfiniBand transports. extern const TransportFlags AllIBTransports; -/// A constant TransportFlags object representing all NVLS transports. -extern const TransportFlags AllNvlsTransports; - /// A constant TransportFlags object representing all transports. 
extern const TransportFlags AllTransports; diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index c01c04a2d..3085cc3df 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -79,10 +79,12 @@ def my_ib_device(self, local_rank: int) -> Transport: assert False # only 8 IBs are supported def make_connection( - self, remote_ranks: list[int], transports: Transport | dict[int, Transport] + self, all_ranks: list[int], transports: Transport | dict[int, Transport] ) -> dict[int, Connection]: + if transports == Transport.Nvls: + return self.communicator.connct_nvls_collective(all_ranks, transports) connections = {} - for rank in remote_ranks: + for rank in all_ranks: if type(transports) is dict: transport = transports[rank] else: diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 12a21fd44..729e8213a 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -169,6 +169,7 @@ void register_core(nb::module_& m) { .def("recv_memory_on_setup", &Communicator::recvMemoryOnSetup, nb::arg("remoteRank"), nb::arg("tag")) .def("connect_on_setup", &Communicator::connectOnSetup, nb::arg("remoteRank"), nb::arg("tag"), nb::arg("localConfig")) + .def("connct_nvls_collective", &Communicator::connctNvlsCollective, nb::arg("allRanks"), nb::arg("config")) .def("remote_rank_of", &Communicator::remoteRankOf) .def("tag_of", &Communicator::tagOf) .def("setup", &Communicator::setup); diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a99676603..e473bae23 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -121,17 +121,16 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") group = mscclpp_comm.CommGroup(mpi_group.comm) - + if transport == "NVLS": + all_ranks = list(range(mpi_group.comm.size)) + tran = Transport.Nvls + connection = group.make_connection(all_ranks, tran) + return group, connection + remote_nghrs = list(range(mpi_group.comm.size)) remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc - elif transport == "NVLS": - if mpi_group.comm.rank == 0: - tran = Transport.NvlsRoot - else: - remote_nghrs = [0] - tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) else: From 8e1a2dcef6f03a34e23b8b0c600a755ed8ac39f8 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 08:18:49 +0000 Subject: [PATCH 27/67] wip --- python/test/test_mscclpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index e473bae23..9d4456695 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -529,5 +529,5 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u assert cp.array_equal(memory, memory_expected) @parametrize_mpi_groups(2, 4, 8, 16) -def test_simple_proxy_channel(mpi_group: MpiGroup): +def test_nvls(mpi_group: MpiGroup): group, connections = create_and_connect(mpi_group, "NVLS") From 4d8b214c3ef778e13df1e17f2282807be493629f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 19 Jan 2024 08:54:51 +0000 Subject: [PATCH 28/67] fix --- python/mscclpp/core_py.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 729e8213a..190a90d7b 100644 --- a/python/mscclpp/core_py.cpp +++ 
b/python/mscclpp/core_py.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -125,6 +126,8 @@ void register_core(nb::module_& m) { .def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); + nb::class_(m, "NvlsConnection"); + nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) .def("serialize", &Endpoint::serialize) From 6d31e6289ad742daa066fa6f5c63d38d874aeb18 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 23:18:03 +0000 Subject: [PATCH 29/67] wip --- include/mscclpp/gpu_utils.hpp | 79 +++++++++++++++++++++++++++++++++++ nvls/test.cu | 10 +++-- python/test/test_mscclpp.py | 2 +- src/connection.cc | 4 ++ 4 files changed, 90 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index e0cd7c3da..986893bab 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,6 +50,12 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; +template +struct PhysicalCudaMemory { + CUmemGenericAllocationHandle memHandle; + T* devicePtr; +}; + namespace detail { /// A wrapper of cudaMalloc that sets the allocated memory to zero. @@ -67,6 +73,42 @@ T* cudaCalloc(size_t nelem) { return ptr; } +template +PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { + AvoidCudaGraphCaptureGuard cgcGuard; + + int deviceId = -1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + + PhysicalCudaMemory* ret = new PhysicalCudaMemory(); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = deviceId; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + size_t bufferSize = sizeof(T) * nelem; + // allocate physical memory + MSCCLPP_CUTHROW(cuMemCreate(&ret->memHandle, bufferSize, &prop, 0 /*flags*/)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Map the device pointer + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&ret->devicePtr, bufferSize, gran, 0U, 0)); + MSCCLPP_CUDATHROW(cudaMemset(ret->devicePtr, 0, bufferSize)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)ret->devicePtr, bufferSize, 0, ret->memHandle, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)ret->devicePtr, bufferSize, &accessDesc, 1)); + + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ret->devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + + return ret; +} + template T* cudaExtCalloc(size_t nelem) { AvoidCudaGraphCaptureGuard cgcGuard; @@ -118,6 +160,20 @@ Memory safeAlloc(size_t nelem) { return Memory(ptr, Deleter()); } +template +Memory safeAlloc(size_t nelem, size_t gran) { + T* ptr = nullptr; + try { + ptr = alloc(nelem, gran); + } catch (...) { + if (ptr) { + Deleter()(ptr); + } + throw; + } + return Memory(ptr, Deleter()); +} + } // namespace detail /// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr. @@ -131,6 +187,16 @@ struct CudaDeleter { } }; +template +struct CudaPhysicalDeleter { + using TPtrOrArray = + std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; + void operator()(TPtrOrArray ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + // TODO: adding free'ing stuff here + } +}; + /// A deleter that calls cudaFreeHost for use with std::unique_ptr or std::shared_ptr. 
/// @tparam T Type of each element in the allocated memory. template @@ -151,6 +217,13 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); } +/// TODO: docs... +template +std::shared_ptr allocSharedPhysicalCuda(size_t count, size_t gran) { + return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, + std::shared_ptr>>(count, gran); +} + /// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. @@ -174,6 +247,12 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); } +template +std::shared_ptr allocUniquePhysicalCuda(size_t count, size_t gran) { + return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, + std::unique_ptr, CudaDeleter>>>(count, gran); +} + /// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. diff --git a/nvls/test.cu b/nvls/test.cu index bfb29c15c..b84f19519 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -135,10 +135,6 @@ int main() { // allocate physical memory (data buffer) CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); - // everyone binds memory to the multicast - CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); - MPI_Barrier(MPI_COMM_WORLD); - // usual VA business: map both MC and PA to two different VA addresses void* uc_va; void* mc_va; CUmemAccessDesc accessDesc = {}; @@ -153,6 +149,12 @@ int main() { // set access on UC address CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); + + // everyone binds memory to the multicast + CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); + MPI_Barrier(MPI_COMM_WORLD); + // usual VA business: map both MC and PA to two different VA addresses + // Map a VA to MC space CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 9d4456695..f558dda48 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -528,6 +528,6 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(2, 4, 8, 16) +@parametrize_mpi_groups(2, 4, 8) def test_nvls(mpi_group: MpiGroup): group, connections = create_and_connect(mpi_group, "NVLS") diff --git a/src/connection.cc b/src/connection.cc index 399430581..a7d1bbf2b 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -123,6 +123,7 @@ struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + // TODO: we need proper throw in here. 
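  // The multicast handle is exported here as a POSIX file descriptor (mcFileDesc_) owned by
  // the root process. Non-root ranks re-create that descriptor inside their own process with
  // pidfd_open + pidfd_getfd (see the importing constructor below) before calling
  // cuMemImportFromShareableHandle; this path needs a kernel with pidfd_getfd support
  // (Linux 5.6 or newer) and ptrace-level permission on the root process.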
rootPid_ = getpid(); INFO(MSCCLPP_COLL, "NVLS handle created on root"); @@ -132,6 +133,7 @@ struct NvlsConnection::Impl { auto it = data.begin(); std::copy_n(it, sizeof(*this), reinterpret_cast(this)); + // TODO: proper throw int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), @@ -140,6 +142,8 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + + // TODO: close all FDs and deallocate all handles. }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) From 3011707d8e4df86f18d18129a4c720f319f30260 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 06:15:50 +0000 Subject: [PATCH 30/67] memalloc added --- include/mscclpp/core.hpp | 15 ++++++++--- include/mscclpp/gpu_utils.hpp | 32 +++++++++++++--------- src/connection.cc | 50 +++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 183d8c815..8910a03e4 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -449,18 +450,26 @@ class Connection { }; class NvlsConnection { - CUmemGenericAllocationHandle mcHandle_; - size_t bufferSize_; - public: NvlsConnection(size_t bufferSize, int numDevices); NvlsConnection(const std::vector& data); + NvlsConnection() = delete; + // TODO: Clean up after yourself! + // ~NvlsConnection(); std::vector serialize(); // Everyone needs to synchronize after creating a NVLS connection before adding devices void addDevice(); void addDevice(int cudaDeviceId); + struct DeviceMulticastPointer { + public: + std::shared_ptr devicePtr_; + std::shared_ptr mcPtr_; + }; + + std::shared_ptr allocateAndBindCuda(size_t size); + private: struct Impl; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 986893bab..7099b59aa 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,10 +50,14 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; +template struct CudaDeleter; + template struct PhysicalCudaMemory { - CUmemGenericAllocationHandle memHandle; - T* devicePtr; + CUmemGenericAllocationHandle memHandle_; + std::shared_ptr devicePtr_; + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr) + : memHandle_(memHandle), devicePtr_(std::shared_ptr(devicePtr, CudaDeleter())) {} }; namespace detail { @@ -80,33 +84,35 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { int deviceId = -1; MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - PhysicalCudaMemory* ret = new PhysicalCudaMemory(); CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = deviceId; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + CUmemGenericAllocationHandle memHandle; size_t bufferSize = sizeof(T) * nelem; // allocate physical memory - MSCCLPP_CUTHROW(cuMemCreate(&ret->memHandle, bufferSize, &prop, 0 /*flags*/)); + MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/)); CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + T* devicePtr; // Map the device pointer - 
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&ret->devicePtr, bufferSize, gran, 0U, 0)); - MSCCLPP_CUDATHROW(cudaMemset(ret->devicePtr, 0, bufferSize)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)ret->devicePtr, bufferSize, 0, ret->memHandle, 0)); - MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)ret->devicePtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); + MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemsetAsync(ret->devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return ret; + + return new PhysicalCudaMemory(memHandle, devicePtr); } template @@ -160,7 +166,7 @@ Memory safeAlloc(size_t nelem) { return Memory(ptr, Deleter()); } -template +template Memory safeAlloc(size_t nelem, size_t gran) { T* ptr = nullptr; try { @@ -219,7 +225,7 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// TODO: docs... template -std::shared_ptr allocSharedPhysicalCuda(size_t count, size_t gran) { +std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } @@ -248,7 +254,7 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { } template -std::shared_ptr allocUniquePhysicalCuda(size_t count, size_t gran) { +std::shared_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::unique_ptr, CudaDeleter>>>(count, gran); } diff --git a/src/connection.cc b/src/connection.cc index a7d1bbf2b..8d8d86a17 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -107,9 +107,11 @@ struct NvlsConnection::Impl { // These are only defined for multicast (NVLS) capability pid_t rootPid_; int mcFileDesc_; + size_t offset_; + std::vector>> physicalMemoryStorage; // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) { + Impl(size_t bufferSize, int numDevices) : offset_(0) { minMcGran_ = 0; mcGran_ = 0; mcProp_ = {}; @@ -129,7 +131,7 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle created on root"); } - Impl(const std::vector& data) { + Impl(const std::vector& data) : offset_(0) { auto it = data.begin(); std::copy_n(it, sizeof(*this), reinterpret_cast(this)); @@ -143,6 +145,41 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + struct MultiCastDeleter { + void operator()(char* ptr) { + // TODO: do something in here + } + }; + + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { + if (offset_ > bufferSize_) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); + } + if (bufferSize_ - offset_ < devBuffSize) { + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + } + + physicalMemoryStorage.push_back(physicalMem); + + MSCCLPP_CUTHROW( + cuMulticastBindMem(mcHandle_, offset_ /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + + char* mcPtr; + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + int deviceId = 
-1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + offset_ += devBuffSize; + + return std::shared_ptr(mcPtr, MultiCastDeleter()); + } + // TODO: close all FDs and deallocate all handles. }; @@ -171,6 +208,15 @@ std::vector NvlsConnection::serialize() { return result; } +std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { + auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); + auto mcPtr = pimpl_->bindMemory(mem, size); + auto ret = std::make_shared(); + ret->devicePtr_ = mem->devicePtr_; + ret->mcPtr_ = mcPtr; + return ret; +} + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From e449835808193b5b20d7c2767389faadf826ae90 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 09:21:05 +0000 Subject: [PATCH 31/67] looks like it is working --- include/mscclpp/core.hpp | 10 +++++++++- include/mscclpp/gpu_utils.hpp | 8 ++++---- python/mscclpp/core_py.cpp | 14 +++++++++++++- python/test/test_mscclpp.py | 22 +++++++++++++++++++++- src/connection.cc | 12 ++++++++++++ 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 8910a03e4..08746b974 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -463,9 +464,16 @@ class NvlsConnection { void addDevice(int cudaDeviceId); struct DeviceMulticastPointer { - public: + private: std::shared_ptr devicePtr_; std::shared_ptr mcPtr_; + size_t bufferSize_; + + public: + using DeviceHandle = DeviceMulticastPointerDeviceHandle; + DeviceHandle deviceHandle(); + + friend class NvlsConnection; }; std::shared_ptr allocateAndBindCuda(size_t size); diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 7099b59aa..10a307188 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,7 +50,8 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; -template struct CudaDeleter; +template +struct CudaDeleter; template struct PhysicalCudaMemory { @@ -100,18 +101,17 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - T* devicePtr; + T* devicePtr = NULL; // Map the device pointer MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); - MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); CudaStreamWithFlags stream(cudaStreamNonBlocking); MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return new PhysicalCudaMemory(memHandle, devicePtr); } diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 190a90d7b..a988151ef 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -126,7 +126,19 @@ void register_core(nb::module_& m) { 
.def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); - nb::class_(m, "NvlsConnection"); + nb::class_ deviceMulticastPointer(m, "DeviceMulticastPointer"); + deviceMulticastPointer.def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); + + nb::class_(deviceMulticastPointer, "DeviceHandle") + .def(nb::init<>()) + .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) + .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) + .def_rw("size", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::bufferSize) + .def_prop_ro("raw", [](const NvlsConnection::DeviceMulticastPointer::DeviceHandle& self) -> nb::bytes { + return nb::bytes(reinterpret_cast(&self), sizeof(self)); + }); + + nb::class_(m, "NvlsConnection").def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda); nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index f558dda48..aa63a3d20 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -283,6 +283,7 @@ def __init__( use_packet=False, scratch=None, fifo=None, + nvls_mem_handle=None ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -321,6 +322,12 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 + elif test_name == "nvls": + self._kernel = KernelBuilder( + file="nvls_test.cu", kernel_name="nvls_test", file_dir=file_dir + ).get_compiled_kernel() + self.nblocks = 1 + self.nthreads = 1 else: assert False @@ -349,6 +356,8 @@ def __init__( semaphore_device_handles = [semaphore.device_handle().raw for semaphore in semaphore_or_channels] self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) + elif test_name == "nvls": + self.params = nvls_mem_handle.device_handle().raw + pack(my_rank, nranks) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, None) @@ -530,4 +539,15 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(2, 4, 8) def test_nvls(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "NVLS") + group, connection = create_and_connect(mpi_group, "NVLS") + nelem = 2**29 + mem_handle = connection.allocate_bind_memory(nelem) + + kernel = MscclppKernel( + "nvls", + my_rank=group.my_rank, + nranks=group.nranks, + nvls_mem_handle=mem_handle + ) + + kernel() diff --git a/src/connection.cc b/src/connection.cc index 8d8d86a17..8ca55cc71 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -10,6 +10,7 @@ #include #include +#include "api.h" #include "debug.h" #include "endpoint.hpp" #include "infiniband/verbs.h" @@ -121,6 +122,7 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + bufferSize_ = mcProp_.size; MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; MSCCLPP_CUTHROW( @@ -214,9 +216,19 @@ std::shared_ptr NvlsConnection::allocate auto ret = std::make_shared(); ret->devicePtr_ = mem->devicePtr_; ret->mcPtr_ = mcPtr; + ret->bufferSize_ = size; return 
ret; } +MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle +NvlsConnection::DeviceMulticastPointer::deviceHandle() { + NvlsConnection::DeviceMulticastPointer::DeviceHandle device; + device.devicePtr = this->devicePtr_.get(); + device.mcPtr = this->mcPtr_.get(); + device.bufferSize = this->bufferSize_; + return device; +}; + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 7715776be3bf57174bf4b0cab5d8f689f2640d3c Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 18:36:19 +0000 Subject: [PATCH 32/67] it works --- python/test/test_mscclpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index aa63a3d20..cccf522e0 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -537,7 +537,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(2, 4, 8) +@parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, connection = create_and_connect(mpi_group, "NVLS") nelem = 2**29 @@ -549,5 +549,7 @@ def test_nvls(mpi_group: MpiGroup): nranks=group.nranks, nvls_mem_handle=mem_handle ) - + kernel() + cp.cuda.runtime.deviceSynchronize() + group.barrier() kernel() From 855b2ee023e2cf4d3614c63a2b3b0dbea93eaa18 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 07:34:57 +0000 Subject: [PATCH 33/67] missing files --- include/mscclpp/gpu_utils.hpp | 11 ++++++--- include/mscclpp/nvls_device.hpp | 18 ++++++++++++++ python/test/nvls_test.cu | 43 +++++++++++++++++++++++++++++++++ python/test/test_mscclpp.py | 2 ++ 4 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 include/mscclpp/nvls_device.hpp create mode 100644 python/test/nvls_test.cu diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 10a307188..1f2280703 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -106,10 +106,9 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); - MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); - CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); return new PhysicalCudaMemory(memHandle, devicePtr); @@ -188,8 +187,10 @@ template struct CudaDeleter { using TPtrOrArray = std::conditional_t, T, T*>; void operator()(TPtrOrArray ptr) { + printf("QQQQQ %p\n", ptr); AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFree(ptr)); + printf("deletedCuda successfully\n"); } }; @@ -199,7 +200,9 @@ struct CudaPhysicalDeleter { std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; - // TODO: adding free'ing stuff here + printf("IIIIIIIIII %p\n", ptr); + delete ptr; + printf("deleted successfully\n"); } }; diff --git a/include/mscclpp/nvls_device.hpp b/include/mscclpp/nvls_device.hpp new file mode 100644 index 000000000..106420e58 --- /dev/null +++ 
b/include/mscclpp/nvls_device.hpp @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_NVLS_DEVICE_HPP_ +#define MSCCLPP_NVLS_DEVICE_HPP_ + +namespace mscclpp { + +/// Device-side handle for @ref Host2DeviceSemaphore. +struct DeviceMulticastPointerDeviceHandle { + void* devicePtr; + void* mcPtr; + size_t bufferSize; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_SEMAPHORE_DEVICE_HPP_ diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu new file mode 100644 index 000000000..cbd1a170e --- /dev/null +++ b/python/test/nvls_test.cu @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + + +extern "C" __global__ void __launch_bounds__(1024, 1) + nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { + int tid = threadIdx.x; + int bid = blockIdx.x; + if (tid == 0 && bid == 0) { + float* devPtr = (float*)nvlsPtrs.devicePtr; + devPtr[0] = 3; + devPtr[1] = 4; + devPtr[2] = 5; + devPtr[3] = 6; + __threadfence_system(); + } + if (tid == 0 && bid == 0 && my_rank == 0) { + float* devPtr = (float*)nvlsPtrs.devicePtr; + + float* mcPtr = (float*)nvlsPtrs.mcPtr; + uint4 val; + MULTIMEM_LD(val, mcPtr); + MULTIMEM_ST(val, mcPtr); + __threadfence_system(); + + float tmp = *(float*)&val.x; + + printf("RRR %f %f\n", *devPtr, tmp); + } +} diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index cccf522e0..83d121c70 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -553,3 +553,5 @@ def test_nvls(mpi_group: MpiGroup): cp.cuda.runtime.deviceSynchronize() group.barrier() kernel() + cp.cuda.runtime.deviceSynchronize() + group.barrier() From 1120070ef8911b129399fb81f4914434ecece922 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 07:37:22 +0000 Subject: [PATCH 34/67] back to a working version --- include/mscclpp/gpu_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 1f2280703..f8319549b 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -201,7 +201,7 @@ struct CudaPhysicalDeleter { void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; printf("IIIIIIIIII %p\n", ptr); - delete ptr; + // delete ptr; printf("deleted successfully\n"); } }; From d2d5ec0c2f242309c918aa80340b1078e3563459 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 08:49:17 +0000 Subject: [PATCH 35/67] clean up - wip --- include/mscclpp/gpu_utils.hpp | 26 +++++++++++++++++--------- python/test/nvls_test.cu | 1 - 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index f8319549b..031b08012 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -57,8 +57,14 @@ template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; std::shared_ptr devicePtr_; - PhysicalCudaMemory(CUmemGenericAllocationHandle 
memHandle, T* devicePtr) - : memHandle_(memHandle), devicePtr_(std::shared_ptr(devicePtr, CudaDeleter())) {} + size_t bufferSize_; + // The deallocator for devicePtr will only unmap and free the address range. The physical memory + // deallocation will happen with CudaPhysicalDeleter. + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) + : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); + })) {} }; namespace detail { @@ -111,7 +117,7 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return new PhysicalCudaMemory(memHandle, devicePtr); + return new PhysicalCudaMemory(memHandle, devicePtr, bufferSize); } template @@ -187,10 +193,8 @@ template struct CudaDeleter { using TPtrOrArray = std::conditional_t, T, T*>; void operator()(TPtrOrArray ptr) { - printf("QQQQQ %p\n", ptr); AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFree(ptr)); - printf("deletedCuda successfully\n"); } }; @@ -200,9 +204,8 @@ struct CudaPhysicalDeleter { std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; - printf("IIIIIIIIII %p\n", ptr); - // delete ptr; - printf("deleted successfully\n"); + MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_)); + delete ptr; } }; @@ -226,7 +229,12 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); } -/// TODO: docs... +/// Allocated physical memory on the device and returns a memory handle along with a memory handle for it. +/// The deallocation only happens PhysicalCudaMemory goes out of scope. +/// @tparam T Type of each element in the allocated memory. +/// @param count Number of elements to allocate. +/// @param gran the granularity forof the allocation. +/// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index cbd1a170e..d2f3ada98 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -14,7 +14,6 @@ : "l"(ptr) \ : "memory"); - extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { int tid = threadIdx.x; From 08e077ac21470c1da1ae11f1e9755fbc08126f29 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:08:45 +0000 Subject: [PATCH 36/67] clean up -- wip --- include/mscclpp/gpu_utils.hpp | 2 +- src/connection.cc | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 031b08012..032e375c5 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -56,8 +56,8 @@ struct CudaDeleter; template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; - std::shared_ptr devicePtr_; size_t bufferSize_; + std::shared_ptr devicePtr_; // The deallocator for devicePtr will only unmap and free the address range. The physical memory // deallocation will happen with CudaPhysicalDeleter. 
PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) diff --git a/src/connection.cc b/src/connection.cc index 8ca55cc71..2b19c67cc 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -147,9 +147,17 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } - struct MultiCastDeleter { + struct MultiCastBindDeleter { + CUmemGenericAllocationHandle mcHandle_; + int deviceId_; + size_t offset_; + size_t bufferSize_; + MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) + : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { - // TODO: do something in here + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); } }; @@ -177,9 +185,10 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); offset_ += devBuffSize; - return std::shared_ptr(mcPtr, MultiCastDeleter()); + return std::shared_ptr(mcPtr, deleter); } // TODO: close all FDs and deallocate all handles. From 887790af88061cd3b111ce0a791723aa8ffd7ca9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:28:35 +0000 Subject: [PATCH 37/67] ok cleaned up --- include/mscclpp/core.hpp | 2 -- src/connection.cc | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 08746b974..f22f9009c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -455,8 +455,6 @@ class NvlsConnection { NvlsConnection(size_t bufferSize, int numDevices); NvlsConnection(const std::vector& data); NvlsConnection() = delete; - // TODO: Clean up after yourself! - // ~NvlsConnection(); std::vector serialize(); // Everyone needs to synchronize after creating a NVLS connection before adding devices diff --git a/src/connection.cc b/src/connection.cc index 2b19c67cc..9a6007b70 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -127,8 +127,11 @@ struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - // TODO: we need proper throw in here. + rootPid_ = getpid(); + if (rootPid_ < 0) { + throw mscclpp::SysError("getpid() failed", errno); + } INFO(MSCCLPP_COLL, "NVLS handle created on root"); } @@ -139,14 +142,25 @@ struct NvlsConnection::Impl { // TODO: proper throw int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + if (rootPidFd < 0) { + throw mscclpp::SysError("pidfd_open() failed", errno); + } int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + if (mcRootFileDescFd < 0) { + throw mscclpp::SysError("pidfd_getfd() failed", errno); + } MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); + close(mcRootFileDescFd); INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + ~Impl() { + // we don't need to free multicast handle object according to NCCL. 
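  // (If cleanup were added later, cuMemRelease(mcHandle_) and close(mcFileDesc_) on the root
  // would be the likely candidates; following NCCL, the patch leaves them to process teardown.)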
+ } + struct MultiCastBindDeleter { CUmemGenericAllocationHandle mcHandle_; int deviceId_; @@ -169,6 +183,7 @@ struct NvlsConnection::Impl { throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); } + // keepin a copy physicalMem around so that the user doesn't accidentally get rids of all of them. physicalMemoryStorage.push_back(physicalMem); MSCCLPP_CUTHROW( @@ -190,8 +205,6 @@ struct NvlsConnection::Impl { return std::shared_ptr(mcPtr, deleter); } - - // TODO: close all FDs and deallocate all handles. }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) From d570916030df76a17a86a16ae8e92b0104589dc3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:55:29 +0000 Subject: [PATCH 38/67] starting to look good --- include/mscclpp/gpu_utils.hpp | 3 +++ python/test/test_mscclpp.py | 8 ++++++-- src/connection.cc | 7 ++++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 032e375c5..857a5cea3 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -237,6 +237,9 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { + if (count % gran) { + throw Error("The request allocation size is not divisible by the required granularity", ErrorCode::InvalidUsage); + } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 83d121c70..aa282e9c1 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -540,8 +540,11 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, connection = create_and_connect(mpi_group, "NVLS") - nelem = 2**29 - mem_handle = connection.allocate_bind_memory(nelem) + nbytes = 2**21 + mem_handle = connection.allocate_bind_memory(nbytes) + + nbytes = 2**21 + mem_handle2 = connection.allocate_bind_memory(nbytes) kernel = MscclppKernel( "nvls", @@ -555,3 +558,4 @@ def test_nvls(mpi_group: MpiGroup): kernel() cp.cuda.runtime.deviceSynchronize() group.barrier() + time.sleep(100) diff --git a/src/connection.cc b/src/connection.cc index 9a6007b70..2235f6e4f 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -121,7 +121,7 @@ struct NvlsConnection::Impl { mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; bufferSize_ = mcProp_.size; MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; @@ -133,7 +133,8 @@ struct NvlsConnection::Impl { throw mscclpp::SysError("getpid() failed", errno); } - INFO(MSCCLPP_COLL, "NVLS handle created on root"); + INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", + mcProp_.size, minMcGran_, mcGran_); } Impl(const std::vector& data) : offset_(0) { @@ -198,7 +199,7 @@ struct NvlsConnection::Impl { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); offset_ += devBuffSize; From 9d43d4d6c1858e1d259bad9a9415a6175cc43c63 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 10:25:22 +0000 Subject: [PATCH 39/67] wip --- python/test/nvls_test.cu | 4 ++- python/test/test_mscclpp.py | 69 ++++++++++++++++++++----------------- src/connection.cc | 2 ++ 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index d2f3ada98..20353bdd0 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ @@ -15,7 +16,8 @@ : "memory"); extern "C" __global__ void __launch_bounds__(1024, 1) - nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { + nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, + mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { int tid = threadIdx.x; int bid = blockIdx.x; if (tid == 0 && bid == 0) { diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index aa282e9c1..06ef57a7c 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -116,19 +116,15 @@ def init_target(): mpi_group.comm.barrier() - -def create_and_connect(mpi_group: MpiGroup, transport: str): - if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: - pytest.skip("cannot use nvlink/nvls for cross node") - group = mscclpp_comm.CommGroup(mpi_group.comm) +def create_connection(group: mscclpp_comm.CommGroup, transport: str): if transport == "NVLS": - all_ranks = list(range(mpi_group.comm.size)) + all_ranks = list(range(group.nranks)) tran = Transport.Nvls connection = group.make_connection(all_ranks, tran) - return group, connection + return connection - remote_nghrs = list(range(mpi_group.comm.size)) - remote_nghrs.remove(mpi_group.comm.rank) + remote_nghrs = list(range(group.nranks)) + remote_nghrs.remove(group.my_rank) if transport == "NVLink": tran = Transport.CudaIpc elif transport == "IB": @@ -136,20 +132,27 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): else: assert False connections = group.make_connection(remote_nghrs, tran) - return group, connections + return connections + +def create_group_and_connection(mpi_group: MpiGroup, transport: str): + if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvlink/nvls for cross node") + group = mscclpp_comm.CommGroup(mpi_group.comm) + connection = create_connection(group, transport) + return group, connection @parametrize_mpi_groups(2, 4, 8, 16) 
@pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_group_with_connections(mpi_group: MpiGroup, transport: str): - create_and_connect(mpi_group, transport) + create_group_and_connection(mpi_group, transport) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks sizePerRank = nelemPerRank * memory.itemsize @@ -190,7 +193,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, if device == "cpu" and transport == "NVLink": pytest.skip("nvlink doesn't work with host allocated memory") - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) xp = cp if device == "cuda" else np if group.my_rank == 0: memory = xp.random.randn(nelem) @@ -234,7 +237,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "IB") + group, connections = create_group_and_connection(mpi_group, "IB") semaphores = group.make_semaphore(connections, Host2HostSemaphore) for rank in connections: @@ -247,7 +250,7 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores_gil_release(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "IB") + group, connections = create_group_and_connection(mpi_group, "IB") semaphores = group.make_semaphore(connections, Host2HostSemaphore) @@ -283,7 +286,8 @@ def __init__( use_packet=False, scratch=None, fifo=None, - nvls_mem_handle=None + nvls_mem_handle=None, + nvls_buffer_size=None ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -332,7 +336,7 @@ def __init__( assert False self.params = b"" - if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "simple_proxy_channel"]: + if semaphore_or_channels != None: first_arg = next(iter(semaphore_or_channels.values())) size_of_semaphore_or_channels = len(first_arg.device_handle().raw) device_handles = [] @@ -345,6 +349,8 @@ def __init__( device_handles.append(semaphore_or_channels[rank].device_handle().raw) # keep a reference to the device handles so that they don't get garbage collected self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(device_handles)), dtype=cp.uint8) + + if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "simple_proxy_channel"]: self.params += pack(self._d_semaphore_or_channels, my_rank, nranks) if test_name == "sm_channel": self.params += pack(tensor.size, use_packet) @@ -357,7 +363,7 @@ def __init__( self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": - self.params = nvls_mem_handle.device_handle().raw + pack(my_rank, nranks) + self.params = nvls_mem_handle.device_handle().raw + pack(self._d_semaphore_or_channels) + pack(my_rank, nranks, nvls_buffer_size) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, 
self.nthreads, 0, None) @@ -370,7 +376,7 @@ def signal(semaphores): for rank in semaphores: semaphores[rank].signal() - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) semaphores = group.make_semaphore(connections, Host2DeviceSemaphore) kernel = MscclppKernel("h2d_semaphore", group.my_rank, group.nranks, semaphores) @@ -386,7 +392,7 @@ def signal(semaphores): @parametrize_mpi_groups(2, 4, 8, 16) def test_d2d_semaphores(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "NVLink") + group, connections = create_group_and_connection(mpi_group, "NVLink") semaphores = group.make_semaphore(connections, SmDevice2DeviceSemaphore) group.barrier() @@ -400,7 +406,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): - group, connections = create_and_connect(mpi_group, "NVLink") + group, connections = create_group_and_connection(mpi_group, "NVLink") memory = cp.zeros(nelem, dtype=cp.int32) if use_packet: @@ -448,7 +454,7 @@ def test_fifo( @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks @@ -498,7 +504,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) if use_packet: @@ -539,23 +545,22 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): - group, connection = create_and_connect(mpi_group, "NVLS") + group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") nbytes = 2**21 - mem_handle = connection.allocate_bind_memory(nbytes) + mem_handle = nvls_connection.allocate_bind_memory(nbytes) - nbytes = 2**21 - mem_handle2 = connection.allocate_bind_memory(nbytes) + nvlinks_connections = create_connection(group, "NVLink") + semaphores = group.make_semaphore(nvlinks_connections, SmDevice2DeviceSemaphore) kernel = MscclppKernel( "nvls", my_rank=group.my_rank, nranks=group.nranks, - nvls_mem_handle=mem_handle + nvls_mem_handle=mem_handle, + nvls_buffer_size=nbytes, + semaphore_or_channels=semaphores ) + kernel() cp.cuda.runtime.deviceSynchronize() group.barrier() - kernel() - cp.cuda.runtime.deviceSynchronize() - group.barrier() - time.sleep(100) diff --git a/src/connection.cc b/src/connection.cc index 2235f6e4f..0fe8b228c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -173,6 +173,8 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); + + INFO(MSCCLPP_COLL, "NVLS unbound pointer 
%p.", ptr); } }; From d3f4243201feb4f4945b9086ea799a5e037ee4dd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 10:52:19 +0000 Subject: [PATCH 40/67] correctness check passes! --- python/test/nvls_test.cu | 60 +++++++++++++++++++++++++++++-------- python/test/test_mscclpp.py | 4 +-- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 20353bdd0..6f648b1f6 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -3,6 +3,10 @@ #include #include +#include +#include + +__device__ mscclpp::DeviceSyncer deviceSyncer; #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ @@ -18,27 +22,57 @@ extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { + + int nelem = nbytes/sizeof(float); + float* dev_ptr = (float*)nvlsPtrs.devicePtr; + float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; int bid = blockIdx.x; - if (tid == 0 && bid == 0) { - float* devPtr = (float*)nvlsPtrs.devicePtr; - devPtr[0] = 3; - devPtr[1] = 4; - devPtr[2] = 5; - devPtr[3] = 6; + + for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + dev_ptr[idx] = my_rank; + } + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0){ __threadfence_system(); } - if (tid == 0 && bid == 0 && my_rank == 0) { - float* devPtr = (float*)nvlsPtrs.devicePtr; - float* mcPtr = (float*)nvlsPtrs.mcPtr; + if (bid == 0){ + if (tid < nranks && tid != my_rank) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); + + int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; + int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; + + int my_offset = (tid + bid * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mcPtr); - MULTIMEM_ST(val, mcPtr); + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } + + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0){ __threadfence_system(); + } - float tmp = *(float*)&val.x; + if (bid == 0){ + if (tid < nranks && tid != my_rank) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); - printf("RRR %f %f\n", *devPtr, tmp); + for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + if (dev_ptr[idx] != ((nranks * (nranks-1))/2)){ + __assert_fail("dev_ptr[idx] != nranks", __FILE__, __LINE__, __PRETTY_FUNCTION__); + } } } diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 06ef57a7c..e7437c7ec 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -330,8 +330,8 @@ def __init__( self._kernel = KernelBuilder( file="nvls_test.cu", kernel_name="nvls_test", file_dir=file_dir ).get_compiled_kernel() - self.nblocks = 1 - self.nthreads = 1 + self.nblocks = 64 + self.nthreads = 1024 else: assert False From d30f557145fece4cbf69db21d03a383cb84bd6c3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 11:52:19 +0000 Subject: [PATCH 41/67] debugging --- include/mscclpp/gpu_utils.hpp | 5 +- python/mscclpp_benchmark/allreduce.cu | 64 +++++++++++++++++++ python/mscclpp_benchmark/allreduce_bench.py | 13 ++-- 
python/mscclpp_benchmark/mscclpp_op.py | 70 ++++++++++++++++++++- python/test/nvls_test.cu | 21 +++---- src/connection.cc | 7 ++- 6 files changed, 157 insertions(+), 23 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 857a5cea3..7580771f9 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -62,8 +62,9 @@ struct PhysicalCudaMemory { // deallocation will happen with CudaPhysicalDeleter. PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); + // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); + // printf("MMMMMMMMM %p\n", ptr); + // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); })) {} }; diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index e86047283..f7045878a 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -775,3 +776,66 @@ extern "C" __global__ void __launch_bounds__(1024, 1) globalFlag += 1; } } + +// ------------------------------------------- +// AllReduce6 +// NVLS +// ------------------------------------------- + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, + mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, TYPE* buff, int my_rank, int nranks, + size_t nbytes) { + int nelem = nbytes / sizeof(float); + float* dev_ptr = (float*)nvlsPtrs.devicePtr; + float* mc_ptr = (float*)nvlsPtrs.mcPtr; + int tid = threadIdx.x; + int bid = blockIdx.x; + + if (tid == 0 && bid == 0) { + __threadfence_system(); + } + if (bid == 0) { + if (tid < nranks - 1) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); + + int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; + int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; + + int my_offset = (tid + bid * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } + + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0) { + __threadfence_system(); + } + + if (bid == 0) { + if (tid < nranks - 1) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); +} \ No newline at end of file diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 5a3987cd3..bebbf1c47 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
import cupy as cp -from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 +from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5, MscclppAllReduce6 from nccl_op import NcclAllReduce from mpi4py import MPI import cupy.cuda.nccl as nccl @@ -145,10 +145,11 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - if memory.nbytes < 2**20: - mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - else: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + mscclpp_call = MscclppAllReduce6(mscclpp_group, memory) + # if memory.nbytes < 2**20: + # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) + # else: + # mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) # else: # proxy_service = ProxyService() # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) @@ -171,7 +172,7 @@ def run_benchmark( memory_nbytes = memory.nbytes mscclpp_time = bench_time(niter, mscclpp_call) mscclpp_algBw = memory_nbytes / mscclpp_time / 1e3 - mscclpp_check = "PASS" if check_correctness(memory, mscclpp_call) else "FAIL" + mscclpp_check = "PASS" #if check_correctness(memory, mscclpp_call) else "FAIL" nccl_time = bench_time(niter, nccl_call) nccl_algBw = memory_nbytes / nccl_time / 1e3 diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index ab51f7c84..662fc5bc0 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -1,7 +1,7 @@ import os import cupy as cp import ctypes -from mscclpp import Transport, ProxyService +from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack @@ -418,3 +418,71 @@ def auto_tune(self): for block_size in block_size_to_try: self.set_params(nblocks, block_size) yield nblocks, block_size + + + +class MscclppAllReduce6: + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + block_size: int = 1024, + nblocks: int = 32, + ): + self.group = group + self.memory = memory + type_str = type_to_str(memory.dtype) + all_ranks = list(range(group.nranks)) + remote_nghrs = all_ranks.copy() + remote_nghrs.remove(self.group.my_rank) + + self.group.barrier() + # create a connection for each remote neighbor + self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) + self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls) + self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(2**29) # just using recommended size for now + + # create a sm_channel for each remote neighbor + self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", + kernel_name="allreduce6", + file_dir=file_dir, + macro_dict={"TYPE": type_str}, + ).get_compiled_kernel() + self.device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank: + self.device_handles.append(self.semaphores[rank].device_handle().raw) + + self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) + self.nvls_handle = self.nvls_mem_handle.device_handle().raw + + self.set_params(nblocks, block_size) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, 
self.block_size, 0, stream_ptr) + return self.memory + + def set_params(self, nblocks, block_size): + self.nblocks = nblocks + self.block_size = block_size + self.params = b"" + self.params += pack( + self.device_handles_cp, + self.nvls_handle, + # self.memory, + self.group.my_rank, + self.group.nranks, + # ctypes.c_size_t(self.memory.size), + ) + + def auto_tune(self): + nblocks_to_try = [8, 12, 16, 24, 32, 48, 64, 72, 96, 108] + block_size_to_try = [256, 512, 1024] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + self.set_params(nblocks, block_size) + yield nblocks, block_size + diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 6f648b1f6..5001072ac 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include -#include #include +#include #include +#include __device__ mscclpp::DeviceSyncer deviceSyncer; @@ -22,22 +22,21 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { - - int nelem = nbytes/sizeof(float); + int nelem = nbytes / sizeof(float); float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; int bid = blockIdx.x; - for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { dev_ptr[idx] = my_rank; } deviceSyncer.sync(gridDim.x); - if (tid == 0 && bid == 0){ + if (tid == 0 && bid == 0) { __threadfence_system(); } - if (bid == 0){ + if (bid == 0) { if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); semaphores[tid].wait(); @@ -58,11 +57,11 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } deviceSyncer.sync(gridDim.x); - if (tid == 0 && bid == 0){ + if (tid == 0 && bid == 0) { __threadfence_system(); } - if (bid == 0){ + if (bid == 0) { if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); semaphores[tid].wait(); @@ -70,8 +69,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } deviceSyncer.sync(gridDim.x); - for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ - if (dev_ptr[idx] != ((nranks * (nranks-1))/2)){ + for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { + if (dev_ptr[idx] != ((nranks * (nranks - 1)) / 2)) { __assert_fail("dev_ptr[idx] != nranks", __FILE__, __LINE__, __PRETTY_FUNCTION__); } } diff --git a/src/connection.cc b/src/connection.cc index 0fe8b228c..ffb249920 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -170,9 +170,10 @@ struct NvlsConnection::Impl { MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); + // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); + // printf("NNNNNN %p\n", ptr); + // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); + // MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); 
INFO(MSCCLPP_COLL, "NVLS unbound pointer %p.", ptr); } From 572b30b3c7f7308a94a6fce572aca6cac1b40c66 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 12:59:01 +0000 Subject: [PATCH 42/67] update --- include/mscclpp/core.hpp | 1 + include/mscclpp/gpu_utils.hpp | 22 ++++++++-------------- src/bootstrap/bootstrap.cc | 14 ++++++++++++++ src/communicator.cc | 19 ++++++------------- src/connection.cc | 10 +++++----- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index f22f9009c..91c82fe3b 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -43,6 +43,7 @@ class Bootstrap { virtual void allGather(void* allData, int size) = 0; virtual void barrier() = 0; + void groupBarrier(const std::vector& ranks); void send(const std::vector& data, int peer, int tag); void recv(std::vector& data, int peer, int tag); }; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 7580771f9..792e754ef 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -56,16 +56,10 @@ struct CudaDeleter; template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; - size_t bufferSize_; - std::shared_ptr devicePtr_; - // The deallocator for devicePtr will only unmap and free the address range. The physical memory - // deallocation will happen with CudaPhysicalDeleter. - PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) - : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { - // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); - // printf("MMMMMMMMM %p\n", ptr); - // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); - })) {} + T* devicePtr_; + size_t size_; + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t size) + : memHandle_(memHandle), devicePtr_(devicePtr), size_(size) {} }; namespace detail { @@ -201,12 +195,12 @@ struct CudaDeleter { template struct CudaPhysicalDeleter { - using TPtrOrArray = - std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; - void operator()(TPtrOrArray ptr) { + static_assert(!std::is_array_v, "T must not be an array"); + void operator()(PhysicalCudaMemory* ptr) { AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr->devicePtr_, ptr->size_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr->devicePtr_, ptr->size_)); MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_)); - delete ptr; } }; diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 649a1f62e..00a58b992 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -35,6 +35,20 @@ struct ExtInfo { SocketAddress extAddressListen; }; +MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector& ranks) { + int dummy = 0; + for (auto rank : ranks) { + if (rank != this->getRank()) { + this->send(static_cast(&dummy), sizeof(dummy), rank, 0); + } + } + for (auto rank : ranks) { + if (rank != this->getRank()) { + this->recv(static_cast(&dummy), sizeof(dummy), rank, 0); + } + } +} + MSCCLPP_API_CPP void Bootstrap::send(const std::vector& data, int peer, int tag) { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); diff --git a/src/communicator.cc b/src/communicator.cc index 6c1849aae..d5c3e9ed4 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -3,6 +3,8 @@ #include "communicator.hpp" +#include + #include "api.h" 
#include "debug.h" @@ -111,6 +113,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti int myRank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; + std::sort(allRanks.begin(), allRanks.end()); int rootRank = allRanks[0]; for (auto nvlsRank : allRanks) { if (nvlsRank == myRank) amongAllRanks = true; @@ -122,7 +125,6 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti if (rootRank == myRank) isRoot = true; std::shared_ptr conn; - if (isRoot) { conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); @@ -136,21 +138,12 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti } // Now let's synchronize all ranks - int dummy = 0; - for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) { - bootstrap->send(static_cast(&dummy), sizeof(dummy), nvlsRank, 0); - } - } - for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) { - bootstrap->recv(static_cast(&dummy), sizeof(dummy), nvlsRank, 0); - } - } - + bootstrap->groupBarrier(allRanks); // now it is safe to add my device conn->addDevice(); + // sync here to make sure all ranks have added their devices + bootstrap->groupBarrier(allRanks); return conn; } diff --git a/src/connection.cc b/src/connection.cc index ffb249920..fce7404ed 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -167,6 +167,7 @@ struct NvlsConnection::Impl { int deviceId_; size_t offset_; size_t bufferSize_; + MultiCastBindDeleter() = default; MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { @@ -217,14 +218,11 @@ NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) void NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - - INFO(MSCCLPP_COLL, "NVLS connection created"); + this->addDevice(cudaDeviceId); } void NvlsConnection::addDevice(int cudaDeviceId) { MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - INFO(MSCCLPP_COLL, "NVLS connection created"); } @@ -236,11 +234,13 @@ std::vector NvlsConnection::serialize() { return result; } +// TODO: we need to atuo delete the memory we multicast pointer is no used anymore std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); auto mcPtr = pimpl_->bindMemory(mem, size); auto ret = std::make_shared(); - ret->devicePtr_ = mem->devicePtr_; + // hack, need to update + ret->devicePtr_ = std::shared_ptr(mem->devicePtr_, NvlsConnection::Impl::MultiCastBindDeleter()); ret->mcPtr_ = mcPtr; ret->bufferSize_ = size; return ret; From 8a710abd69396b24071f925dd976b20c1cb169f8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 14:26:27 +0000 Subject: [PATCH 43/67] update --- include/mscclpp/core.hpp | 5 ++++- src/connection.cc | 35 +++++++++-------------------------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 91c82fe3b..589dcf30c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -464,12 +464,15 @@ class NvlsConnection { struct DeviceMulticastPointer { private: - std::shared_ptr devicePtr_; + std::shared_ptr> deviceMem_; std::shared_ptr mcPtr_; size_t bufferSize_; public: using DeviceHandle = 
DeviceMulticastPointerDeviceHandle; + DeviceMulticastPointer(std::shared_ptr> deviceMem, std::shared_ptr mcPtr, + size_t bufferSize) + : deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {} DeviceHandle deviceHandle(); friend class NvlsConnection; diff --git a/src/connection.cc b/src/connection.cc index fce7404ed..2933b0659 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -162,24 +162,6 @@ struct NvlsConnection::Impl { // we don't need to free multicast handle object according to NCCL. } - struct MultiCastBindDeleter { - CUmemGenericAllocationHandle mcHandle_; - int deviceId_; - size_t offset_; - size_t bufferSize_; - MultiCastBindDeleter() = default; - MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) - : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} - void operator()(char* ptr) { - // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); - // printf("NNNNNN %p\n", ptr); - // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); - // MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); - - INFO(MSCCLPP_COLL, "NVLS unbound pointer %p.", ptr); - } - }; - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { if (offset_ > bufferSize_) { throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); @@ -205,7 +187,13 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); + + // Is this enough? 
Or we should update the offset as well + auto deleter = [=](char* ptr) { + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId, offset_, devBuffSize)); + }; offset_ += devBuffSize; return std::shared_ptr(mcPtr, deleter); @@ -238,18 +226,13 @@ std::vector NvlsConnection::serialize() { std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); auto mcPtr = pimpl_->bindMemory(mem, size); - auto ret = std::make_shared(); - // hack, need to update - ret->devicePtr_ = std::shared_ptr(mem->devicePtr_, NvlsConnection::Impl::MultiCastBindDeleter()); - ret->mcPtr_ = mcPtr; - ret->bufferSize_ = size; - return ret; + return std::make_shared(mem, mcPtr, size); } MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { NvlsConnection::DeviceMulticastPointer::DeviceHandle device; - device.devicePtr = this->devicePtr_.get(); + device.devicePtr = this->deviceMem_->devicePtr_; device.mcPtr = this->mcPtr_.get(); device.bufferSize = this->bufferSize_; return device; From 2aeae96e2697dc5036d4a99bb76d406951a9bc11 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 15:07:54 +0000 Subject: [PATCH 44/67] fix --- src/connection.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 2933b0659..24b043396 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -189,10 +189,12 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); // Is this enough? Or we should update the offset as well - auto deleter = [=](char* ptr) { + auto deleter = [=, bindOffset = offset_](char* ptr) { + CUdevice device; + MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId, offset_, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, bindOffset, devBuffSize)); }; offset_ += devBuffSize; From 43d60ae5ec2b929ebbaf940be8c95c80ec8c7c23 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 03:10:49 +0000 Subject: [PATCH 45/67] fix benchmark --- python/mscclpp_benchmark/allreduce.cu | 3 +-- python/mscclpp_benchmark/allreduce_bench.py | 2 +- python/mscclpp_benchmark/mscclpp_op.py | 5 ++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index f7045878a..56aeb572b 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -796,8 +796,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) extern "C" __global__ void __launch_bounds__(1024, 1) allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, TYPE* buff, int my_rank, int nranks, - size_t nbytes) { - int nelem = nbytes / sizeof(float); + size_t nelem) { float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index bebbf1c47..aaea7bc71 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py 
@@ -248,7 +248,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 28): + for i in range(10, 21): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index 662fc5bc0..c5e81c3c4 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -472,10 +472,10 @@ def set_params(self, nblocks, block_size): self.params += pack( self.device_handles_cp, self.nvls_handle, - # self.memory, + self.memory, self.group.my_rank, self.group.nranks, - # ctypes.c_size_t(self.memory.size), + ctypes.c_size_t(self.memory.size), ) def auto_tune(self): @@ -485,4 +485,3 @@ def auto_tune(self): for block_size in block_size_to_try: self.set_params(nblocks, block_size) yield nblocks, block_size - From a9e274b44b18b15dd937ac2537088a6a7268707c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 Jan 2024 04:54:11 +0000 Subject: [PATCH 46/67] a bit clean up --- python/mscclpp/core_py.cpp | 6 +++--- python/mscclpp_benchmark/allreduce_bench.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index a988151ef..452f95b21 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -126,10 +126,10 @@ void register_core(nb::module_& m) { .def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); - nb::class_ deviceMulticastPointer(m, "DeviceMulticastPointer"); - deviceMulticastPointer.def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); + nb::class_(m, "DeviceMulticastPointer") + .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); - nb::class_(deviceMulticastPointer, "DeviceHandle") + nb::class_(m, "DeviceHandle") .def(nb::init<>()) .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index aaea7bc71..bebbf1c47 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -248,7 +248,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 21): + for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: From c1ebc3e197da8d72662244ad70f473adc9d03426 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:18:27 +0000 Subject: [PATCH 47/67] correctness works now --- include/mscclpp/core.hpp | 3 +++ include/mscclpp/gpu_utils.hpp | 4 ++- python/mscclpp/core_py.cpp | 6 ++++- python/mscclpp_benchmark/allreduce_bench.py | 22 +++++++++++------ python/mscclpp_benchmark/mscclpp_op.py | 22 +++++++++++++---- python/test/test_mscclpp.py | 27 +++++++++++++-------- src/connection.cc | 8 +++--- 7 files changed, 65 insertions(+), 27 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 589dcf30c..d7e6467b0 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -474,12 +474,15 @@ class NvlsConnection { size_t bufferSize) : deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {} DeviceHandle deviceHandle(); + char* getDevicePtr(); friend class NvlsConnection; }; std::shared_ptr 
allocateAndBindCuda(size_t size); + size_t getMultiCastMinGranularity(); + private: struct Impl; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 792e754ef..b3b8ec7bb 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -233,7 +233,9 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { if (count % gran) { - throw Error("The request allocation size is not divisible by the required granularity", ErrorCode::InvalidUsage); + throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(count) + + " vs " + std::to_string(gran), + ErrorCode::InvalidUsage); } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 452f95b21..996cd3d99 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -127,6 +127,8 @@ void register_core(nb::module_& m) { .def("remote_transport", &Connection::remoteTransport); nb::class_(m, "DeviceMulticastPointer") + .def("get_device_ptr", + [](NvlsConnection::DeviceMulticastPointer* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); nb::class_(m, "DeviceHandle") @@ -138,7 +140,9 @@ void register_core(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection").def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda); + nb::class_(m, "NvlsConnection") + .def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda) + .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity); nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index bebbf1c47..a8faa7a68 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -2,7 +2,14 @@ # Licensed under the MIT license. 
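# Sketch of how the new NvlsConnection bindings are meant to be used (this mirrors MscclppAllReduce6
# in mscclpp_op.py further down; nvls_connection, nbytes, nelem, and dtype are assumed to exist):
# round the requested size up to the multicast minimum granularity before allocate_bind_memory,
# then wrap the returned raw device pointer as a cupy array.
#   min_gran = nvls_connection.get_multicast_min_granularity()
#   aligned = ((nbytes + min_gran - 1) // min_gran) * min_gran
#   handle = nvls_connection.allocate_bind_memory(aligned)
#   memory = cp.ndarray(
#       nelem, dtype, cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(handle.get_device_ptr(), aligned, None), 0)
#   )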
import cupy as cp -from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5, MscclppAllReduce6 +from mscclpp_op import ( + MscclppAllReduce1, + MscclppAllReduce2, + MscclppAllReduce3, + MscclppAllReduce4, + MscclppAllReduce5, + MscclppAllReduce6, +) from nccl_op import NcclAllReduce from mpi4py import MPI import cupy.cuda.nccl as nccl @@ -145,7 +152,8 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - mscclpp_call = MscclppAllReduce6(mscclpp_group, memory) + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() # if memory.nbytes < 2**20: # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) # else: @@ -155,7 +163,7 @@ def run_benchmark( # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) # proxy_service.start_proxy() else: - if memory.nbytes < 2**22: + if memory.nbytes < 2 ** 22: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() @@ -172,7 +180,7 @@ def run_benchmark( memory_nbytes = memory.nbytes mscclpp_time = bench_time(niter, mscclpp_call) mscclpp_algBw = memory_nbytes / mscclpp_time / 1e3 - mscclpp_check = "PASS" #if check_correctness(memory, mscclpp_call) else "FAIL" + mscclpp_check = "PASS" if check_correctness(memory, mscclpp_call) else "FAIL" nccl_time = bench_time(niter, nccl_call) nccl_algBw = memory_nbytes / nccl_time / 1e3 @@ -250,13 +258,13 @@ def run_benchmark( speed_ups = [] for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - nelems = 2**i + nelems = 2 ** i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: - nelems = 3 * 2**i + nelems = 3 * 2 ** i else: raise RuntimeError("Only support one node/two nodes communication") - if nelems * data_type().itemsize > 2**32: + if nelems * data_type().itemsize > 2 ** 32: break # due to trigger bit width limitation, we can only support up to 2**32 size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index c5e81c3c4..f36cc5f61 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -420,18 +420,19 @@ def auto_tune(self): yield nblocks, block_size - class MscclppAllReduce6: def __init__( self, group: mscclpp_comm.CommGroup, - memory: cp.ndarray, + nelem: int, + memory_dtype: cp.dtype, block_size: int = 1024, nblocks: int = 32, ): self.group = group - self.memory = memory - type_str = type_to_str(memory.dtype) + datatype_size = memory_dtype().itemsize + buffer_size = nelem * datatype_size + type_str = type_to_str(memory_dtype) all_ranks = list(range(group.nranks)) remote_nghrs = all_ranks.copy() remote_nghrs.remove(self.group.my_rank) @@ -440,7 +441,15 @@ def __init__( # create a connection for each remote neighbor self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls) - self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(2**29) # just using recommended size for now + min_gran = self.nvls_connection.get_multicast_min_granularity() + aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran) + self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory( + aligned_buffer_size + ) # just using recommended 
size for now + self.memory_ptr = self.nvls_mem_handle.get_device_ptr() + + self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0) + self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr) # create a sm_channel for each remote neighbor self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) @@ -461,6 +470,9 @@ def __init__( self.set_params(nblocks, block_size) + def get_memory(self): + return self.memory + def __call__(self, stream_ptr): self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) return self.memory diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index e7437c7ec..45f11574c 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -116,13 +116,14 @@ def init_target(): mpi_group.comm.barrier() + def create_connection(group: mscclpp_comm.CommGroup, transport: str): if transport == "NVLS": all_ranks = list(range(group.nranks)) tran = Transport.Nvls connection = group.make_connection(all_ranks, tran) return connection - + remote_nghrs = list(range(group.nranks)) remote_nghrs.remove(group.my_rank) if transport == "NVLink": @@ -134,6 +135,7 @@ def create_connection(group: mscclpp_comm.CommGroup, transport: str): connections = group.make_connection(remote_nghrs, tran) return connections + def create_group_and_connection(mpi_group: MpiGroup, transport: str): if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") @@ -150,7 +152,7 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) @@ -185,7 +187,7 @@ def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20, 27]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20, 27]]) @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, nelem: int, device: str): # this test starts with a random tensor on rank 0 and rotates it all the way through all ranks @@ -287,7 +289,7 @@ def __init__( scratch=None, fifo=None, nvls_mem_handle=None, - nvls_buffer_size=None + nvls_buffer_size=None, ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -363,7 +365,11 @@ def __init__( self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": - self.params = nvls_mem_handle.device_handle().raw + pack(self._d_semaphore_or_channels) + pack(my_rank, nranks, nvls_buffer_size) + self.params = ( + nvls_mem_handle.device_handle().raw + + pack(self._d_semaphore_or_channels) + + pack(my_rank, nranks, nvls_buffer_size) + ) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, 
None) @@ -403,7 +409,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") @@ -451,7 +457,7 @@ def test_fifo( @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_group_and_connection(mpi_group, transport) @@ -500,7 +506,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): @@ -543,10 +549,11 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) + @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") - nbytes = 2**21 + nbytes = 2 ** 21 mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") @@ -558,7 +565,7 @@ def test_nvls(mpi_group: MpiGroup): nranks=group.nranks, nvls_mem_handle=mem_handle, nvls_buffer_size=nbytes, - semaphore_or_channels=semaphores + semaphore_or_channels=semaphores, ) kernel() diff --git a/src/connection.cc b/src/connection.cc index 24b043396..889dbe069 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -10,7 +10,6 @@ #include #include -#include "api.h" #include "debug.h" #include "endpoint.hpp" #include "infiniband/verbs.h" @@ -231,8 +230,7 @@ std::shared_ptr NvlsConnection::allocate return std::make_shared(mem, mcPtr, size); } -MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle -NvlsConnection::DeviceMulticastPointer::deviceHandle() { +NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { NvlsConnection::DeviceMulticastPointer::DeviceHandle device; device.devicePtr = this->deviceMem_->devicePtr_; device.mcPtr = this->mcPtr_.get(); @@ -240,6 +238,10 @@ NvlsConnection::DeviceMulticastPointer::deviceHandle() { return device; }; +char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; + +size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->minMcGran_; } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 96303cebdd2ed8f043edf26f0860f1b1e8353ef1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:34:16 +0000 Subject: [PATCH 48/67] all works for h100 --- python/mscclpp_benchmark/allreduce_bench.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index a8faa7a68..e4b854594 100644 --- 
a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -152,16 +152,18 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - # if memory.nbytes < 2**20: - # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - # else: - # mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) - # else: - # proxy_service = ProxyService() - # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - # proxy_service.start_proxy() + # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + # memory = mscclpp_call.get_memory() + if memory.nbytes < 2**20: + mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) + elif memory.nbytes < 2**21: + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + else: + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() + # proxy_service = ProxyService() + # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + # proxy_service.start_proxy() else: if memory.nbytes < 2 ** 22: proxy_service = ProxyService() From feca28f1ef0bed0fd4d4ee074dca291d269f9e3b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:57:41 +0000 Subject: [PATCH 49/67] lint --- python/mscclpp_benchmark/allreduce_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index e4b854594..12c6bf7f0 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -154,9 +154,9 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) # memory = mscclpp_call.get_memory() - if memory.nbytes < 2**20: + if memory.nbytes < 2 ** 20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21: + elif memory.nbytes < 2 ** 21: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) From 21820a69f91c157d153b2a97f39c2bf88b39ec8c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 22 Jan 2024 10:01:11 +0000 Subject: [PATCH 50/67] works --- src/connection.cc | 114 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 889dbe069..df7c6c669 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -14,7 +14,6 @@ #include "endpoint.hpp" #include "infiniband/verbs.h" #include "npkit/npkit.h" -#include "registered_memory.hpp" namespace mscclpp { @@ -98,7 +97,7 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -struct NvlsConnection::Impl { +struct NvlsConnection::Impl : public std::enable_shared_from_this { CUmemGenericAllocationHandle mcHandle_; size_t bufferSize_; CUmulticastObjectProp mcProp_; @@ -107,11 +106,12 @@ struct NvlsConnection::Impl { // These are only defined for multicast (NVLS) capability pid_t rootPid_; int mcFileDesc_; - size_t offset_; - std::vector>> physicalMemoryStorage; + + std::list> allocatedRanges_; + std::list> freeRanges_; // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) : offset_(0) { + Impl(size_t bufferSize, int numDevices) { minMcGran_ = 0; mcGran_ = 0; mcProp_ = {}; @@ -126,6 +126,7 @@ 
struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + freeRanges_.emplace_back(0, bufferSize_); rootPid_ = getpid(); if (rootPid_ < 0) { @@ -136,11 +137,21 @@ struct NvlsConnection::Impl { mcProp_.size, minMcGran_, mcGran_); } - Impl(const std::vector& data) : offset_(0) { + Impl(const std::vector& data) { auto it = data.begin(); - std::copy_n(it, sizeof(*this), reinterpret_cast(this)); - - // TODO: proper throw + std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); + it += sizeof(this->mcHandle_); + std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); + it += sizeof(this->bufferSize_); + std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); + it += sizeof(this->minMcGran_); + std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); + it += sizeof(this->mcGran_); + std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); + it += sizeof(this->rootPid_); + std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); + + freeRanges_.emplace_back(0, bufferSize_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); if (rootPidFd < 0) { throw mscclpp::SysError("pidfd_open() failed", errno); @@ -159,21 +170,76 @@ struct NvlsConnection::Impl { ~Impl() { // we don't need to free multicast handle object according to NCCL. + if (rootPid_ == getpid()) { + close(mcFileDesc_); + } } - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { - if (offset_ > bufferSize_) { - throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + size_t allocateBuffer(size_t size) { + if (freeRanges_.empty()) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); } - if (bufferSize_ - offset_ < devBuffSize) { - throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), + [size](const std::pair& range) { return range.second >= size; }); + if (it != freeRanges_.end()) { + size_t offset = it->first; + size_t rangeSize = it->second; + if (rangeSize == size) { + freeRanges_.erase(it); + } else { + it->first += size; + it->second -= size; + } + allocatedRanges_.emplace_back(offset, size); + return offset; } + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + } - // keepin a copy physicalMem around so that the user doesn't accidentally get rids of all of them. 
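    // Worked example of the free-range bookkeeping in allocateBuffer/freeBuffer above (sizes are
    // illustrative, using the 2^29 buffer from the benchmark):
    //   freeRanges_ = {(0, 2^29)}
    //   allocateBuffer(2^21) -> offset 0;    freeRanges_ = {(2^21, 2^29 - 2^21)}
    //   allocateBuffer(2^21) -> offset 2^21; freeRanges_ = {(2^22, 2^29 - 2^22)}
    //   freeBuffer(0, 2^21);                 freeRanges_ = {(0, 2^21), (2^22, 2^29 - 2^22)}
    //   freeBuffer(2^21, 2^21);              // merges with both neighbors -> {(0, 2^29)}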
- physicalMemoryStorage.push_back(physicalMem); + void freeBuffer(size_t offset, size_t size) noexcept { + auto it = std::find_if(allocatedRanges_.begin(), allocatedRanges_.end(), + [offset, size](const std::pair& range) { + return range.first == offset && range.second == size; + }); + if (it == allocatedRanges_.end()) { + return; + } + allocatedRanges_.erase(it); + it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, size](const std::pair& range) { + return range.first + range.second >= offset; + }); + if (it == freeRanges_.end()) { + freeRanges_.emplace_back(offset, size); + return; + } + if (it->first + it->second == offset) { + // merge with the previous free range if possible + it->second += size; + // merge with the next free range if possible + auto nextItr = std::next(it); + if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { + it->second += nextItr->second; + freeRanges_.erase(nextItr); + } + return; + } else if (it->first == offset + size) { + // merge with the next free range if possible + it->first -= size; + it->second += size; + return; + } else { + freeRanges_.emplace(it, offset, size); + return; + } + } + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { + size_t offset = allocateBuffer(devBuffSize); MSCCLPP_CUTHROW( - cuMulticastBindMem(mcHandle_, offset_ /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); char* mcPtr; @@ -187,15 +253,14 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - // Is this enough? 
Or we should update the offset as well - auto deleter = [=, bindOffset = offset_](char* ptr) { + auto deleter = [=, self = shared_from_this()](char* ptr) { CUdevice device; MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, bindOffset, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); + self->freeBuffer(offset, devBuffSize); }; - offset_ += devBuffSize; return std::shared_ptr(mcPtr, deleter); } @@ -219,7 +284,12 @@ NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make std::vector NvlsConnection::serialize() { std::vector result; - std::copy_n(reinterpret_cast(pimpl_.get()), sizeof(*pimpl_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcHandle_), sizeof(pimpl_->mcHandle_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->bufferSize_), sizeof(pimpl_->bufferSize_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->minMcGran_), sizeof(pimpl_->minMcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcGran_), sizeof(pimpl_->mcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(result)); return result; } From 20f3b0f1a570be4f1dd482aa89a62dd10f2f305d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 04:12:23 +0000 Subject: [PATCH 51/67] clean up --- include/mscclpp/core.hpp | 2 - include/mscclpp/gpu_utils.hpp | 10 +- src/CMakeLists.txt | 2 +- src/connection.cc | 217 --------------------------- src/nvls_connection.cu | 270 ++++++++++++++++++++++++++++++++++ 5 files changed, 279 insertions(+), 222 deletions(-) create mode 100644 src/nvls_connection.cu diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index d7e6467b0..f208c9a9b 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -480,12 +480,10 @@ class NvlsConnection { }; std::shared_ptr allocateAndBindCuda(size_t size); - size_t getMultiCastMinGranularity(); private: struct Impl; - std::shared_ptr pimpl_; }; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index b3b8ec7bb..5e4e1c625 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -228,7 +228,7 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// The deallocation only happens PhysicalCudaMemory goes out of scope. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. -/// @param gran the granularity forof the allocation. +/// @param gran the granularity of the allocation. /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { @@ -264,8 +264,14 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); } +/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it. +/// The memory is zeroed out. +/// @tparam T Type of each element in the allocated memory. +/// @param count Number of elements to allocate. +/// @param gran the granularity of the allocation. 
+/// @return A std::unique_ptr to the memory handle and a device pointer for that memory. template -std::shared_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { +std::unique_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::unique_ptr, CudaDeleter>>>(count, gran); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfbcc927a..45b4075d2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/connection.cc b/src/connection.cc index df7c6c669..f89b96138 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -95,223 +95,6 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { INFO(MSCCLPP_P2P, "CudaIpcConnection flushing connection"); } -// NVLS - -struct NvlsConnection::Impl : public std::enable_shared_from_this { - CUmemGenericAllocationHandle mcHandle_; - size_t bufferSize_; - CUmulticastObjectProp mcProp_; - size_t minMcGran_; - size_t mcGran_; - // These are only defined for multicast (NVLS) capability - pid_t rootPid_; - int mcFileDesc_; - - std::list> allocatedRanges_; - std::list> freeRanges_; - - // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) { - minMcGran_ = 0; - mcGran_ = 0; - mcProp_ = {}; - mcProp_.size = bufferSize; - mcProp_.numDevices = numDevices; - mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; - bufferSize_ = mcProp_.size; - MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); - mcFileDesc_ = 0; - MSCCLPP_CUTHROW( - cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - freeRanges_.emplace_back(0, bufferSize_); - - rootPid_ = getpid(); - if (rootPid_ < 0) { - throw mscclpp::SysError("getpid() failed", errno); - } - - INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", - mcProp_.size, minMcGran_, mcGran_); - } - - Impl(const std::vector& data) { - auto it = data.begin(); - std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); - it += sizeof(this->mcHandle_); - std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); - it += sizeof(this->bufferSize_); - std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); - it += sizeof(this->minMcGran_); - std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); - it += sizeof(this->mcGran_); - std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); - it += sizeof(this->rootPid_); - std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); - - freeRanges_.emplace_back(0, bufferSize_); - int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - if (rootPidFd < 0) { - throw mscclpp::SysError("pidfd_open() failed", errno); - } - int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - if (mcRootFileDescFd < 0) { - throw mscclpp::SysError("pidfd_getfd() failed", errno); - } - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); - close(rootPidFd); - close(mcRootFileDescFd); - - INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); - } - - ~Impl() { - // we don't need to free multicast handle object according to NCCL. - if (rootPid_ == getpid()) { - close(mcFileDesc_); - } - } - - Impl(const Impl&) = delete; - Impl& operator=(const Impl&) = delete; - - size_t allocateBuffer(size_t size) { - if (freeRanges_.empty()) { - throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); - } - auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), - [size](const std::pair& range) { return range.second >= size; }); - if (it != freeRanges_.end()) { - size_t offset = it->first; - size_t rangeSize = it->second; - if (rangeSize == size) { - freeRanges_.erase(it); - } else { - it->first += size; - it->second -= size; - } - allocatedRanges_.emplace_back(offset, size); - return offset; - } - throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); - } - - void freeBuffer(size_t offset, size_t size) noexcept { - auto it = std::find_if(allocatedRanges_.begin(), allocatedRanges_.end(), - [offset, size](const std::pair& range) { - return range.first == offset && range.second == size; - }); - if (it == allocatedRanges_.end()) { - return; - } - allocatedRanges_.erase(it); - it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, size](const std::pair& range) { - return range.first + range.second >= offset; - }); - if (it == freeRanges_.end()) { - freeRanges_.emplace_back(offset, size); - return; - } - if (it->first + it->second == offset) { - // merge with the previous free range if possible - it->second += size; - // merge with the next free range if possible - auto nextItr = std::next(it); - if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { - it->second += nextItr->second; - freeRanges_.erase(nextItr); - } - return; - } else if (it->first == offset + size) { - // merge with the next free range if possible - it->first -= size; - it->second += size; - return; - } else { - freeRanges_.emplace(it, offset, size); - return; - } - } - - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { - size_t offset = 
allocateBuffer(devBuffSize); - MSCCLPP_CUTHROW( - cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); - - char* mcPtr; - - CUmemAccessDesc accessDesc = {}; - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - int deviceId = -1; - MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - accessDesc.location.id = deviceId; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); - MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - - auto deleter = [=, self = shared_from_this()](char* ptr) { - CUdevice device; - MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); - self->freeBuffer(offset, devBuffSize); - }; - - return std::shared_ptr(mcPtr, deleter); - } -}; - -NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) - : pimpl_(std::make_shared(bufferSize, numDevices)) {} - -void NvlsConnection::addDevice() { - int cudaDeviceId; - MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - this->addDevice(cudaDeviceId); -} - -void NvlsConnection::addDevice(int cudaDeviceId) { - MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - INFO(MSCCLPP_COLL, "NVLS connection created"); -} - -NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} - -std::vector NvlsConnection::serialize() { - std::vector result; - std::copy_n(reinterpret_cast(&pimpl_->mcHandle_), sizeof(pimpl_->mcHandle_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->bufferSize_), sizeof(pimpl_->bufferSize_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->minMcGran_), sizeof(pimpl_->minMcGran_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->mcGran_), sizeof(pimpl_->mcGran_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(result)); - return result; -} - -// TODO: we need to atuo delete the memory we multicast pointer is no used anymore -std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { - auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); - auto mcPtr = pimpl_->bindMemory(mem, size); - return std::make_shared(mem, mcPtr, size); -} - -NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { - NvlsConnection::DeviceMulticastPointer::DeviceHandle device; - device.devicePtr = this->deviceMem_->devicePtr_; - device.mcPtr = this->mcPtr_.get(); - device.bufferSize = this->bufferSize_; - return device; -}; - -char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; - -size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->minMcGran_; } - // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/nvls_connection.cu b/src/nvls_connection.cu new file mode 100644 index 000000000..f6655fb82 --- /dev/null +++ b/src/nvls_connection.cu @@ -0,0 +1,270 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT license. + +#include +#include + +#include +#include +#include + +#include "debug.h" +#include "endpoint.hpp" + +namespace mscclpp { + +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) +class NvlsConnection::Impl : public std::enable_shared_from_this { + public: + // use this only for the root of the NVLS + Impl(size_t bufferSize, int numDevices); + Impl(const std::vector& data); + ~Impl(); + + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + size_t getMinMcGran() { return minMcGran_; } + std::vector serialize(); + void addDevice(int cudaDeviceId); + size_t allocateBuffer(size_t size); + void freeBuffer(size_t offset, size_t size) noexcept; + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize); + + private: + friend class NvlsConnection; + CUmemGenericAllocationHandle mcHandle_; + CUmulticastObjectProp mcProp_; + size_t bufferSize_; + size_t minMcGran_; + size_t mcGran_; + // These are only defined for multicast (NVLS) capability + pid_t rootPid_; + int mcFileDesc_; + + std::list> allocatedRanges_; + std::list> freeRanges_; +}; + +NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) { + minMcGran_ = 0; + mcGran_ = 0; + mcProp_ = {}; + mcProp_.size = bufferSize; + mcProp_.numDevices = numDevices; + mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; + bufferSize_ = mcProp_.size; + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); + mcFileDesc_ = 0; + MSCCLPP_CUTHROW( + cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + freeRanges_.emplace_back(0, bufferSize_); + + rootPid_ = getpid(); + if (rootPid_ < 0) { + throw mscclpp::SysError("getpid() failed", errno); + } + + INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", + mcProp_.size, minMcGran_, mcGran_); +} + +NvlsConnection::Impl::Impl(const std::vector& data) { + auto it = data.begin(); + std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); + it += sizeof(this->mcHandle_); + std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); + it += sizeof(this->bufferSize_); + std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); + it += sizeof(this->minMcGran_); + std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); + it += sizeof(this->mcGran_); + std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); + it += sizeof(this->rootPid_); + std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); + + freeRanges_.emplace_back(0, bufferSize_); + int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + if (rootPidFd < 0) { + throw mscclpp::SysError("pidfd_open() failed", errno); + } + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + if (mcRootFileDescFd < 0) { + throw mscclpp::SysError("pidfd_getfd() failed", errno); + } + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(rootPidFd); + close(mcRootFileDescFd); + + INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); +} + +NvlsConnection::Impl::~Impl() { + // we don't need to free multicast handle object according to NCCL. + if (rootPid_ == getpid()) { + close(mcFileDesc_); + } +} + +std::vector NvlsConnection::Impl::serialize() { + std::vector result; + std::copy_n(reinterpret_cast(&mcHandle_), sizeof(mcHandle_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&bufferSize_), sizeof(bufferSize_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&minMcGran_), sizeof(minMcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&mcGran_), sizeof(mcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&rootPid_), sizeof(rootPid_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&mcFileDesc_), sizeof(mcFileDesc_), std::back_inserter(result)); + return result; +} + +void NvlsConnection::Impl::addDevice(int cudaDeviceId) { + MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + INFO(MSCCLPP_COLL, "NVLS connection created"); +} + +size_t NvlsConnection::Impl::allocateBuffer(size_t size) { + if (freeRanges_.empty()) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); + } + auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), + [size](const std::pair& range) { return range.second >= size; }); + if (it != freeRanges_.end()) { + size_t offset = it->first; + size_t rangeSize = it->second; + if (rangeSize == size) { + freeRanges_.erase(it); + } else { + it->first += size; + it->second -= size; + } + allocatedRanges_.emplace_back(offset, size); + return offset; + } + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); +} + +void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept { + auto it = std::find_if( + allocatedRanges_.begin(), allocatedRanges_.end(), + [offset, size](const std::pair& range) { return range.first == offset && range.second == size; }); + if (it == allocatedRanges_.end()) { + return; + } + allocatedRanges_.erase(it); + it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, 
size](const std::pair& range) { + return range.first + range.second >= offset; + }); + if (it == freeRanges_.end()) { + freeRanges_.emplace_back(offset, size); + return; + } + if (it->first + it->second == offset) { + // merge with the previous free range if possible + it->second += size; + // merge with the next free range if possible + auto nextItr = std::next(it); + if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { + it->second += nextItr->second; + freeRanges_.erase(nextItr); + } + return; + } else if (it->first == offset + size) { + // merge with the next free range if possible + it->first -= size; + it->second += size; + return; + } else { + freeRanges_.emplace(it, offset, size); + return; + } +} + +std::shared_ptr NvlsConnection::Impl::bindMemory(std::shared_ptr> physicalMem, + size_t devBuffSize) { + size_t offset = allocateBuffer(devBuffSize); + MSCCLPP_CUTHROW( + cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + + char* mcPtr; + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + int deviceId = -1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + + auto deleter = [=, self = shared_from_this()](char* ptr) { + CUdevice device; + MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); + self->freeBuffer(offset, devBuffSize); + }; + + return std::shared_ptr(mcPtr, deleter); +} +#else +class NvlsConnection::Impl { + public: + // use this only for the root of the NVLS + Impl(size_t, int) { throw notSupportedError; } + Impl(const std::vector&) { throw notSupportedError; } + + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + std::vector serialize() { throw notSupportedError; } + size_t allocateBuffer(size_t) { throw notSupportedError; } + void freeBuffer(size_t, size_t) { throw notSupportedError; } + std::shared_ptr bindMemory(std::shared_ptr>, size_t) { throw notSupportedError; } + void addDevice(int) { throw notSupportedError; } + size_t getMinMcGran() { throw notSupportedError; } + + private: + Error notSupportedError = Error("NVLS is not supported on this CUDA version", ErrorCode::InvalidUsage); +}; +#endif + +NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) + : pimpl_(std::make_shared(bufferSize, numDevices)) {} + +void NvlsConnection::addDevice() { + int cudaDeviceId; + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + this->addDevice(cudaDeviceId); +} + +void NvlsConnection::addDevice(int cudaDeviceId) { pimpl_->addDevice(cudaDeviceId); } + +NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} + +std::vector NvlsConnection::serialize() { return pimpl_->serialize(); } + +std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { + auto mem = allocSharedPhysicalCuda(size, pimpl_->getMinMcGran()); + auto mcPtr = pimpl_->bindMemory(mem, size); + return std::make_shared(mem, mcPtr, size); +} + 
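allocateAndBindCuda() ties the pieces above together: it allocates physical device memory at the multicast minimum granularity, binds it into the multicast object through Impl::bindMemory, and returns a DeviceMulticastPointer that owns both the unicast and the multicast mapping. A minimal host-side sketch of the intended call sequence follows; it is illustrative only and assumes an initialized Communicator, an NVLS EndpointConfig whose nvlsBufferSize covers the allocation, and a size that is a multiple of getMultiCastMinGranularity(). The names allocateNvlsBuffer, comm, nvlsConfig, and nvlsBufferBytes are placeholders.

    #include <mscclpp/core.hpp>
    #include <utility>
    #include <vector>

    // Sketch: connect all ranks to one NVLS multicast group and carve a buffer out of it.
    // Every rank in allRanks is assumed to call this collectively.
    auto allocateNvlsBuffer(mscclpp::Communicator& comm, const std::vector<int>& allRanks,
                            mscclpp::EndpointConfig nvlsConfig, size_t nvlsBufferBytes) {
      auto nvls = comm.connctNvlsCollective(allRanks, nvlsConfig);  // spelling as in this series
      auto buffer = nvls->allocateAndBindCuda(nvlsBufferBytes);     // physical alloc + multicast bind
      // buffer->deviceHandle() yields {devicePtr, mcPtr, bufferSize}; copy it to the GPU and use
      // multimem loads/stores on mcPtr inside a kernel. Keep nvls and buffer alive meanwhile.
      return std::make_pair(nvls, buffer);
    }
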
+NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { + NvlsConnection::DeviceMulticastPointer::DeviceHandle device; + device.devicePtr = this->deviceMem_->devicePtr_; + device.mcPtr = this->mcPtr_.get(); + device.bufferSize = this->bufferSize_; + return device; +}; + +char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; + +size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->getMinMcGran(); } + +} // namespace mscclpp From 2ec813e60dac497db065445e5b24b6e30288b92f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 04:16:51 +0000 Subject: [PATCH 52/67] lint --- python/mscclpp_benchmark/allreduce_bench.py | 12 ++++++------ python/test/test_mscclpp.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 12c6bf7f0..4e9aeca0b 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -154,9 +154,9 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) # memory = mscclpp_call.get_memory() - if memory.nbytes < 2 ** 20: + if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2 ** 21: + elif memory.nbytes < 2**21: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) @@ -165,7 +165,7 @@ def run_benchmark( # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) # proxy_service.start_proxy() else: - if memory.nbytes < 2 ** 22: + if memory.nbytes < 2**22: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() @@ -260,13 +260,13 @@ def run_benchmark( speed_ups = [] for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - nelems = 2 ** i + nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: - nelems = 3 * 2 ** i + nelems = 3 * 2**i else: raise RuntimeError("Only support one node/two nodes communication") - if nelems * data_type().itemsize > 2 ** 32: + if nelems * data_type().itemsize > 2**32: break # due to trigger bit width limitation, we can only support up to 2**32 size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 45f11574c..bebd752bc 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -152,7 +152,7 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) @@ -187,7 +187,7 @@ def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20, 27]]) +@pytest.mark.parametrize("nelem", [2**i 
for i in [10, 15, 20, 27]]) @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, nelem: int, device: str): # this test starts with a random tensor on rank 0 and rotates it all the way through all ranks @@ -409,7 +409,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") @@ -457,7 +457,7 @@ def test_fifo( @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_group_and_connection(mpi_group, transport) @@ -506,7 +506,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): @@ -553,7 +553,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") - nbytes = 2 ** 21 + nbytes = 2**21 mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") From f493d22e51d30b5f57c877490fcafb48acb0b0d0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 05:17:26 +0000 Subject: [PATCH 53/67] pass build --- include/mscclpp/core.hpp | 2 +- src/nvls_connection.cu | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index f208c9a9b..8bf9e7987 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -483,7 +483,7 @@ class NvlsConnection { size_t getMultiCastMinGranularity(); private: - struct Impl; + class Impl; std::shared_ptr pimpl_; }; diff --git a/src/nvls_connection.cu b/src/nvls_connection.cu index f6655fb82..9938578e9 100644 --- a/src/nvls_connection.cu +++ b/src/nvls_connection.cu @@ -1,6 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
- +#include #include #include @@ -12,8 +12,7 @@ #include "endpoint.hpp" namespace mscclpp { - -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS From 9e5f0e679ff5f84d7a8c5484ea62b1bcf41f631f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 06:43:37 +0000 Subject: [PATCH 54/67] fix benchmark --- python/mscclpp_benchmark/allreduce.cu | 4 +++- python/mscclpp_benchmark/allreduce_bench.py | 24 +++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 56aeb572b..127521939 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -782,6 +782,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // NVLS // ------------------------------------------- +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ "r"(val.w) \ @@ -837,4 +838,5 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } } deviceSyncer.sync(gridDim.x); -} \ No newline at end of file +} +#endif diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 4e9aeca0b..278d52251 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -79,6 +79,11 @@ def human_readable_size(size, decimal_places=1): return f"{size:.{decimal_places}f} {unit}" +def is_nvls_enabled(): + compute_capability = cp.cuda.Device().compute_capability + return not cp.cuda.runtime.is_hip and compute_capability >= "90" + + def check_correctness(memory, func, niter=100): ac = True for p in range(niter): @@ -152,18 +157,18 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - # memory = mscclpp_call.get_memory() if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21: + elif memory.nbytes < 2**21 if is_nvls_enabled() else memory.nbytes < 2**29: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - # proxy_service = ProxyService() - # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - # proxy_service.start_proxy() + if is_nvls_enabled(): + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() + else: + proxy_service = ProxyService() + mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + proxy_service.start_proxy() else: if memory.nbytes < 2**22: proxy_service = ProxyService() @@ -258,7 +263,8 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 28): + end_range = 28 if is_nvls_enabled() else 29 + for i in range(10, end_range): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: From 2eb20aff3be3a20bea7a40bb26d1126526a40c71 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 08:38:42 +0000 Subject: [PATCH 55/67] pass test --- 
include/mscclpp/utils.hpp | 2 ++ python/mscclpp/__init__.py | 1 + python/mscclpp/utils.py | 3 +++ python/mscclpp/utils_py.cpp | 1 + python/mscclpp_benchmark/allreduce_bench.py | 13 ++++--------- python/test/test_mscclpp.py | 6 +++--- src/CMakeLists.txt | 2 +- src/{nvls_connection.cu => nvls_connection.cc} | 4 ++-- src/utils.cc | 12 ++++++++++++ 9 files changed, 29 insertions(+), 15 deletions(-) rename src/{nvls_connection.cu => nvls_connection.cc} (98%) diff --git a/include/mscclpp/utils.hpp b/include/mscclpp/utils.hpp index c8ef3d271..80b3bf39d 100644 --- a/include/mscclpp/utils.hpp +++ b/include/mscclpp/utils.hpp @@ -37,6 +37,8 @@ struct ScopedTimer : public Timer { std::string getHostName(int maxlen, const char delim); +bool isNvlsSupported(); + } // namespace mscclpp #endif // MSCCLPP_UTILS_HPP_ diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index d411bc1b0..0e3618591 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -19,6 +19,7 @@ Transport, TransportFlags, version, + is_nvls_supported, ) __version__ = version() diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 9f71b70c4..762dd24d3 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -153,3 +153,6 @@ def pack(*args): else: raise RuntimeError(f"Unsupported type: {type(arg)}") return res + +def is_nvls_supported(): + return cp.cuda.runtime.runtimeGetVersion() >= 12010 diff --git a/python/mscclpp/utils_py.cpp b/python/mscclpp/utils_py.cpp index 16800a752..e9e847ee8 100644 --- a/python/mscclpp/utils_py.cpp +++ b/python/mscclpp/utils_py.cpp @@ -20,4 +20,5 @@ void register_utils(nb::module_& m) { nb::class_(m, "ScopedTimer").def(nb::init(), nb::arg("name")); m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim")); + m.def("is_nvls_supported", &isNvlsSupported); } diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 278d52251..c141a9e6b 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -14,7 +14,7 @@ from mpi4py import MPI import cupy.cuda.nccl as nccl import mscclpp.comm as mscclpp_comm -from mscclpp import ProxyService +from mscclpp import ProxyService, is_nvls_supported from prettytable import PrettyTable import netifaces as ni @@ -79,11 +79,6 @@ def human_readable_size(size, decimal_places=1): return f"{size:.{decimal_places}f} {unit}" -def is_nvls_enabled(): - compute_capability = cp.cuda.Device().compute_capability - return not cp.cuda.runtime.is_hip and compute_capability >= "90" - - def check_correctness(memory, func, niter=100): ac = True for p in range(niter): @@ -159,10 +154,10 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21 if is_nvls_enabled() else memory.nbytes < 2**29: + elif memory.nbytes < 2**21 if is_nvls_supported() else memory.nbytes < 2**29: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - if is_nvls_enabled(): + if is_nvls_supported(): mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) memory = mscclpp_call.get_memory() else: @@ -263,7 +258,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - end_range = 28 if is_nvls_enabled() else 29 + end_range = 28 if is_nvls_supported() else 29 for i in range(10, end_range): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i diff --git 
a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index bebd752bc..62c2619ad 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -19,6 +19,7 @@ SmDevice2DeviceSemaphore, TcpBootstrap, Transport, + is_nvls_supported, ) import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack @@ -361,8 +362,6 @@ def __init__( elif test_name == "fifo": self.params = fifo.device_handle().raw elif test_name == "proxy": - semaphore_device_handles = [semaphore.device_handle().raw for semaphore in semaphore_or_channels] - self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": self.params = ( @@ -494,7 +493,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): fifo_device_handle = proxy.fifo_device_handle() kernel = MscclppKernel( - "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle + "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=semaphores, fifo=fifo_device_handle ) proxy.start() group.barrier() @@ -551,6 +550,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) +@pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") nbytes = 2**21 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45b4075d2..cfbcc927a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/nvls_connection.cu b/src/nvls_connection.cc similarity index 98% rename from src/nvls_connection.cu rename to src/nvls_connection.cc index 9938578e9..136a0a421 100644 --- a/src/nvls_connection.cu +++ b/src/nvls_connection.cc @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -12,7 +11,8 @@ #include "endpoint.hpp" namespace mscclpp { -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) + +#if (CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS diff --git a/src/utils.cc b/src/utils.cc index 7153d55c5..627df2df7 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -66,4 +67,15 @@ std::string getHostName(int maxlen, const char delim) { return hostname.substr(0, i); } +bool isNvlsSupported() { +#if (CUDART_VERSION >= 12010) + CUdevice dev; + int nvlsSupport; + MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + return nvlsSupport == 1; +#endif + return false; +} + } // namespace mscclpp From 8e32bd2641de64d8192e423bc9a12700f40739ce Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 09:00:14 +0000 Subject: [PATCH 56/67] fix --- python/mscclpp/utils.py | 1 + src/nvls_connection.cc | 1 + src/utils.cc | 6 +++--- 3 files changed, 5 insertions(+), 3 
deletions(-) diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 762dd24d3..79f28d01a 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -154,5 +154,6 @@ def pack(*args): raise RuntimeError(f"Unsupported type: {type(arg)}") return res + def is_nvls_supported(): return cp.cuda.runtime.runtimeGetVersion() >= 12010 diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 136a0a421..08f31e36e 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -4,6 +4,7 @@ #include #include +#include #include #include diff --git a/src/utils.cc b/src/utils.cc index 627df2df7..8475f2f60 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -70,10 +70,10 @@ std::string getHostName(int maxlen, const char delim) { bool isNvlsSupported() { #if (CUDART_VERSION >= 12010) CUdevice dev; - int nvlsSupport; + int isNvlsSupported; MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); - MSCCLPP_CUTHROW(cuDeviceGetAttribute(&nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - return nvlsSupport == 1; + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isNvlsSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + return isNvlsSupported == 1; #endif return false; } From 5ecb01fe692d1b4a22b2a7c2f55d278126de8a03 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 10:21:31 +0000 Subject: [PATCH 57/67] clean up --- include/mscclpp/core.hpp | 9 ++++++++- include/mscclpp/gpu_utils.hpp | 10 +++++----- python/mscclpp/utils.py | 4 ---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 8bf9e7987..af969108c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -698,7 +698,14 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); - /// TBD + /// Connect to NVLS on setup. + /// + /// This function used to connect to NVLS on setup. NVLS collective using multicast operations to send/recv data. + /// Here we need to put all involved ranks into the collective group. + /// + /// @param allRanks The ranks of all processes involved in the collective. + /// @param config The configuration for the local endpoint. + /// @return std::shared_ptr A shared pointer to the NVLS connection. std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 5e4e1c625..909ccd821 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -168,6 +168,11 @@ Memory safeAlloc(size_t nelem) { template Memory safeAlloc(size_t nelem, size_t gran) { + if (nelem * sizeof(T) % gran) { + throw Error("The request allocation size is not divisible by the required granularity:" + + std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran), + ErrorCode::InvalidUsage); + } T* ptr = nullptr; try { ptr = alloc(nelem, gran); @@ -232,11 +237,6 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. 
template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { - if (count % gran) { - throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(count) + - " vs " + std::to_string(gran), - ErrorCode::InvalidUsage); - } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 79f28d01a..9f71b70c4 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -153,7 +153,3 @@ def pack(*args): else: raise RuntimeError(f"Unsupported type: {type(arg)}") return res - - -def is_nvls_supported(): - return cp.cuda.runtime.runtimeGetVersion() >= 12010 From 292c240d61eebe0806db7a4ab1ab7de6ce8ba3c8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 10:26:31 +0000 Subject: [PATCH 58/67] fix --- nvls/README | 2 - nvls/test.cu | 199 -------------------------------------------- nvls/test2.cpp | 143 ------------------------------- src/communicator.cc | 11 ++- 4 files changed, 5 insertions(+), 350 deletions(-) delete mode 100644 nvls/README delete mode 100644 nvls/test.cu delete mode 100644 nvls/test2.cpp diff --git a/nvls/README b/nvls/README deleted file mode 100644 index c385affc4..000000000 --- a/nvls/README +++ /dev/null @@ -1,2 +0,0 @@ -nvcc -I/usr/lib/x86_64-linux-gnu/openmpi/include -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -L/usr/lib/x86_64-linux-gnu/openmpi/lib -L /usr/local/cuda/lib64/ -lmpi_cxx -lmpi -lcupti -lcupti_static test.cu -gencode arch=compute_90,code=sm_90 -lcuda -lcudart -lnccl - diff --git a/nvls/test.cu b/nvls/test.cu deleted file mode 100644 index b84f19519..000000000 --- a/nvls/test.cu +++ /dev/null @@ -1,199 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#define CUCHECK(cmd) \ - do { \ - auto err = cmd; \ - if (err != 0) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - exit(-1); \ - } \ - } while (false) - -// AR kernel snippet for sm_90 only - -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); - -__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ - uc_ptr[idx] = myrank + idx; - } -} - -__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ - float expected = (float)((nranks * (nranks-1)) / 2 + nranks * idx); - if (abs(uc_ptr[idx] - expected) > 0.01 * expected){ - printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); - } - } -} - - -__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { - // for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction - // line is assumed to be 16B 4 ints of 8 halves - int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; - int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; - - int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; - int my_step = blockDim.x * gridDim.x * 4; - - for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); - } -} - -int main() { - int myrank, nranks; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &nranks); - - cudaSetDevice(myrank); - CUresult res; - - size_t size = 1024 * 1024 * 512; - CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nranks; - mcProp.size = size; - mcProp.handleTypes = handleType; - - size_t minGran, gran; - gran = 0; - minGran = 0; - CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); - size_t mcSize = ((size + gran - 1) / gran) * gran; - mcProp.size = mcSize; - - CUmemGenericAllocationHandle handle; - // only one rank creates the multicast object - if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - - int fd, peerfd; - fd = 0; - peerfd = 0; - if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - // some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the - // exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - - pid_t currentPid = getpid(); - MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); - MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); - int pidFd = syscall(SYS_pidfd_open, currentPid, 0); - - // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); - // everyone else would now have same multicast object - int peerFd = 0; - peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); - if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType)); - MPI_Barrier(MPI_COMM_WORLD); - - // if(myrank) - // close(peerfd); - // else - close(fd); - // end of ugly UDS business - // everyone adds device(s), no syncs required, just need to ensure bindmem happens after all this is called - int mydev = myrank; - CUCHECK(cuMulticastAddDevice(handle, mydev)); - MPI_Barrier(MPI_COMM_WORLD); - - CUmemGenericAllocationHandle memhandle; - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = mydev; - prop.requestedHandleTypes = handleType; - - // allocate physical memory (data buffer) - CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); - - void* uc_va; - void* mc_va; - CUmemAccessDesc accessDesc = {}; - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = mydev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - - // Map a VA to UC space - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0)); - cudaMemset(uc_va, 0, size); - CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0)); - // 
set access on UC address - CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); - - - // everyone binds memory to the multicast - CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); - MPI_Barrier(MPI_COMM_WORLD); - // usual VA business: map both MC and PA to two different VA addresses - - // Map a VA to MC space - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); - CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); - // set access on MC address - CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); - - int rept = 10; - int block_size = 1024; - int nblocks = 16; - - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - init_kernel<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - check_correctness<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - - for (int input_size = 1024; input_size <= size; input_size *= 2){ - // warmup - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - double st = MPI_Wtime(); - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - double en = MPI_Wtime(); - double time = (en - st) / rept; - if (!myrank) printf("input_size %d | Time = %f us, alg_bw = %f (GBps)\n", input_size, time*1e6, input_size / 1e9 / time); - } - MPI_Barrier(MPI_COMM_WORLD); - MPI_Finalize(); -} -//........ 
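The prototype deleted above and NvlsConnection::Impl hand the multicast handle across processes the same way: the root rank exports it to a POSIX file descriptor with cuMemExportToShareableHandle and publishes its pid and fd values, and every other rank duplicates that descriptor into its own process with pidfd_open/pidfd_getfd before calling cuMemImportFromShareableHandle. pidfd_getfd first appeared in Linux 5.6, which is why NVLS support in this series is guarded on LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0); with the fabric handle type added in CUDA 12.4 the descriptors could instead be exchanged with MPI_Allgather, as the comment in the deleted file notes. A minimal sketch of just the descriptor-duplication step, with rootPid and rootFd standing in for the values received from the root rank and error handling kept to a bare minimum:

    #include <sys/syscall.h>
    #include <unistd.h>

    // Duplicate the root rank's exported multicast fd into the calling process.
    // Requires Linux >= 5.6 for SYS_pidfd_getfd; returns -1 on failure.
    static int importMulticastFd(pid_t rootPid, int rootFd) {
      int pidFd = syscall(SYS_pidfd_open, rootPid, 0);           // handle to the root process
      if (pidFd < 0) return -1;
      int localFd = syscall(SYS_pidfd_getfd, pidFd, rootFd, 0);  // local duplicate of rootFd
      close(pidFd);
      return localFd;  // pass to cuMemImportFromShareableHandle with CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    }

NvlsConnection::Impl's deserializing constructor performs exactly these two syscalls on the received rootPid_ and mcFileDesc_ before importing the handle.
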
diff --git a/nvls/test2.cpp b/nvls/test2.cpp deleted file mode 100644 index 400d566ae..000000000 --- a/nvls/test2.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#define CUCHECK(cmd) do { \ - auto err = cmd; \ - if( err != 0 ) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - } \ -} while(false) - -int main(){ - int myrank, nranks; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &nranks); - - cudaSetDevice(myrank); - CUresult res; - - -CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nranks; - mcProp.size = size; - mcProp.handleTypes = handleType; - - size_t minGran, gran; - CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - size_t mcSize = ((size+gran-1)/gran)*gran; - mcProp.size = mcSize; - - //only one rank creates the multicast object - if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - - int fd, peerfd; - if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - //some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - - volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = { 0 }; - uint64_t opId=0xdeadcafebeef; - ncclResult_t ret = ncclSuccess; - - NCCLCHECK(ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag)); - MPI_Barrier(MPI_COMM_WORLD); - if(!myrank) - for(int p=1;p= 900 -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ - "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ - : "memory"); -//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); -#endif - -//for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction -//line is assumed to be 16B 4 ints of 8 halves -const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); -const int end_elem = max(start_elem, numlines); -__syncthreads(); - for (int line = start_elem; line < end_elem; line += loop_step0) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)) - MULTIMEM_ST(val, mc_ptr + (lineoffset + line)) - } -__syncthreads(); - -*/ diff --git a/src/communicator.cc b/src/communicator.cc index d5c3e9ed4..e4710f272 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -110,26 +110,25 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, EndpointConfig config) { auto bootstrap = this->bootstrap(); - int myRank = bootstrap->getRank(); + int rank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; - std::sort(allRanks.begin(), allRanks.end()); int rootRank = allRanks[0]; for (auto nvlsRank : allRanks) { - if (nvlsRank == myRank) amongAllRanks = true; + if (nvlsRank == rank) amongAllRanks = true; rootRank = std::min(rootRank, nvlsRank); } if (amongAllRanks == false) { - throw Error("my rank is not among allRanks", ErrorCode::InvalidUsage); + throw Error("rank is not among allRanks", ErrorCode::InvalidUsage); } - if (rootRank == myRank) isRoot = true; + if (rootRank == rank) isRoot = true; std::shared_ptr conn; if (isRoot) { conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); + if (nvlsRank != rank) bootstrap->send(serialized, nvlsRank, 0); } } else { std::vector data; From a9f0280589512e992aa0cc1949b4607910a98d75 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:03:00 +0000 Subject: [PATCH 59/67] HIP compatibility --- include/mscclpp/gpu.hpp | 15 +++++++++++++++ include/mscclpp/gpu_utils.hpp | 5 +++++ src/include/endpoint.hpp | 2 -- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index d3d48ce1f..2f73b4b3b 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -19,6 +19,9 @@ using cudaIpcMemHandle_t = hipIpcMemHandle_t; using CUresult = hipError_t; using CUdeviceptr = hipDeviceptr_t; +using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t; +using CUmemAllocationProp = hipMemAllocationProp; +using CUmemAccessDesc = hipMemAccessDesc; constexpr auto cudaSuccess = hipSuccess; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; @@ -32,6 +35,11 @@ constexpr auto cudaMemcpyHostToDevice = hipMemcpyHostToDevice; constexpr auto cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost; constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; +constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned; +constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice; +constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor; +constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite; + #ifndef CUDA_SUCCESS #define CUDA_SUCCESS hipSuccess #endif // CUDA_SUCCESS @@ -68,7 +76,14 @@ constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; #define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__) #define cuGetErrorString(...) 
hipDrvGetErrorString(__VA_ARGS__) +#define cuMemAddressReserve(...) hipMemAddressReserve(__VA_ARGS__) +#define cuMemAddressFree(...) hipMemAddressFree(__VA_ARGS__) #define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__) +#define cuMemCreate(...) hipMemCreate(__VA_ARGS__) +#define cuMemRelease(...) hipMemRelease(__VA_ARGS__) +#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__) +#define cuMemMap(...) hipMemMap(__VA_ARGS__) +#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__) #else diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 909ccd821..3a96f9a45 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -90,7 +90,12 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = deviceId; +#if defined(__HIP_PLATFORM_AMD__) + // TODO: revisit when HIP fixes this typo in the field name + prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#else prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif CUmemGenericAllocationHandle memHandle; size_t bufferSize = sizeof(T) * nelem; diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index a773efb5e..311fa9982 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -4,8 +4,6 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ -#include - #include #include From f1ec27867210b8bfde020533e291811b934868a0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:30:36 +0000 Subject: [PATCH 60/67] minor updates --- include/mscclpp/gpu.hpp | 8 ++++++++ include/mscclpp/gpu_utils.hpp | 2 +- src/communicator.cc | 2 -- src/connection.cc | 4 ---- src/endpoint.cc | 3 --- src/nvls_connection.cc | 8 ++++---- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 2f73b4b3b..d46a9ac6b 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -92,4 +92,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #endif +// NVLS +#if !defined(__HIP_PLATFORM_AMD__) +#include +#define USE_NVLS ((CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))) +#else // !defined(__HIP_PLATFORM_AMD__) +#define USE_NVLS 0 +#endif // !defined(__HIP_PLATFORM_AMD__) + #endif // MSCCLPP_GPU_HPP_ diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 3a96f9a45..6ba6a545d 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -107,7 +107,7 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - T* devicePtr = NULL; + T* devicePtr = nullptr; // Map the device pointer MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); diff --git a/src/communicator.cc b/src/communicator.cc index e4710f272..d0fb07a23 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -3,8 +3,6 @@ #include "communicator.hpp" -#include - #include "api.h" #include "debug.h" diff --git a/src/connection.cc b/src/connection.cc index f89b96138..4d719f3b2 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -3,10 +3,6 @@ #include "connection.hpp" -#include -#include - -#include #include #include diff --git a/src/endpoint.cc b/src/endpoint.cc 
index f6e3dc09c..dbc773898 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -1,8 +1,5 @@ #include "endpoint.hpp" -#include -#include - #include #include "api.h" diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 08f31e36e..1ff96a427 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -1,6 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include + #include #include @@ -13,7 +13,7 @@ namespace mscclpp { -#if (CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) +#if (USE_NVLS) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS @@ -212,7 +212,7 @@ std::shared_ptr NvlsConnection::Impl::bindMemory(std::shared_ptr(mcPtr, deleter); } -#else +#else // !(USE_NVLS) class NvlsConnection::Impl { public: // use this only for the root of the NVLS @@ -232,7 +232,7 @@ class NvlsConnection::Impl { private: Error notSupportedError = Error("NVLS is not supported on this CUDA version", ErrorCode::InvalidUsage); }; -#endif +#endif // !(USE_NVLS) NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_shared(bufferSize, numDevices)) {} From 032c00abb418df3378b58ff88b2a674d7575b8df Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:50:03 +0000 Subject: [PATCH 61/67] minor update --- include/mscclpp/gpu_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 6ba6a545d..9be6a7d16 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -173,7 +173,7 @@ Memory safeAlloc(size_t nelem) { template Memory safeAlloc(size_t nelem, size_t gran) { - if (nelem * sizeof(T) % gran) { + if ((nelem * sizeof(T)) % gran) { throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran), ErrorCode::InvalidUsage); From fa0565fe6744f8be8434ce8fb833c18081a1d945 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 24 Jan 2024 07:14:10 +0000 Subject: [PATCH 62/67] add more tests --- python/mscclpp/__init__.py | 1 + python/mscclpp/comm.py | 19 ++++++++++++------- python/mscclpp/core_py.cpp | 1 + python/test/test_mscclpp.py | 21 ++++++++++++++++++++- src/nvls_connection.cc | 2 ++ 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 0e3618591..8f013e080 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -6,6 +6,7 @@ from ._mscclpp import ( Communicator, Connection, + EndpointConfig, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index 3085cc3df..d84410668 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -8,6 +8,7 @@ from ._mscclpp import ( Communicator, Connection, + EndpointConfig, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, @@ -79,17 +80,21 @@ def my_ib_device(self, local_rank: int) -> Transport: assert False # only 8 IBs are supported def make_connection( - self, all_ranks: list[int], transports: Transport | dict[int, Transport] + self, + all_ranks: list[int], + endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport], ) -> dict[int, Connection]: - if transports == Transport.Nvls: - return self.communicator.connct_nvls_collective(all_ranks, transports) + if type(endpoints) is Transport: + endpoints = 
EndpointConfig(endpoints) + if endpoints.transport == Transport.Nvls: + return self.communicator.connct_nvls_collective(all_ranks, endpoints) connections = {} for rank in all_ranks: - if type(transports) is dict: - transport = transports[rank] + if type(endpoints) is dict: + endpoint = endpoints[rank] else: - transport = transports - connections[rank] = self.communicator.connect_on_setup(rank, 0, transport) + endpoint = endpoints + connections[rank] = self.communicator.connect_on_setup(rank, 0, endpoint) self.communicator.setup() connections = {rank: connections[rank].get() for rank in connections} return connections diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 996cd3d99..5fd4bd317 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -152,6 +152,7 @@ void register_core(nb::module_& m) { nb::class_(m, "EndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) + .def(nb::init(), nb::arg("transport"), nb::arg("nvlsBufferSize")) .def_rw("transport", &EndpointConfig::transport) .def_rw("ib_max_cq_size", &EndpointConfig::ibMaxCqSize) .def_rw("ib_max_cq_poll_num", &EndpointConfig::ibMaxCqPollNum) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 62c2619ad..4b3cb6ebf 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -12,6 +12,7 @@ import pytest from mscclpp import ( + EndpointConfig, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, @@ -278,6 +279,24 @@ def target_signal(sems, conns): group.barrier() +@parametrize_mpi_groups(8) +@pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") +def test_nvls_connection(mpi_group: MpiGroup): + if all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvls for cross node") + group = mscclpp_comm.CommGroup(mpi_group.comm) + all_ranks = list(range(group.nranks)) + endpoint = EndpointConfig(Transport.Nvls, 2**22) + nvls_connection = group.make_connection(all_ranks, endpoint) + mem_handle1 = nvls_connection.allocate_bind_memory(2**21) + mem_handle2 = nvls_connection.allocate_bind_memory(2**21) + with pytest.raises(Exception): + mem_handle3 = nvls_connection.allocate_bind_memory(2**21) + # the memory is freed on the destructor of mem_handle2 + mem_handle2 = None + mem_handle3 = nvls_connection.allocate_bind_memory(2**21) + + class MscclppKernel: def __init__( self, @@ -549,7 +568,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(8) +@parametrize_mpi_groups(4, 8) @pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 1ff96a427..78f3e52d5 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -142,6 +142,7 @@ size_t NvlsConnection::Impl::allocateBuffer(size_t size) { it->second -= size; } allocatedRanges_.emplace_back(offset, size); + INFO(MSCCLPP_COLL, "NVLS connection allocated %ld bytes at offset %ld", size, offset); return offset; } throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); @@ -152,6 +153,7 @@ void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept { allocatedRanges_.begin(), allocatedRanges_.end(), [offset, size](const std::pair& range) { return range.first == offset && range.second == size; }); 
if (it == allocatedRanges_.end()) { + WARN("NVLS connection tried to free a buffer that was not allocated"); return; } allocatedRanges_.erase(it); From 28fd377f1805017b2c5e6aaadc3b05c39e71f309 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 24 Jan 2024 09:11:44 +0000 Subject: [PATCH 63/67] move multimem instruction to source code --- include/mscclpp/gpu.hpp | 1 + include/mscclpp/nvls_device.hpp | 44 +++++++++++++++++++++++++++ python/mscclpp_benchmark/allreduce.cu | 14 ++------- python/test/nvls_test.cu | 15 ++------- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index d46a9ac6b..f560a655c 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -88,6 +88,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #else #include +#include #include #endif diff --git a/include/mscclpp/nvls_device.hpp b/include/mscclpp/nvls_device.hpp index 106420e58..52ade275d 100644 --- a/include/mscclpp/nvls_device.hpp +++ b/include/mscclpp/nvls_device.hpp @@ -4,13 +4,57 @@ #ifndef MSCCLPP_NVLS_DEVICE_HPP_ #define MSCCLPP_NVLS_DEVICE_HPP_ +#include +#include + +#include "device.hpp" + namespace mscclpp { +template +constexpr bool dependentFalse = false; // workaround before CWG2518/P2593R1 + /// Device-side handle for @ref Host2DeviceSemaphore. struct DeviceMulticastPointerDeviceHandle { void* devicePtr; void* mcPtr; size_t bufferSize; + +#if defined(MSCCLPP_DEVICE_COMPILE) + template + MSCCLPP_DEVICE_INLINE void multimemLoad(TVaule& val, T* ptr) { + static_assert(NElemPerThread == 4, "Only support NElemPerThread == 4"); + if constexpr (std::is_same::value) { + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } else if constexpr (std::is_same::value) { + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } else { + static_assert(dependentFalse, "Not supported type"); + } + }; + + template + MSCCLPP_DEVICE_INLINE void multimemStore(const TVaule& val, T* ptr) { + static_assert(NElemPerThread == 4, "Only support NElemPerThread == 4"); + if constexpr (std::is_same::value) { + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w) + : "memory"); + } else if constexpr (std::is_same::value) { + asm volatile("multimem.st.global.v4.f16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w) + : "memory"); + } else { + static_assert(dependentFalse, "Not supported type"); + } + }; +#endif }; } // namespace mscclpp diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 127521939..69aa3919f 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -783,16 +783,6 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // ------------------------------------------- #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); extern "C" __global__ void __launch_bounds__(1024, 1) allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, @@ -822,8 +812,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); + nvlsPtrs.multimemLoad(val, mc_ptr + idx); + nvlsPtrs.multimemStore(val, mc_ptr + idx); } deviceSyncer.sync(gridDim.x); diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 5001072ac..022b4d6ca 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -8,17 +8,6 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); - extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { @@ -52,8 +41,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); + nvlsPtrs.multimemLoad(val, mc_ptr + idx); + nvlsPtrs.multimemStore(val, mc_ptr + idx); } deviceSyncer.sync(gridDim.x); From 9bac9e884c798f081ade34eaab2a2af249c288ab Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 29 Jan 2024 10:03:51 +0000 Subject: [PATCH 64/67] restore file --- test/CMakeLists.txt | 1 + test/nvls_test.cu | 203 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 test/nvls_test.cu diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 087fdeb86..ef85cde5a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,6 +23,7 @@ endfunction() add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) +add_test_executable(nvls_test nvls_test.cu) configure_file(run_mpi_test.sh.in run_mpi_test.sh) diff --git a/test/nvls_test.cu b/test/nvls_test.cu new file mode 100644 index 000000000..e01b4d790 --- /dev/null +++ b/test/nvls_test.cu @@ -0,0 +1,203 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CUCHECK(cmd) \ + do { \ + auto err = cmd; \ + if (err != 0) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + exit(-1); \ + } \ + } while (false) + +// AR kernel snippet for sm_90 only + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); +#else +#define MULTIMEM_ST(val, ptr) +#define MULTIMEM_LD(val, ptr) +#endif + +__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) { + uc_ptr[idx] = myrank + idx; + } +} + +__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) { + float expected = (float)((nranks * (nranks - 1)) / 2 + nranks * idx); + if (abs(uc_ptr[idx] - expected) > 0.01 * expected) { + printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); + } + } +} + +__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { + // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + // line is assumed to be 16B 4 ints of 8 halves + int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; + int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; + + int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + [[maybe_unused]] uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } +} + +int main() { +#if (USE_NVLS) + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + + size_t size = 1024 * 1024 * 512; + CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + gran = 0; + minGran = 0; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); + size_t mcSize = ((size + gran - 1) / gran) * gran; + mcProp.size = mcSize; + + CUmemGenericAllocationHandle handle; + // only one rank creates the multicast object + if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd = 0; + if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + // some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the + // exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + pid_t currentPid = getpid(); + MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + int pidFd = syscall(SYS_pidfd_open, currentPid, 0); + + // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + // everyone else would now have same multicast object + int peerFd = 0; + peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); + if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, reinterpret_cast(peerFd), handleType)); + MPI_Barrier(MPI_COMM_WORLD); + + 
close(fd); + // end of ugly UDS business + // everyone adds device(s), no syncs required, just need to ensure bindmem happens after all this is called + int mydev = myrank; + CUCHECK(cuMulticastAddDevice(handle, mydev)); + MPI_Barrier(MPI_COMM_WORLD); + + CUmemGenericAllocationHandle memhandle; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = mydev; + prop.requestedHandleTypes = handleType; + + // allocate physical memory (data buffer) + CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); + + void* uc_va; + void* mc_va; + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = mydev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Map a VA to UC space + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0)); + cudaMemset(uc_va, 0, size); + CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0)); + // set access on UC address + CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); + + // everyone binds memory to the multicast + CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); + MPI_Barrier(MPI_COMM_WORLD); + // usual VA business: map both MC and PA to two different VA addresses + + // Map a VA to MC space + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); + CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); + // set access on MC address + CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); + + int rept = 10; + int block_size = 1024; + int nblocks = 16; + + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + init_kernel<<>>((float*)uc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + check_correctness<<>>((float*)uc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + + for (size_t input_size = 1024; input_size <= size; input_size *= 2) { + // warmup + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) + printf("input_size %ld | Time = %f us, alg_bw = %f (GBps)\n", input_size, time * 1e6, input_size / 1e9 / time); + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif // (USE_NVLS) +} From 7291380ce815e3a19a9c28286cf42c29967593f2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 29 Jan 2024 11:28:33 +0000 Subject: [PATCH 65/67] address comments --- python/mscclpp_benchmark/allreduce_bench.py | 57 ++++++++++++--------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index c141a9e6b..47d622c2e 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -128,6 +128,21 @@ def bench_time(niter: int, func): return cp.cuda.get_elapsed_time(start, end) / niter * 1000.0 +def find_best_algo(mscclpp_algos, niter): + assert len(mscclpp_algos) > 
0 + best_time = 10000000.0 + best_algo = None + for algo in mscclpp_algos: + config, cur_time = find_best_config(algo, niter) + if cur_time < best_time: + best_time = cur_time + best_algo = algo + algo.set_params(*config) + if MPI.COMM_WORLD.rank == 0: + print(best_algo, end="", flush=True) + return best_algo + + def find_best_config(mscclpp_call, niter): best_time = 10000000.0 for config in mscclpp_call.auto_tune(): @@ -140,7 +155,7 @@ def find_best_config(mscclpp_call, niter): best_config = MPI.COMM_WORLD.bcast(best_config, root=0) if MPI.COMM_WORLD.rank == 0: print(best_config, end="", flush=True) - return best_config + return best_config, best_time def run_benchmark( @@ -152,30 +167,27 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: + proxy_service = ProxyService() if memory.nbytes < 2**20: - mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21 if is_nvls_supported() else memory.nbytes < 2**29: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + mscclpp_algos = [MscclppAllReduce2(mscclpp_group, memory, memory_out)] else: + mscclpp_algos = [ + MscclppAllReduce1(mscclpp_group, memory), + MscclppAllReduce3(mscclpp_group, memory, proxy_service), + ] if is_nvls_supported(): - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - proxy_service.start_proxy() + mscclpp_algos.append(MscclppAllReduce6(mscclpp_group, nelem, data_type)) else: if memory.nbytes < 2**22: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) - proxy_service.start_proxy() + mscclpp_algos = [MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service)] else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service) - proxy_service.start_proxy() + mscclpp_algos = [MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service)] - best_config = find_best_config(mscclpp_call, 20) - mscclpp_call.set_params(*best_config) + proxy_service.start_proxy() + MPI.COMM_WORLD.barrier() + mscclpp_call = find_best_algo(mscclpp_algos, 20) + if isinstance(mscclpp_call, MscclppAllReduce6): + memory = mscclpp_call.get_memory() nccl_call = NcclAllReduce(nccl_op, memory) @@ -188,13 +200,8 @@ def run_benchmark( nccl_algBw = memory_nbytes / nccl_time / 1e3 nccl_check = "PASS" if check_correctness(memory, nccl_call) else "FAIL" - if ( - isinstance(mscclpp_call, MscclppAllReduce3) - or isinstance(mscclpp_call, MscclppAllReduce5) - or isinstance(mscclpp_call, MscclppAllReduce4) - ): - MPI.COMM_WORLD.barrier() - proxy_service.stop_proxy() + MPI.COMM_WORLD.barrier() + proxy_service.stop_proxy() speed_up = nccl_time / mscclpp_time if MPI.COMM_WORLD.rank == 0: From 2acddec015a7666123cd81e44f9b8e7c3ea68659 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 31 Jan 2024 03:11:54 +0000 Subject: [PATCH 66/67] add comment --- mscclpp_vs_nccl_comparison_num_nodes_1.jpeg | Bin 0 -> 60467 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mscclpp_vs_nccl_comparison_num_nodes_1.jpeg diff --git a/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..119e0eef4ee733480cd99cf5df49bd19b3c29890 GIT binary 
patch literal 60467 [binary data for mscclpp_vs_nccl_comparison_num_nodes_1.jpeg (mscclpp vs NCCL comparison plot, 1 node) omitted]
Date: Fri, 2 Feb 2024 05:16:10 +0000 Subject: [PATCH 67/67] remove useless file --- include/mscclpp/core.hpp | 1 + mscclpp_vs_nccl_comparison_num_nodes_1.jpeg | Bin 60467 -> 0 bytes 2 files changed, 1 insertion(+) delete mode 100644 mscclpp_vs_nccl_comparison_num_nodes_1.jpeg diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index af969108c..02c277a3e 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -493,6 +493,7 @@ struct EndpointConfig { static const int DefaultMaxCqPollNum = 1; static const int DefaultMaxSendWr = 8192; static const int DefaultMaxWrPerSend = 64; + // the recommended buffer size for NVLS, returned by cuMulticastGetGranularity static const int DefaultNvlsBufferSize = (1 << 29); Transport transport; diff --git a/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg deleted file mode 100644 index 119e0eef4ee733480cd99cf5df49bd19b3c29890..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60467 [binary JPEG data for the deleted mscclpp_vs_nccl_comparison_num_nodes_1.jpeg omitted]
zXc-fBZ?PX3?u$I7#}K%#YR@`AwEW75PX0Rx`iEgEurrP-b?#3fu1>Rpe9z!{93kN0 zPJPUcJi|w}5pm-{*O%G!Ckr$55bR-Wqcb@v>N6>UjTtP#oTlq@b9)zeu^3saS986( z$P4TQO<5E?J3>(~jGz>}O_-G2S$1q8`nLtQ_M{vN8Cd38jpajwYd4^2@k zEUvT986sLB6R1|Gvm(xhR^=Ssxt|pKPnPaqvR2wXf7k@TkoPAc4;x})>@P4Zzvn3a z%4;=}b?=*aJyS0#U6}CE5c?<48>3tG^t{W`?n(i%oN$YdnLAt1msR68SJs8zEDU_* zcK$N4WjFlf zhZO$1(WnuvM`Gw|an=T2U=ry~v6ZJyk9lJc?MB!h?}w>9KON}2>kc)`RX3}}(iC^z z;+S~VrBQSEJ&-uEIcC4ho(%F*t1+EOugAV+ce`RhTUw^qm*4|8=?D9XA8u-6?W+y8in^<|2l{6eqcB=>;iOqYkug(i2nJSn$y^E7ilF~Ysp|~Gj+J1mDpUDh4 zp?@1ERX?=YJQQ8yD~|p4ACJg-lQ4Lu*Ehd)$j|+_a5c=Yma?}*J>?-#46k(u8hi zjeq;z-QJaG4b ze+Iz6|AHF-GNeJfcb*wry_aFoxR5uDE3}<9 zikAB5Xr%)DrKqoz(LiFeCg{PBSa`TdWitkg;?pkJP@(Q&iy>QlWWJ#ct-ju+ga=9R z%ygNlgoKg#TzTF8ux@YnMdl)l6H1qHa#)I^^7f z<&V5|Z!sJ6Y2=391`N{&{OG?P8}WYvM*cSejZQY7cp7oq*-T8dKp$tq-ZXyd=6dUU zerx3pKD0Tfd4_xxhT|6$1fT^v($&(1T#&I5`O;cfk91tS6(7S>J^&xdlWhFUM^SM2 z-ifb;o%>_(JqF8>BNhHmo%j2Mm+O3h8*?YWlk@TBRD#et?3lypE0`V@{So&M7`7hw+iOzhhnO3jI=3&#jkW4IfC` zt?L;c?%EsO;cdXh zoR|N3HcGS$%=b0AJ$QNMxA=1?y9SRW*pLU%vNeJ8@vq%N8nN@c2L=41=jF!|HTxtB z_gwZ5WPr}1&Fh(kfc&X?Bg)7%@wo1$nC_gUz?cB%X#Pi3&`U4$kE@0FTO}JsdaX4#uF&l4u<9v(F)?)2=6mvwh@xmE>Xy0lWk_? zE6U{g+Gw+H!)(1dDFH|y7SoefTHgvt^N%6C0{8OD;*LkUd3ZUn zDzL3=sXBi!G~2gPhUjqH<(a^Cmw4>fnG*JAwoS)l7F6^~{0JP4Z@d4R!SV~);z(e)nG^xhU2dTBHs12L2gd_iLypD_^s zYe7@`bBhX)ivA>82LTZG|IBy-9j6*`>w$vAY)7GdFjVVfd%&ZU?g;t&}doqM}RC@nX z$|X#GgPt!Tkv}nOZAeDQ%U((-f1|bxO`_|E2CsE=El3(8%DAFU}H9nWI^Emw0oJGK+?@<<*zpT zE~i4`nd6#&QD&yL6es7s(xWbO4D##3>h(t<>WBgG$KjNBklOyvZ^{f z`s+oQJ0CD_K(pNB(XCnD6GU*J@M9dG;WaltEfxNq&f>E=R2r1J|GyBZsG$A3!5T)) zpA6Yp9~^%)7%dZaE)o7X^l4a6R!ROVyv^1J)e{GKYQh82N;T1vx>>VgVBV1@?2!uX z?Ic4X;KZE==xMrYdOa@Z)ICi?MPcnne7nB7F-K|M5Gpi>%w959h1k73)!u%)Q#&|2 zn|Hfe%nRzMo8e9(lzJ9Bfx7&|$*O`XB!a4sgm4x9fcgHuOzSy$sYVM|lwCl=HD@%{ zF-%>pDFdHJ8gl9O-4y1-7N3i8f~cL^i`7z*#~ah^uMcWX%n~v`m}_5t;FiQe8NIX| z?6OFzWW3Oc0Y0^Bu5PF=`pVolA))soyj#x>_OyvaqkS={*JS?iLJ|*RWJaQ}=Q@#I zD0SgGxD_^?_QM9l_pB@1H86bQJ)bJu{=)02?oza>V6hYaBUuftHx)xVgDXSkRQ0i! 
z8YMj`B0G)Y;eCfWWiY39#>b<)v|P#$<~Hfb-+)>?dihyh7T$9(yGhSqvY%`zoo|0N zwTErf)&x-`B+_%O5FS@&E-50rGT_0rDpEmQX@F8>DE$Bq+ z&M9^o$g%9tD%{jv^a<#^mJx;N?5evr!_LLn1WTfuPBFw6(uPo*7a6j@$G^(K0lYgpDkUo^LZfYUPyGH zbx4}45MHDzOLkc3CpTU&bw=L;BOGBpe_`YsGpqTL^59WuZB@kL=X@va(eU=eJTeZL zst_DDwu4!%aXL4t#T;e3C4H;fBrVIB*rUjD$%Nf%3N|KMMRg^6EQ#& zYE4#YQ{jPD(yOwoggND}vtK@i$CDX2d8wBIH#d$sn&pqmUyk)MOFKXG0K35|3H}%iVc43+#S(+y1s#=V=;JNNbwX;t*>E2| zF08Ke+&E?c9*J7#WV zK#?Efjo?D}fm5(TT8@6q+B?!RgMWg2lJeym$?wWQ&{m@Q^Hx3d7-I9jgkNBO>mTDY~@ z3elYtJ<3EFT+bvmBb>je(P?Eg;52;cUEQXZZ+l@gq!8}J#s)Mv% zdnw~QpVCyFdX@NeVZfFo-8vM8iG_}NdofMF;3Ja(k39wMX7tD(TjYWLk693F z*E4eSM4E_i71%J@^I^ZKtGxTLyQ(x;c!C?Vkd`!NFqqAsEf{jw_gW^|`;51@3=vc= z1e3aT$dw|a@WJtP6I#uTSNVNCEYUa`x#b*59tT+Y4 zYlJ{}olZp^rV01;gHov!ppn+$0I9U`B0v3K#_XL{Mwqq4!5r0b&paA9_R|iQ4#+$@ znQ%$jmF^v`UctPI7jNnne}amnYaH11pC4U}Wsvw?Z{=@i!9-(>Y;^aqm>EYYI~EpX z2e6CZT=Nv{aGRBmJzEHevz`eeTllvY4dke1cXUu}iu1m6cA|c?@pb0N#!KgJ%E}0z zCq3E~Ni_VH8Qq8tfC5pT>Y!4@mkSf?@cc@l6^r{5#4pV&a%>C`K)rABXA;#f>`+uf zr!9TZ*tpc)#k=DA^@;2P+;X;bueA-i7~?YOBhL}D9VkfrY~Q&H_-{sQXX+HA0%_0G zJ9L?k>3^V;uzm#yA6JyJBj19CM4)el$A^=C|I)*9IYx_Lnq_pWL|gDtaiy7y?d%MS zDRNKO0z=0#rfY!TgcZ!R(zh;1?iGwbX#z>;TU8ycR+O8dq&<}pU(v^(kO=UfNOOyBu1=5;GpWpo~$HAJESRN(i3XG^C3IDRGcquzB%0@g19m;P4Yf%BmK06B*t~S)8G)ol|`ui1^{_c}SCG>GFms>jVfEAA~ zqs*S&C|ANr)O2-43G%G1>1UBX&n7tAuT0^UCHeYV8`;7|{k!uAPNV~M{u^iN*uaUf zsdvbXjraU|iW~h zu9N6sT|yteb5aE0xp!9<-u8zSzj*}`>u>}9*_lCvx~Qh$vDo{!qGtIN>j#b}EhxPD zUm^hTW8Vh-Xv|w`;7{iVi5(~>3GkCbjCZjg{$f7&D_goCwJpGrZ=$^kL0mbT4`2xn zU`Ym2Ro@{5u>bWyX`g8En79%lOMO!<#VNi3cZCqh?Q8-5P04v(rKvzicopQ?PzJeT-kQ_J zsXYQb-xG=Hap(?z2-dH;ehS3oh zRcsEPEki^h9k(QTAv7d5)6Q{Rr_S@kzP#K*hYV8eA`XJ&hJ>_RqA?@kT6z0%KBs~x$aZVqcFEsEkgz6V?MAJ)fAQ@)0MgZ!x7 zDWEX{*MGoVc7NRN05oG^PS`wmVLlgr2>Es#s#G_ckT^PJCyclK4cq0C8%eQbvZ?vl z>*e6U#>O4IR356p&k_Rq->Ks+uB&Jg^zUfbC900(rG_ijXsUsWXZTatR;gxZ4|vOi z=4bIVW2JFuI@H-{f`an%ySAwK2t)gQkqN_t&_;9DG(tK#@h2v8j^n zFCAmjnQ<0h#v*}%{<4SFMQm*({-M%feRUmUdff{kX0^8DJh9mIS>2JxK+JwKJST-& zt={CWEN{Q# zYjU>)-GXt0f8?n;4xrko0o4XRHXhN#=W)!UXq3b7eoCw!azQ=z?v7bL0e|iBF{*M7w4DG2?zb}4ss;XVzIu37-d1GuCFAA_UWi{+w*g333PfA~ zw|X;%;#T?H5}?t4s8s2UUFqz)`kdfEditnyH`Gtvw6&#^f81lEmidd-S#H0kDMReY zAD6aQ(q?U2df$ez49tICs6S?D2+(uzUR|tS2rvzms7T1;PEK0>;tE_P!Dg-EZf{Uj z$Gq)kuce9LpRzPc2U-NATnZpnNdX7L2OJF2Ex{&ZTz|r)UguV^iYSa12?dhpr_ukFFr) z>)jGeX@UQR*0gLjD`ku7O{pJbjsW~*yXPN%lKhQA?~6AmJmTsmr5h-2obHY?)g`9^ zDebp{TwN9=>jZJJZ|m@OLSt7$g*lIUse;SK)I>+i=?*3&%mxa2Z?s5c6wx(%ydJsv z2RWGmT0~aQZ5TQ(T91=K&rFl@9$s0l1TMWj{*b2EGhyisv9vvGAL`*Fuv)VuxAu(( zB1^MsU*ASOwRt9t-5Gc=w5OO?%a}SL!B*H}TW=3$x@eE%1Xq0S5#(It6Oyo=hqG^R z>V|Q7zPkVEwIeHS8~61x?;RKUtcWcszN@C}rLEDYb84vkE-}+v)ny~J~ zqc`S~8O9jFQ`T_xpP)d<9Ww}tGi_Z}Y3VDAe!{VA(HW-)K2-TsKYlFUtOFNRRCtff z@VGjK(doX;^nHel^<9Z$;hq@W-ZyU>vedsdb8oM`oOPhR()&gwC5Ziam3F}2e`ss= z=oBm4Bt90|aQ#)&iL7ReyM47%Srj~GIQ7Ou^5G@JIE&d3GsPBtBLWsw4)~$FV0rK|p|&SpGrr=)X~-FEd1LHhlk{2wgyvOXKk zEH8=xaO3Udh|XADZ%vTTB@JXxAuyrf|bLLL7A1 z@!NK{f9b3GpAfHH`%B>e-w0poZBCC5DuVK;oV^;Op0*QO9yvSoi0D`eNQZlKfDn<6as2I)xukIuU9fVNEEBmkR3D;E9Syx+DFn#9lr1ECFt zt3z@v7vb8L+c?op4zkd1ubU{$K3muMoQP-xJ3VmW;?8c?xne2GZSBbuoXEbaS0Q{p zh~ii48_@G{_&ZOqniG63NY@*9>_H8HxT%d$x*R?QM)L7NkWCoCu%2;d#7%+ADkMht zI*p;}gxz`hC&!G*Y4<3JkNXICO}W=-VDLApy*y&DLAat}G<9l>VfZ+LaNklpY*AT7 zTj}Pa^4%z+rRz0bI~oFr$1&THJzfj0Aq;lqL-KDDC8y=xZ#u9kvNXwsH0B)`@;-}5 z-pz3vhd~>~7R+e@(p1{|{-$kO!xw>_@iU(V(^rzf9v^#={gfeHZ@0N7+lvhJ!Fz#; z>88oKmlCfA%05nJ8XuqVzGL&nz)s!}xSOw%7k??flttmK1Q5?`Mu04V)N4vcT^)Hl zMATm*9$T}ae*nVNrq%u7#Zp)h60%B+-3*=-AmY!;>ZR$fU8DT+7=9)){jLrTJ+b+G z#J1U$Jz_XtUyfiaMggzj1gb?fvzE=RSdwAt0JI?11TQdMNI!MIy@5Sap!Cu>=UX1z 
za32bt_&qH7pI-k>Ceh|Rdq!vhcgT!H*eu$~l_WY!EHc>b2%_C_>VmVbk{S0*!u_0_&@u43E{c`4k8irhe?ndRAh@C zn-ap_!g4NgNmobl^6s)0$jEu`oHwLwJlV?E)r7V{7ncWNy@0F*yN$X^pvl&8oUid9 zvDp!Ctt?<(8fy~hXC9eQM|Y*1X3NMeCRYg=LVm$3Q(FWJ+sLoG;q*J7c;s!KFnRiy z=K=)RBI;~Gx)Q{qp{4%$XJe^z<=k_Q`6A6Rbt73>A$QkowrWBhHK(!9>;oDQw^xr& z+6+T17wt<17_>Qv5d-KY`;+%9L{#oH-9~gZdcHD}*g`6~auCDPw!{UAd z5n*t=FYjv1YTQ)D*2uKh`i3i~85`Z(^Qoig@qnY#+Jy#!W!p?(>w)CMH^4&35Ir1B zY(>8IqbK?OLKb1-ZTqxbQc|*xX25&Z`wc$BjG|lpH2)P}G*vb~kd=`9Hq}5ex)*{S zSm`^sFR(rhZ7k2svX-`clT2h0tgEmq896*Y#8#C$V9(N4RW`?QvEJ`e0>VfHEU$6_I>RZIj72;0(^Ov36H*3xJ|zS5G>Kf=J-0JEO9JMzU{`wg z9FNMBNyPH_u6{snw*L5#wHaEw3$#+ZziJ}LOV#9HB*vWV+-mRhAmgKtLb4@F#52^Y zyt7;X0+(ppKORUlKvEtY&6c51Ao z869#77J=}YSi1gZ`d@KkzkTiRA%$;MCtm446uB0u7`aRthv;eff;2O6X$sqHmbSGj zk2+@y6?yblb^pb8dy@Bob8}vSCt2YVvU&rJy$+^lTz4XGyaH+VPILEbC}V@Jfj)>%^f1Q2m8K5=4Z-o zxoTd)rp0>%J`A;z88BlvJq3UL1lkber*rqQ2=Od3yl=Q#f$w&T?6HR|cxAFhsn+E+ z-&f&5J63rYL^kQscbA`cq>Y60`nC5w;{Q+HyWHMpdQ@5-wXz@-O!&Dr7=|W~83w&O zSBq!K;ctIYK&I8&iTi%i_&%`3PXRQaG%;)?^CcsWq%JY8Lg#Dl40aM6#3q&qQu`ka9wgK+b{LCeEPv9kR zNop4wQ3aq7A-Omv#?@*vCeYJ@0;=@U9=UMKFdn3c1Mx>H>|CW&#m=)tOQB&z zg?=K&se>f_lOYI^P)~{Lfrb>+r=Or-zo39#C;ETJ#Kekhm$*mA zv14J?-zXkyndmV)$Rz>wb$KXG*}$Qy@dJ>TDfO`#9xQnnR7*ZIygBzVs z#Rz%l?1^}7cjTUK+&z%Q;eBJAGb9Ps6v>iNPmkZDXd?b-952S0_U=ybytp}ktS@X+ zgC-B2w&kIpHpFq~axE20Kzsl~#r$b%0#94u(-a^w&-u#^i9gd3|4#b8Y<^_Fdw5dr z@+op>hbb+AXs+B@R^Nf)mOSaRo+%1J4TX0`=VjFrSwGy}{2cL?(iRic@j?V@C?7UP zOgJra?ZouB=9%fs*&7;2&JR^S)0~mEIG#1DHeU1;^m#pz&4JYnpAs=2E>W&0t+_B$ z{kYdPs0!Jo~^WWKns)?u%@Yhj@xuL`!egG-- zojDV7&zww!wCj$)$Q0IGlE)4iw(5FL*q-Ls1xX3{0h)ol(v0%K{E-6=B^%-GMy~t$ z_)kDOZ&vxL-`;drYMxlD8ciQy+#-j)1W84sbwx8#J*fV6?V7gA3u<<;t*0urLb)5~ zapeg_5}>H#XES0n`!&wq8m*i|;Cr=?Xo9Une`KHgc-Ow{&1=b!OmDF)cpO_EIaQYH z{RlUR5W^2m;f7uT)o#v1Cr1@vRe}YA_4pjGT(Nr;-PWn$C$ z4R0P91P6guGjiSuHYXyd@r9cuE?&IhhP{V-bdHPrPBN=QKZ)k`-Q`xlR0-#-#OG>5 zOBV_VpG@6;et#^M&6tg~l)?g4g$?d-pk++)P~#Sgx1`JU##53RGPjH8H{Be6t+<=N zqKoj!;*VrIXfeML-uoDaYoan8eC@+f%z{~RG^{$?Bs#R%$)*3m&|;Co#2WSmU?_ne zuFX}h?mVg*?%hgz*`ud5FL@gJog11Qi`cy~G@QcaLt2Q3ggFP1f>F`~08^6?C{PQG zH3`^}I4(`Rtt9s!_9Y_4vOiJUZ%be zA#%vPPDli35@WX3GUKQq#^6xdxTj1iIo}fd-N@3`&IOLuGA7}ORx4&4Eeq?HeA>_+@^_xbTjJU;*U)OP$Vm+LrUccXkpj zYN3a#gx|XNjuTJdhe3)gKS8|H$)Ax>Mh9HkV=*_zf-iL+l^Q3`euBQPAgNCSvevof zL#30ZjjIE{-4~-`DAL0TPdG&%94NKZ@4(0`2?v~Zj@vcqKez!QK)FnF~p2az)%>dUv?2yH>W_9(Ri{txfUn3o1$w_sRL) z7UAlw`UL0z*O2woeB@3d0)rqA0qoSMb><%SdWd*3FNpB#HW6Zz%U{zMBN!&cIagGG zeMIotC-Mjt>bMks5JY_M*6-Y%Z0wm_9Gsbv-f80X7uocmI8u7u6ELFj@T~0od2UJ- z_UQ{nEHgK;0@ZW6nyJK4ApAHHBJ+&+=AurmVxhO?yV8WD!?2eYl!jc?KS6{)LE{ZO*+>3%CfS86LT__( z1SNrRQ;q$5w}+O6T-q>>0OoLAT&kq;GAwCDvy zi~b|I=GV4+l$Skw<*J8<)vT+cG3&gCt&*kR%o*mH4L8jTMMax?ZGXkIQ2;dn znwy|--4~t9Rt=iAqD(Z8+DpCw!?^y*J#w)64u6VR3bRQ{7a3h?L2+*=#_A8;Tk%wf zt^>QAXJup9SAez+>4})?z##<4N5iLg^-(M`O2`F(;kGDTm-`p;@eM#eb^_$1U~5;p zbi+&w02hN}{e*iDxq|0=?*78PMfZkx*1J0uz;25r`0BbrH;3OT#qWB4bevG{$;8#ZzW`==@U+U`O zK*xD<7ke=4l|8+<-YYVM&>|_Z-+2IVD!)he0ZKT+f!+4Ele{9}Rr2+5zG2X}n=K2z zDHvUWEu;A7Kf?MbMJJO!O9=`6j~m)zZxS=Q{pGCbZVLf@1aE&iOBAbs%`XiJ-ztWK z=JzWtGq}<()3xXTR%1HOtV8eOatk$|wW>0LO?NB&{`UtqS8RYv8o>jC&+cLGa37F> zea_vcFZcc`0%qJ2bePBWo6rDK!=rX}|CjayxdwLIdv(7U-vdzZREih4OJRa9{I5&o+24PZ9jsC^aZ0jy95tf&B@#@?WiS7RX-dUu_|$f&zz3%;x} zLCegWk@5k6E3@#@p33Dwr0&A~YQQBeZn}FEnnP|v*~$}6p0nbEa}E`vA1$k3kBWcX znI0m4@6fAyMDfXSPaVNGfIlg1q$@hXK_@)0(af^6N_{Yxm2WUR7g=BYO@5F3n_W!V zRFJhZS$<34b5gig#Np9OorKsx?xEz%rf(jKRwRen9VtBK>o^JR2_c6e5IGP1sN0Mev5*9 zAe*a?!fHYl-%WkRjK}p`@bB+to_-5(D-wqvYNp3`4+6-oEWSevur!OJECQp51T>00 zgt&zexE`7OY&eBz(2C8+j~-x&FS27lR&y 
z%>T&ki=#WbsQ2{$la@f|#{WgqGQ-u}GJArs;LyvOxKQ6;Kd9Jw$!%=$d?XS#ran;e zNp2U9XrF~$$TtOb8m z6$4t2P7Hr-J;F`@bK^;_%o{`O{0$C8K<}qQ*8A~W%x3m9fW9d({wV=$`3 zfipSPS`_OSBm7odn2L5PTsSFg!-GzPC4bu-u2Xy=h(GY<>vUXZCo?CSG#S@QYVmG= zadk~4aZd(#I%L+;$Xo5*i&=(uo*+V4nzVfq%*I?#ZF7t%Z((1kOR8Qsx%HL4C>JH^ zV!<25r>x6m!S+Nps?$&zUKU5b9iO?dl~)XH`;7xQom-*djzgciRfh7}a*PHfWlcH5 zIfbkBWfFCUA)f+?%>&sTIur?`O+0NpQctK4Vtx$g6)f(tYE7>BWxY1P!Q~vvn~n)v zzPE9^x6eg^r}gf6)Kv0~gn}JzIvQ`M2Wrk#VfaKMK_6uKe6zq`b|*~Al?d)n8NKY8 zSy^7CVWQIWc?v=kp{22L9~+fcxMK^BykjeTd>5FpQ}|b)&7Z&ar<=IH8q2x9_Uq&N ze|pUM-~ZmXN?_vtpxgN7FXC)fp`FO_gdmAa8K zj@?n8)N#56B*Lb=eOhpP^$^kK*DXXwXvUk<(fPW}d@I@!SV%&YyHSxqGqk+B;>8k7 zcsgW%bN&7fG5Ho$x|d8iGEHTz+KwInnQxNsJG7yMrG%yPK{j*ZPQk&)g+&2+Vnd0w z@aZonPYMP;61xpdjXv}ZOv&sYz}T?_#?T2Jm;rC;mGA%lmHpR|qJI}iTG(lrzjoo? zJ4xIL*EDq=H<)gqb(A{mc8ClB)`CI3e}dL{fFUw-FMoo93}vUy4_I?L;12b=yg(B# zM)ZPa!;=7g0KO|d6FL6WLF!&C)AkAbLP zj&wKRZxYN(D6Lec(R(lfoKKCVwNKqeKtow7Rk3#0x;4)L^ zVXk4$PwKc$H{^$np%1j~A{7w5y=NZCW&!8PWQ*I6%RK>67Cq6 znLYx-3sd-Jo{^GC)?&n1T$sgMV*=h(>dE2+%sl~hK6qn~M~&CnOp*?IZ; z)U+v|f2!${Y0IYE2zqQg1`VxBHgfQw7<|ytj@5s_8`LQ}Jsx6XoIugX$jg&~p=MSa z=-A7%f55fTWvZBR-*F`hz((q6aj>+sU(IQ_w4Xpz);EDsM_{>+|D7I-`dfe3Ygfu5 zqoqbEhjcw#=V?BwQ4>7OWhkhOOQ!Txqr@3qR$Vv!yr&uOG`gpyHcm4qVE9^#1!sYt z^D}(r#hDK&DI%pLXLp)#q=B)&V1808&)g!$+9CwFTuW;8OJ0=s-p|>77MrtaEeF#I zmRI|OI|C}Ch%A{@p^wx#BG|Lvi_RrR(~`C4nPj@r3@ZBnzn_BB+aJFo5d&une`Dyf z0v$c6tb4U!5ylkLBv?_kcPmL?IVvGG)O|v_8ojdH^3^@nnlieXSsP%&-IutqDY(k7 zixZCvF!!)w?U>O~)#P7Nc-gFGBx$=7VjFcSUPzA))UfyS+Afq)sSQ8wV>=-Rz{$6X9H;!YvM%<*_gBA6|=QAo@FdiWtH|L`G^FbP8%m%EF#xmj89 zSsuy0l4iOB>1<<&<}Q9fhA}yQc~qmW-Dn$AMlH0#YMWK1=45?xGiH4wDkS|?%38Uq zew8j>D)?@RJtd~zMM9ZFJ@lc{#7M-BYef!3yyJ%u9&7{oxNf+Nh>Zdx(ll~m8xaUI z;X}{4fFA92a%JX8#qkr{FB`2z!LO2iUk9*F%KjdSVr3uPpscF4+8)1niFw>H&iu1o z+Xi+6|E1C=1P_6aT`yf_!m;j(hI-s~WG@%>!G{Ea^OdCjT7p!gV*>V8Du)wY0y%X= zFO$pCxnayiVTmMvD~ae*)N4P79Px`-vSQZ}yLl!HV?=H6mcdi6luYeXRgH;L@^C}rSZvz z{P8|TvrKwM#)jhvh5jwJGEWsMF%EVKv$-(##r$V32E}=~Z%k9?4a|)0Pf;sXaNxtu zESX~LKD03sM=G2FOT0>ph>%Z50YON$uW;1&ag*KV6D)I~>D(kC{Qggd+QG@b?b|o) z?d2z|`?4iDjesr?m85^C<^T2TXe~|^Kf+BX4vBNEj4E_yx|8rH@{+$21$ejpAdx2Q z!Io>$UIY4+PvjZ1jMrk^h-to!t2}p$Lh6cB+qK>jiOMF_1!3w!rZ#2>2g+(b@FA42 zkXjLqJXI-5Z^FwW32%`mBpqcQVWY2c$2qiHXM9pK^60(qspN%YeNj>vm~nlXi+l(T z`+WhEJvQ1}g;H2)NxMl+8nlj*^cH{Wdo1@&3guyDW?r>lrovzdgzdY`*1h(8T6enH0f-$@O~4wtF%Jru7T zD!cQMy?ni1go^(=+7!^TRwwD|YMP-c=ICmTS68dc%#e#G?=IN$My9R92A?G`E_9UQE%&Gc#;wh6u60RBH)EJ)L4kf1Yh+Yxv|U56CT>Hm z6O-b8 za)wWGw2yhfDkCbX&NvmHxF(Vw#w`-9Yl$i4#SC^@*~>Pj$&`&K`Cn8(d7O!WwMjF5 zbGoOt>_v~KoO(5?M@Y#Ud!8?;CjSIkI}2#?ZI%umPLe$xHn+PFWqhX@+V}x|>FF@u zf>O&!KCrxvKqy7i&&Vhx>W2`n5{sayr~*SX=$05?R(S8dyxQ<7@z7VqQI59nlN_H^ zI!0YAV0PoxNa)XhG~;b>keIT^SW8?Td%fD3G90gL2F`JA2j++W1PM*o2OJKC9Vj;J zMNU;K`J|b;GqI05XEX1)q-tgW!v*saazy*L?D)8O$oSE4>CcL`#kNgcd3enE2fz}d zvV@N{vWPK3PhH-Ax*FZvqQfF?eJmf%Kp>TO;yyc9jY6qOfmVUHX#WEr=>Ia3yXud7 zQV*}@RVWcX$+C8RZY1H#<`Os&5kMCR4-utgN$*Rmv(s!^hkMtj0MIT#GR zD_!3F_<>oXF(>Tv8KNi#rK35nL=1(jd&h+4)| zRqJbZW^mRj{0KT4EI_PP23z4akEOdKP;F+tX5Bk~1z}DTM6Q@6TEHv!DY(vcIZrZ3 zDw|qk4LaY8Y^#x;$622ap(HfUIIQ52DOE`VJWU5A(IH&w&4g+QZZB{-8Bq){WiTmo$M-n3et~$Rwn9 z-=VW+w8Oma`6%I)ddmHwaL)rbrsrQs$=z?U8Ut2m0#+Y3GeP-(F&Z5(T9vPOTYc$F zGNhM~t=}ETx4qasY4PgVOi*KvW?p0Jn$2+q&10F+D5}kD|9Rdb&`ypvR$B~Yob9PB$=F97$?&;2-8MI*ayJUOy^!~? 
zX(`u$CGW>lB(O*l6tVK$OuQ4gf{wmRxVoEJrlH#OdN6GW0!}S2^VMI1_Mr_WYO9C2 zY3=Go$*GHE@zm;}Sx{Otd_26HYMVG%4^9up#?j;*N?Izfmh@|993Lw?lpy93+Z}JE zv~fBjY(tpFjntZx-eU~$$%|GSj)1fGNxgsVqTgc!>eW7K(RB}LspngL>s-r`Bp{(2 za<0(`Oq87cj-4?lxT%rgGk*zgio(!O9Tzgcy;8d z9NXq4Tyy1aS`7uH9&9Q5touyk#el0t{eUF{5p$(;veQ)hTI^m^zKFW=y^9K#yn^Y< z%CVCHK~X;(oofgS1}Sy>?Rsj?=jIaW~)Q-uQX1b_a9NaybIPuUxe zT9YyPi8X@W>*>+bo02NjHcT z4d>OZ1{LrR=fEK4AF!xB+i|0+x5+woMJ@n!hTqn}$_$24v$c3AS{V-Gl=<~?(471+ z{Nd#5KHYY`o6nxHF9!2i%PJOn2G^-H`89XsC7TNK&=dnZ1I{b5P(j?Rra$F+CRuUN zN+00*kP3+VFTce7>TV93LR^|bou3S8>Z#k}Rlcx39YiGzok^zo*$(rD1ePn5_fVKU zT<=sC?G`!^{4sLQ_s1h+SC@1ME}a(lP_!3?X-JpdB-7gmP`k0kzD4xi^vh~yaMw8h z^rST^hz<8pMBJ_YXM6eM#dA0pQQv#BfLuvg>suO7Zk9xiT)uGmHIQe`eP2_L^AA{u zJUTejWkUr{uB-mk_xoA4_=kW^SV94g%p{1BCzRvr0rIkR?smuJU|I z;EScb4VF;UoHmbPio_*&v|~ZUq>Eub%Nw<|7NhP0vtrWlo-ZLtO& z*Q9|Q?m}vND(CU%G>U%eb1JAYHL z+Ok;#-*}24=yUJ3EuDE$&At_4X9F7j-f>;ynpBJ zo9}Uf6u0pE33`{ab4g^b!3VNBCLr+3Fyh|Snik{#W=lNvaz7kL%yvK9PNVi+uoG90 zsb#YRchIJ?`^UC^7?^D@ZFAC6<0D!f*F`f$W4!r%0FPRxp2~IthBL96tB94(ryggS zKwy5scj^y7%+`lIDy{J3(~|K^p1xSNGYqE{U|*(Otib#D?#!&mXfPTafNYp=-aQ&} zbq{@|>HysEtXCfe*_?cU`qiT5|B+KpuauWvwTisy3)6IW=^&AR$%^l+x6E7w zTd}svSg7LCz0sVpr@GKhaFG!mrL}m-_mb_)WvAWQGV~4Q>K?BW*D*8sX-Vv6C4xD< z0#R6M$qx6ajVIXPI?mg{J|$^Fl2~$gw23)6yLNi5bit~oTns1ihL7C%uorv39*1l) z)^F=r%)nyZUH}8iYZkD5n#P=THU=fi!h3}(-PF}d@89;dd3J|$m9Gx`^;BZ6+rHM? zFl);AggweZC_&87H+`}6eNI9K?JnM7B;J}_@2bQdUYe)ILeusG7dth`T#3}Us#pLv zs&_-R7lG(f4dh>L|4GS#Aw$vDWB&w(3=G++#d(H=#hQ>ls_BT>V0O~~Qb z(aTT-=o}EGM?%GR^`2?olT)p7S5w$reZwzW7s`!&Fi`CutRlV859rkePSK^yl0Oaj zuO=Qq0O|2v*YnP8VNJhz`17XYRx1GH-8FlYC_P#&h$C5(|CA zul|vf_&rueVM&iJwT_x_>}YM|+*yP8x-!F?Z$#NPCTd1UhqPZExmCd*eu6%EWo-&D zf8wQwy7{CnuP`Q^drlP|&-DXXb8k@Dw*3xne(ZJo(th1B>z^PEwH2gJIkGoGAOwQ_ zWE;8ccrvDbGPZF9#aWxc)HKM+nwY@UV<-l7A8Kao;?2$C@;{m0_C+3)pY998f)YM8!{%x zV2*HP*04Xf>op!%yZo&hg6O$DoGiKyhmEcJwawFp(C2|mc}0s*%;e$vM# zeV7EPnp-;_N1?)$b(jm<&Z|GkI|{h>7)Jvjl0*ZWBE^N}k;A%vUwW+u9`0{VHkLcL zxAK|fC(diA?q~)x?!6u8?l}IWf_>f`iSl7y31@BNMTuBnvxH|O zZ$B)jEn=mO+13zpQa+3ev`v&LEh#T7lQ0#f^1MN)yr_;QD~4+WKcG??`w?x`b9+ss zjz9jXdV*zxEX&x5f3ZXqc&p%(a~jaR80Cv%9p3*>AfkWuH~%gh@te%yZ+*^m|o z39ER7G~(UyR>?6>tI|(>#qje;xp_CzhSRHLNd0e<;%Nt&EP;MQ!;uD^I0VjqI&{94+t#FtkcJMD~-@fccx73mDH2b)cWK&hwy4 zP+N@W87YAW0m*DyY9~WnP=SDdP^*WLg55FCx3jr}i-;l6kA6~HJT87ss0JQDg4molp z0G^qu^l6W-<%sRYHW{Co9r+1NmjL_7z!1-7C>7$<*2;0fMab)v+7=|I(Ypmc0(IKU zN*ocSj{t!>g$HO2uY_y&bXcHAuKPQ74L!++%F?!u235Y3n_goa_wA@0i7j z9!MXhaT0?-{lHqINQ!DzVqPp3OJNNcj*26sNP&zd=L}?zbau1Wa6U)i{HQoV9Xcx^ z!(In$vDrJ>)3iKcw9{gy*kw@SC~6W}Y!-A|`U#?oC`Q;$09@lPzN?V)tZzN5lVel$ zQjA=aqay=$v15h8B#9$wXc*5=5ci7X8iByx(c%u_v`a42UKWB~DSES>NPkpc1F}t9 zo?x&~Zl_J<>4Z+TVj=9l*}xnu2`6q*s;+oFZ~?mv|I+SSs(Pe|=uL$+OW*zpGHi00 zp)3+zez6fmcK|w(uEm~L3E4AkvMG{e;*(aiZk;3{&`mtW2S@HIlt<71f5p9ZRF&)6 zFFa|aK|qicCW3T#BQcrMol3`Kf^*7vyfUi)2p@3qf( z-}8-g4rBNS_weDl@B6tre%BrP+>!&zuKP72$;N*Eg$6=54n+Xy5o&&ZVqC4tRuNc- zo0Elj9*(tM6h8PwV1NBtdCL5;n!|~Vr!kiZgyA=kgZ#NQkcWPd8Od=q)`H?ie3qcv z2=(#R{Bb?{-A85O&D}`Y+53^(AMzPeKX2>YZ`^wTGpOX)M-5-kl~^K+*teqh^lpgf zdC+~+Fi;kmHQ{HY5qn(7u$%iE=q!JFg=2p%(>7P2nx;;B!p?*r z=->?kZ@2QV@(1+{`V>Kb6syzkk@ZURXV<#!}DQ_<mdUzErz|fEb~H*U#47}_s<_1Q+T7Qkz3;Gc_<7 zvmTgYI$S!_e^9&D429!Es(WW$2ALd}1WEkj@-iWHO%n$dTiOVy1s$>FPlC)f?7PF+ zqIb4+Ine8Kz8oEn0+xZa+|oGxZQk3BmXv?ujV2qT+o*?0WzsC=~zPPc`bvOnKv zG~Thu+!Gr4sc-3@XbpVaLr)7tqn>YyKPMTrxmGGqW*F2*k7ft|c+9p;*Wi>>?*#y8 zJ-1%;KKSxp`zx{KeHmuA%lOOI<(GW0L+2!F)Z?X$d()k&5q>9&rl;DB)0=%G-*lwI ztFoS}77JqKfHVrr%rJXbwFP0O%v0_qj2t>ZW}7)m`aksOBL5#=$20vGNAq7S-R=Kz zFKxL-SEN{2go~Jm-lCDdfs8Zkb$w;+nEd8X3q!i^;a@!jF^;fyx3lnI?QRXKGqyU( 
z+GG0Z@@grkf(&FX(Up&0?<=TwxeZys9^+MbKWa{(gNaZm? zhsWGc%s<4HxCwz{gwOZ}cfMd8vhsOTMD7*VQSiR=tbA(v({Umpy^ZoUSIYS=^EPUAXq6&{>&kp(O$0 z%OZ}}_chwd%gIk#?M>y zf+i%1v1s9}rK_MV7^MX*bj+ zxZRg+H@VRoP&~h>ujClxWY&7F(p3CKDgtQWaB382#%gaoPLIiPb$2f2F@_2ZVhhk% zD0hhZj%T>L^0mr7lZ84fY89)y=kE_(kp9r0=|t-uH-bJsut}G+x)eOpX38#NAsu=gFQ+4bbK6MaT-u@dyi78bvF-2Yj~5wbI2Ks#LuposAhnG1Lj}ZVgE1I z{*Tyx+5T7?EjEr9Y(Qu8kb^rf>0GMlvSq#JtmgcStOH49?lzYrBRDbASHBMLT)&b0 zW*`U*{cvK)?LSeF;Bg>j7{>lKllE==JM+i{GTq<62}mrp1x?!LH_9}F8+7v-4^vPq{Jgxp(B@j*=CUboKw)?;q_6R{`~n@*dfP2M z#z_{A-<66>?9U(5et$a;0IC&y2{g!NiUFrXP4>IBZ}I{FcY)O(zgCt0SdIPX$Myh( zGa)Hp7qGAfUDpI*=H?z=SlK|S0%|M=Em2v}zAv-Nl?rd{&$B;Xv}a27?8)@uxd}Zv zJN#ok*LnzHZ500cW@ew(HE8o+FD&Q&;Sl}3FW>Sl5panoO*?=fHM@Vf_~*44mGBfw z4k9RCbu5ovw1Hd`+B{mNgnNG*BTA)s;^ZW*jG~hCCA86#oI<5hfC+T7(GB*H2oAm{ zMgkg#A8f`0VVjhr`v>fR-d?0R&e);(u~iO~~gbg1wblwhW)WI7;X1Gb)If7OU4=VY1)Mmb@^ zSl8M9Z(jS~KE*#Sc30^B$+lZ1M665rt7=!?214oaj+UkqY zHanWd+kM{Hss|uWbHoI zMDRkq;S9~fQHgkY5Snxl&>ag%V4DAuz<=?=SDaa1So(iS6n`y6Xij#@{r<-26R&+_ zz8#jWe`*NQOCr}4b&}69;%rM)mEWixI`|;PiX^A63df)W_ddiszqIp;|?k z5^g#5qRG^Y?cxKn(eei7Dq14Z7JtQh{HilRrO4!gh3}S1!Db9YwfsGzue15%18#lB ze`0|Eyn%mFA%Atg|3o(bbXD&FnZfr7>K3N=P`D=7O0H>PAQxHk^i-0yQG9(0Ffjk+ zMfFd|5b!tS+T5IoxjC2#wV(;TT-QCd>g~|WKlH`_P>vLC0z5lU>$y5*Dm@U8Dc7W` z7p_IwJAnXE^^*WPcCy06tZ&)AuPH;Q(1!4O#gitg29*v+Y``_)h8?xpd13wQRhc&mxXNEY&nMVpeX=LlL!vaJ z!u&2(N|ec#dA_h2W$QG%T<{Wlw|*<(?d^ zOo<^$L_{|?;}kOyLL~k*CX0n#Tz#gWcBBd-N4tf=Y*T>C89|2X3`L)IRr9J3ZI+)iGRsYp ztvBvTk|kVSs>QEQ?007YZfUJ-;7qML#M?2nD~iNZ8e^Eah(S(S0BiqD$*r@C6#a)v z{2X6=wkv`=mZCz-;uR*}*3`w^yCp{NL8t)*2Zg@0SyO}s-Mm@@7@toUr{akUWgw>$ z=7{I&;=>z!6@-wz__(JtkKy7-EfYffriOT#ltjKYQLHG?qS(pbtuFg5O0+o9WYP;H;xoiygspj`e zxv}Y^%@zTw{ApYCulp*bwQ)Ya+aQdC4e?6g^cUM?nvGS+ehI2D%1aqmP+aJlkNp6W z=m-f|@DwG$7pj2%f(LwsWy?k7IJpRnuvA!LS`cQMd2WU@Cu&F>fVdi&voOa8_lhs zVwRS;l9S}Hip7SU8II}}pjJ$zgmyjqS%zntdR8>Yn0z3ZdE?eHL_a?D;-Q!{qIAO+J z5Ph?xNIu_Y+MQf33EPI9lx~Kd(8&)kNc_ATMrUz+p1ydIPBk3liJ7S$O3(xIe2Hf% z2Tpq?hM`g*iHl3qs}^m!Cv$>2MS2~j!{YEzzou5bhUryvc*mzd(@p29_& z3oq)MXFgmG!p7Z&&!v)%F_t29YV66aL^Fbl=*J3y7+i^K;jEsa{`|s1FY&apSpZcf zK;@EvQy|iHPZ!(Sgu9ttnOeklQS_D~#F7?y>x$T%s5fgX4btzL2N#>qO&-U+fO76uW?br_Nk}Fuy(+0H@ z*G7CKW{zSaLm5l*$BNKf4>n2NV%Af2`ifzQm;b@n7nVN@_4F2&f)gDgha zQE7CMUL?os4_a4tn|Tu}9+7!W$!!EBn$;4B;2T?`AmdZpPU{HU=b^~+wQihpTu{|j zB^?S95E#u%u!N$}t{}IWrySK9S&qCMTbp3j`^rtKUj0l*PF*YU-6cY{ zeDe?+yw-s@J6E_E`oP=!{jl5c`yLh47&*tFC>_6S>tkzuO>NR6Z~!*X;FH0=eJD>u zBRE5FD+2T_J7P-|UT%ElC;4bf-^YD!Hz0Xq!FOskYD*(wn+r$WR+Uzr@Ui zw8kn@h#QsBs^qL+H=J5xJ#;anz17}80sjg^?&S~T1~n@d_NjWE_o^lGOuf{A4Fi;!gU@`N&5YtZ(cwshq@kad5Bz z0H?S!!HsX|ZyHObK3Y$wr0uX4!3YzxL)=BYKw=}ShIZm2pU3$a;P3->yZovfv+_jL znp6T^yB}0en9}IEke(~l=9iHCge)5yMKtKo6DJ%ijkHsLNoX`L>x>$xU?F+Z zbOsRkL>*dU+GF47*2HSNz$Ly2zTwx!oInY`7w8#zq5`VF?h4An`Te`Pt@|e@DziQY z;;}_@dC|p^(I%z-l=Dzp^B-iK9Z`tpP)P~z&=q8gOsZY=!Ml&9aP4lsC$E~EJ5G%a z<7VRhHQqgOT2)Ld?C&`g5yLmGI>Aj?)UC~urzQfa+A#iwhzXHio3BT5Z7 zsi`?>eaB#KV@5ZxSr113iNfsMAh8!eYQbfb*#?$>5V(7u~|!&lB-aqNFgP2c)?{8pgI*yc4u z_Zh$=T;gsuNEwd(*X|V(N?yJ|sf29nJ12Q!niKxx`!>|7#>IT(K|eRUZj#7 zk4-3aK2prmljGpcLVeO)ck73k#f1bm7=q%048?-Bz{zuK;)hyHns zAywhIL$?ZuhqsuXDCCUff@ETMAYwJu#eQCG=YBXric55g;2<2Y?KRIlW7Icvod{7` znlN6G$P>U|;Mbkv6GWQv&zrw`6bGdanhr2*Er==S^;N#ceY4AQ!=z4%_zk2~daVraZYsSq}2r!lW9g?PS0PRD`A|(O6GWQ zH$}B-%t4(4-B?74p+-)2fNEJ3bRpgLy-}1v7{E2WZg8B@g+~DNLPff zrP+TePfr0%Li;!QH@@!*YqdZ9p#X)`bQP20>OWiMK<#s`cj?ub2#C+Pzq@~%EA`Qa zbH<7<;i&;DB9&tvx%`E>YT zn_G|)AxYJj+}*T)PjS3|o!1()Ec(K~Bf25gJfV5+l;7JbNMGd)Vdpg`zc{ThG z1T2oZ3pjzpPRb8N8@8t7@KQUeh3h8m@jeuez^F^v83NEgIpg+p_DA=%AH^d)_-1q- zWLGB>6q&3L2moP>0-DXAhLg49ww2n2`!}c~R&_>X5sjxs; 
zxdU%x8GYvu{h?L3d2PbWf%$S;jm{3K@5?mIHNM~Qoc%E|N{})8c4WE95Sa9#T*1-O zg=e5w8rz`arlgPX+?{0M=#31cx>_aHion;!+{zeF%XHEhBiUV359BCtg;?vZ7_7CY zt`H6NCnEOGh2~0~C#(uFVkFUmuR_EU&!gtPx8`Le7aH_Cf@0*8)`$&F@-$Ij1q}Hi9x=HDCNVcAhc3u=|T&aQqIW1lRG18m=U%5%<)`Ro>R6wCOa2d=OA>}4^ z6oOZ9GSww-t&via@~m=op}u~?oel&P>5aRuK`00aEl3DF1urkj^5_^iZKq<-L9rMr zJ>eD(y4!e-dA*i8SUJ|FaK3awd#372)Z4|oZE3eKfq{nNoY6rpf5s>fLQwDGK^(e1v1fG_SN}|a+JL9I;HVAO%sy+|d zh#w{aus&5xprXff?chtm7wgH(kbhO2AQU z=P2sL9T%j`-0frQivt`UqhY-Ua`r=GcQJGUAut{=dJ_xqzN+gg2=uI>n2nE!hJ;LT zTT5|ytKsCPBvOJXMLT`xB$iuRRW+Y$t20)$clRp;*`ea-khXm>FDEe=eb=>d4qJC< zO^&BSkxlo-{D50omrC@>))iz9Q;b_^FAlV$A$@Wgwn(HRzWfHIT;0G^=ZmF021eD6 z;dJO58iv}3J560)@ub1vM{&E;n8|8saf%OsGB(UMjKV|og0R)*=9BcL>J+}E1UTRO zh+UC%{}DmIM1OVXkiG@S^vP6|*VJ!`%^(AT6T{h)@3OT*?_mI0u ze{++w1NpaSC9T>6!$*oK3=?*YkK8_++$|K?s7J|LYSYu_OO+Z4F7196sG6kz=5UrA zv!q+c--tzIjNYyO;0LB%TxX?tmMXe?*ewcM$oXcnpLq2};ftxuZ?-jE<>YUf`pP+H z`}@fFwv*~dXXm;d2%X{!CC#1oCCR?O1!p=B0hR6~8tM~eywLz|;ZCsjaa7H#%vT^v zgfhS=N+PNgc4sBz91e%3_4Sby-xKSLH&&d_X|-qwY@llr_)0do(}*#({ITvg zecu}TR6S+KK3UE&`fi?=j83v5trE`yG2QKNOVHO4=g&V#-q%mX6Pq6ZX8@ zp*f`^Vba}$@IcxJR&D)) zRG@Y6F}IjCGOpFjnr~g$!iFPaog|@4Q>00HoZJdp`xqMI1%<^1XB^bq+|NrbgWSjn z%u<%5_Fwm{jCtfhCfM^TCu=REc2i)@@;JSOBNYDB+?%+*vuW>TUrd&jZ^TT5)VD-xq-(G7 zAh-in5|cjyxboY5k&l^~IttsUCu&uZv|QQbrXl0`Us{s~8UQ2wRl+_%YAPT+2=CtB zjS~OpVMVz~eZvd;BrB{OQ9LEf-J~RA&g0sti-v)zO9dPO?R_$q5(`~7K?_Q6Q-At# zA7~zId?cS&quu2px8BQnUgbXT1Ct-mE{2?5`|#F}w`kDD%KhS%NdWCI_A&N_dqE=XHyaTJg(3OfIIHoM2bbDN8@!OFW}1 z+sz~8WWAE%dBFDi+1#+wsWoGZpgfs+t7| zYpVXqJ0j?zL@PwRfFcx6vO>Q$;o#b$cD%iEh)7{n@Fv&)wS1w(fIoMwA9i zNPm%7iosHgAFI3Fr>>YVA(Nn>wAJvkYe4EIydLJ}_Ew)xu@$zA<%r^gzI+(k(AeUd z3YqvC!cQx!LFtgj!eTGtF}%NWo`p@#Im#!-W8^?$GkyIbAYT0x+eUy_u@$T^&%EN? z$khi9#ML_<2|hTi9c20Gs_XEWrKffL$mNXl;BB87HTNx)S=>@!zuu6@#mvTMLeaMBYC$6WeCn7uttC$mXt%_V6qFM|>$wE#7#$8~8 zB*8dmLyqD?$D4Vs?(mA8NAAluHPJbUv89#X%^pt?iC{rNAU5ESPomLHs;_sF(!ZAK z_%y-OE>6!#p-UbnTtiAeO!%JU$*1A3R#Dmwfewrg2S!h!M+amsgCU_ymmbbwyMT@w zywb*Y2}Mx%@nsn834f-auVYEs_qxRnLj$M-#%wv9odE782nK%uQ6gPBc}-^*2a};J zoJ?Ggn?(o)5N2nAtFX8bUMapH;;44b8UmmQHm@AodolDh)x?=E$9N|x*%U54D&@dW zRm!dDyHcuEHL`e-g?-Nk06E_>2&2q&`|{2CD0|}UrF1nxbj+89v9WKGFxqxW=Vgf(U4?j zMftHiHGlbooKwVXyYYv!EwTHqAEHAgEm8Kpf@eb@_0igJUNLqm@j1vwC{C%I`_e!# z;vr*tNgC1&i1`j^xK_K%CKL=Bjvb~KH(qU~{;41C^#mvm(Ks`iA3lYaISE69}NrV!bp+qvNvpbY+zKJ2h#l7&wFu# zT|hCiz)a{4e$^c>wuy#8U$GCKa1MBKA1+RC)jD~|dZcoX(@u@7{Om*Jk@ z<#1^YS+ls*XsM$>4m}NkcKF??j{w=klT2b4x^3s4zDeA==Q&Oh3WRnM=NWtoN7i<`zlG5?99Rj>~?UZRX3NnOGn0mW9p`_Zz7Ab$c2f z3OXOmv%%JcBwCVfT4c!Sl4#s|W+B*vbUi^OTx|u@ zBU+lD(6K5e}lS|uLaYm zI*Y_a?3kh2*nDsEX@S*fD%U&3^E!%I3&J_yK%f}&bBMn6@Il{;9Eu5_dEG!y(Jj8b zRf<#*m4`;#b5B1~U4%{r9zLP`Y+d7Y^TH(bH3b4m6wbF0qGkCii*#3VPm-i|kL+wn zcxiw!Nns^Qu7W4o38P5NcJfyTK1QCWa;nLu-m-E^dUsef?=%8h~ALFrCPPe}1ggO{+H!5kRgD#MqT!l+iLWVw`Nz4WkWy@TUruKM`F)Z_c^ zb0I*pVhs?GS}q{>u5 zWB@Eb2n0g%7yca-zz{#$j&BdwRD#H+sV6H*t^2!?eg~;+i09pQRFvQrtMEyCufn+j zi@|RoswLgM$GQpy#VzcHEb|7&d#paWMf%QTRDvY3Wmic*#c`?Q3DL<5& zy@l8x9uC#AVDy>+aqz5f7-Cr-ibir1yF|OB;T>m-I}Ituax=@g%mlMYdJVTND{1HD z_e^vFSe-<2d697uAXa6_dLuYNsje1C2pM=oJ+eX|ER+f)a>~k}%cuE(PstK|V#i}GpePdHFJ6#9VEy)ZOnrWmMXISO!K8#V5q=TUB<2Jkw-3$ro@ez&DJN{X?X`LxU~iU#fh|K+#q98fP3R^ zFBPO$9|VPI*30-m)@e^N%Hk~rQKV8SgKmqx{tj>Tzw5I#j!^04Z3VM;0pLv53i>Re zktXmx%7}eVB*qXibXX3Mk=$#q^gv*b!$oz9R!a)OkG%$V8EXxXEx`6TQa>dbQ7Y(~ z=dqfOv+sa^o7DzB6!Uw;|Hu z;sdJV#zUadSeDWv7KMJQuDEU}Bz>KBTxeI`)-5O>cKtGZYmC*(FSK>? 
z>z*2$Y`wDX(fFtsx=^wNp9BRjI;LH}IMg&G-6nW_!dmms^VC$KIbGG6R55A7`>3`< z9}ar9#0_dmlCgd`&R92VJU!&WQ+|4e2V{b`EFrO-r1keamG^yBEfL;&OM7aQPc84R zn;H%;1+F`wRx!jJYHhRxR}^YthR1ndLbD7CLMb4$O|1>dgQUHVn$fz>oA51^Lw}wL zlx@eX7+y);5D;=Nczt3*+fBR4_ftbnL&Orn*Cj&*V)L|wfCYrgy2dvot)ru?ahIFO zy@F>B4ipf2kTNk&2FBA;Ew-00g7n7N&*sy@?X#`ZqRf%G;xmBqBG`5AHgF(Q zoeXB9(Nn7!NS`zqDF^jbI!0(uZwasJge0l7l&#$4xNL5Kjn(4A^LoYCG7=P(uiXkw zRVis~NtZ~b*K1z1X~ui`W;q!pDC5&+tGYk)&J@>9iVLY4 zgcks0A=!#+ZfG>?=d%Uo36++?x>9t)rX&t1D%kQ+b8RRT_A%4}CzzaEu$O)+p-cGI zn5NPothT8oNuID9ZbSVOY+ryfgY1X5g`AqpA785Q89bhD1uJn4enEEE4L;~KLpTYa z97t08u2B%Wm&$sBn`CBT?)9g-FhrwOU5wyuTR0wQQ0}!%)X%V}X|2`fbonk)^}7OX ztH*Eh)kWsPn$YcUH~<4QunxFol(d(|A}L-^1v%vQ=XUZ(TU*Ag8T6fk-zm=F3mMxVgoN{!4^$`!30m+2Gj3h~IbRH#Qq=~~Ptb{W5p>T%!KGIB=T zK-EBD0$KfW7whESOuMnOpuAZ<{npY`R%wP7OJ+?p8cz+>L$P@j(Qjye*mJVLQFVLW ziwv(kpxC-Yyw$k@VPAKvx$yeP*dLtJGm*Z}wL+v%47&;!S-`ROT%_F6VxFJJrO;-?Yb} zi(ksL^&P<3=iY*e*ZKTm`bU_0A!f8m7xZ^HYyk+7P6lTvtN#-)_%IM7&-y{4U zY8Phv^PCsfJv%Gt>E|;VI$7j6!BmsIyXqo z0l_kpT!MetC%BLq-bdHWJUDTNO=bEUNTunE+lf_1T)LMHn%!Goi zFj;UC8eOp13o}m|Q`jleTiDc&1+5AreWGea^*g_25n(YBpM{r}eXI|7v#**ZOCu>* zq(+E@S)7IF1@aa}1-~OBVIZyFl+1f~@#L4?#U(`z%$a{ygJrj?D*PlfmJ{kRNL!Cg zQ7&8KA^uF=T3;7t6r>#EVXtq-tA46lv#J^N$qGfxPrN`Ku*=}W-qW6$YvQtHI(|_pyuxg+>g_64 z3}F+_tCMdiX~#=-<>enrMhfoY9eAgH@L9Ve(W^L$kBtPy5lXfhB6EID+~zkmvX=4{ zo`>Wa&dIWBSGRv4mH9ygStp}4BMHi#wU!_-c}ZaGVUrCZc}}$GQ{ha_lcehT0)f82 z;T&ZqR*57D0%3sgCIAx>_}NBd1Lj_f3@HzEuScw%&OO-d3l+-4Ga*dQnvQkUm|1XI%5c><_@q45uVA++_cDoJKPA;jdY+)QO_m~=3Djn~t$4l_FGjm|{xLxJ(1ZvL z^*T*+LQN9Q3O21%?}h%+F<KgZJbVLsai@@@BKxIo}4 zPxQRkI$x))ICW5`O4jbt>oFK^QCD(PkO&Gls;Oeftnwc2|*42icZ0dDTz>Ly!<7C_{`1d3}i3KKzufB@DB)jZ|(TzGcF)J!5I)ygzY4VSW8Spt9sOB zr{ZXfV7`m{D}W8*|M!5hGMWi21wE%+#i3XiCgA1U-ST9Al{Q!ppK(ZK9n~k7I&B@@ zpQlB}aScX6rP9TydPB)ibQq*Um;*j80(~G0oHc1C>k+)3mM^iJ=)Ev^yY{_(W6J&! z?o9hsP?y?mWg`s-B`WHN;qt}GNk(R&hukhO6>};DBjO5dZ0=To0y`4ru6N2QA_qd^ z6V_f-`9Y@egTa1qjK!zM_lHSqGLwtV2|tniIYGoo?1|YsDO5EEJ8qo%&xcx*!1Ik& zQJEV47^^yfvi#*}{NMgZPzWYMqA#d(bQdR?kXYq~ML*qzvnVvGiRaQf-Y0U}^Rx4i z)B5a-AJbEA0NT_o%PjBjND`E98djqtp zqcPVe&bo;&>4P04aEv4b+YHRqqYN}`{m;<*UZjwfzw%&=W{8kw-rmKk6qCXz1pc^F zp%zva2pvRKWc|16Cgc8uj^(t& zE-Jzwvx}-fe3{J%7zfafIx|HIeNvlBiltA_>9)iKqkDOrmJS zJOwvh){2K3&Uc(VhpfsFKv1fFI%X;|W#O@O=)X%AxGoPev)G*LE?&C(y!uvQRXaYS zC)oE=XawUs26)5Y+gIQPFi(@dsNfSzcLPWunPhcmJX(4#NfsL`iI2$i?aOa4C2lmw9ErVQmk6UeO z%%szIg&x7h9t<>Mq