From 472ec60f9ec14a74403bd109145c65b85669da59 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 21:20:42 +0000 Subject: [PATCH 01/67] nvls barely works now --- nvls/README | 2 + nvls/align.h | 47 ++ nvls/alloc.h | 270 +++++++++++ nvls/argcheck.h | 16 + nvls/bootstrap.h | 32 ++ nvls/channel.h | 48 ++ nvls/checks.h | 160 +++++++ nvls/coll_net.h | 35 ++ nvls/collectives.h | 48 ++ nvls/comm.h | 473 +++++++++++++++++++ nvls/core.h | 41 ++ nvls/cpuset.h | 61 +++ nvls/cudawrap.h | 129 ++++++ nvls/debug.h | 48 ++ nvls/device.h | 463 +++++++++++++++++++ nvls/enqueue.h | 26 ++ nvls/gdrwrap.h | 252 +++++++++++ nvls/graph.h | 116 +++++ nvls/group.h | 137 ++++++ nvls/ibvcore.h | 1058 +++++++++++++++++++++++++++++++++++++++++++ nvls/ibvsymbols.h | 46 ++ nvls/ibvwrap.h | 92 ++++ nvls/info.h | 134 ++++++ nvls/ipcsocket.cc | 232 ++++++++++ nvls/ipcsocket.h | 38 ++ nvls/nccl_common.h | 33 ++ nvls/nccl_net.h | 333 ++++++++++++++ nvls/nccl_tuner.h | 55 +++ nvls/net.h | 27 ++ nvls/net_device.h | 29 ++ nvls/nvmlwrap.h | 214 +++++++++ nvls/nvtx.h | 85 ++++ nvls/p2p.h | 29 ++ nvls/param.h | 30 ++ nvls/profiler.h | 37 ++ nvls/proxy.h | 296 ++++++++++++ nvls/shm.h | 25 + nvls/socket.h | 97 ++++ nvls/strongstream.h | 140 ++++++ nvls/test.cu | 172 +++++++ nvls/test2.cpp | 143 ++++++ nvls/timer.h | 60 +++ nvls/transport.h | 128 ++++++ nvls/trees.h | 13 + nvls/tuner.h | 22 + nvls/utils.h | 524 +++++++++++++++++++++ 46 files changed, 6496 insertions(+) create mode 100644 nvls/README create mode 100644 nvls/align.h create mode 100644 nvls/alloc.h create mode 100644 nvls/argcheck.h create mode 100644 nvls/bootstrap.h create mode 100644 nvls/channel.h create mode 100644 nvls/checks.h create mode 100644 nvls/coll_net.h create mode 100644 nvls/collectives.h create mode 100644 nvls/comm.h create mode 100644 nvls/core.h create mode 100644 nvls/cpuset.h create mode 100644 nvls/cudawrap.h create mode 100644 nvls/debug.h create mode 100644 nvls/device.h create mode 100644 nvls/enqueue.h create mode 100644 nvls/gdrwrap.h create mode 100644 nvls/graph.h create mode 100644 nvls/group.h create mode 100644 nvls/ibvcore.h create mode 100644 nvls/ibvsymbols.h create mode 100644 nvls/ibvwrap.h create mode 100644 nvls/info.h create mode 100644 nvls/ipcsocket.cc create mode 100644 nvls/ipcsocket.h create mode 100644 nvls/nccl_common.h create mode 100644 nvls/nccl_net.h create mode 100644 nvls/nccl_tuner.h create mode 100644 nvls/net.h create mode 100644 nvls/net_device.h create mode 100644 nvls/nvmlwrap.h create mode 100644 nvls/nvtx.h create mode 100644 nvls/p2p.h create mode 100644 nvls/param.h create mode 100644 nvls/profiler.h create mode 100644 nvls/proxy.h create mode 100644 nvls/shm.h create mode 100644 nvls/socket.h create mode 100644 nvls/strongstream.h create mode 100644 nvls/test.cu create mode 100644 nvls/test2.cpp create mode 100644 nvls/timer.h create mode 100644 nvls/transport.h create mode 100644 nvls/trees.h create mode 100644 nvls/tuner.h create mode 100644 nvls/utils.h diff --git a/nvls/README b/nvls/README new file mode 100644 index 000000000..c385affc4 --- /dev/null +++ b/nvls/README @@ -0,0 +1,2 @@ +nvcc -I/usr/lib/x86_64-linux-gnu/openmpi/include -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -L/usr/lib/x86_64-linux-gnu/openmpi/lib -L /usr/local/cuda/lib64/ -lmpi_cxx -lmpi -lcupti -lcupti_static test.cu -gencode arch=compute_90,code=sm_90 -lcuda -lcudart -lnccl + diff --git a/nvls/align.h b/nvls/align.h new file mode 100644 index 000000000..2a71dd1bc --- /dev/null +++ b/nvls/align.h @@ -0,0 +1,47 
@@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALIGN_H_ +#define NCCL_ALIGN_H_ + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) + +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_POWER(x, y) \ + ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +#if !__CUDA_ARCH__ + #ifndef __host__ + #define __host__ + #endif + #ifndef __device__ + #define __device__ + #endif +#endif + +template +__host__ __device__ constexpr Z divUp(X x, Y y) { + return (x+y-1)/y; +} + +template +__host__ __device__ constexpr Z roundUp(X x, Y y) { + return (x+y-1) - (x+y-1)%y; +} + +// assumes second argument is a power of 2 +template +__host__ __device__ constexpr Z alignUp(X x, int a) { + return (x+a-1) & Z(-a); +} + +#endif diff --git a/nvls/alloc.h b/nvls/alloc.h new file mode 100644 index 000000000..f8d954469 --- /dev/null +++ b/nvls/alloc.h @@ -0,0 +1,270 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include "align.h" +#include "utils.h" +#include "p2p.h" +#include +#include +#include +#include + +uint64_t clockNano(); // from utils.h with which we have a circular dependency + +template +ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); + memset(*ptr, 0, nelem*sizeof(T)); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(cudaFreeHost(ptr)); + return ncclSuccess; +} + +template +ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} +#define ncclCalloc(...) 
ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { + if (nelem < oldNelem) return ncclInternalError; + if (nelem == oldNelem) return ncclSuccess; + + T* oldp = *ptr; + T* p = (T*)malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memcpy(p, oldp, oldNelem*sizeof(T)); + free(oldp); + memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); + *ptr = (T*)p; + INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); + return ncclSuccess; +} + +#if CUDART_VERSION >= 11030 + +#include +#include "cudawrap.h" + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { + ncclResult_t result = ncclSuccess; + size_t granularity = 0; + CUdevice currentDev; + CUmemAllocationProp prop = {}; + CUmemAccessDesc accessDesc = {}; + CUmemGenericAllocationHandle handle; + int cudaDev; + int flag = 0; + CUDACHECK(cudaGetDevice(&cudaDev)); + CUCHECK(cuDeviceGet(¤tDev, cudaDev)); + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported + prop.location.id = currentDev; + // Query device to see if RDMA support is available + CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); + if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; + CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + ALIGN_SIZE(size, granularity); + /* Allocate the physical memory on the device */ + CUCHECK(cuMemCreate(&handle, size, &prop, 0)); + /* Reserve a virtual address range */ + CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); + /* Map the virtual address range to the physical allocation */ + CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); + /* Now allow RW access to the newly mapped memory */ + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = currentDev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); + if (handlep) *handlep = handle; + TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); + return result; +} + +static inline ncclResult_t ncclCuMemFree(void *ptr) { + if (ptr == NULL) return ncclSuccess; + ncclResult_t result = ncclSuccess; + CUmemGenericAllocationHandle handle; + size_t size = 0; + CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); + TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); + CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); + CUCHECK(cuMemRelease(handle)); + CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); + return result; +} + +#else + +extern int ncclCuMemEnable(); + +static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} +static inline ncclResult_t ncclCuMemFree(void *ptr) { + WARN("CUMEM not supported prior to CUDA 11.3"); + return ncclInternalError; +} + +#endif + +template +ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = 
cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. + cudaStream_t stream; + CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaCalloc(...) ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + *ptr = nullptr; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); + } else { + CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); + } + CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); + INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); + return result; +} +#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) + +template +ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + // Need a side stream so as not to interfere with graph capture. 
+ cudaStream_t stream; + CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); + NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); + CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); + CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +template +ncclResult_t ncclCudaFree(T* ptr) { + ncclResult_t result = ncclSuccess; + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; + TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + if (ncclCuMemEnable()) { + NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); + } else { + CUDACHECKGOTO(cudaFree(ptr), result, finish); + } +finish: + CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); + return result; +} + +// Allocate memory to be potentially ibv_reg_mr'd. This needs to be +// allocated on separate pages as those pages will be marked DONTFORK +// and if they are shared, that could cause a crash in a child process +inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { + size_t page_size = sysconf(_SC_PAGESIZE); + void* p; + int size_aligned = ROUNDUP(size, page_size); + int ret = posix_memalign(&p, page_size, size_aligned); + if (ret != 0) return ncclSystemError; + memset(p, 0, size); + *ptr = p; + INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); + return ncclSuccess; +} +#define ncclIbMalloc(...) ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) + +#endif diff --git a/nvls/argcheck.h b/nvls/argcheck.h new file mode 100644 index 000000000..8d8b74e8e --- /dev/null +++ b/nvls/argcheck.h @@ -0,0 +1,16 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" +#include "info.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/nvls/bootstrap.h b/nvls/bootstrap.h new file mode 100644 index 000000000..400a479fb --- /dev/null +++ b/nvls/bootstrap.h @@ -0,0 +1,32 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_BOOTSTRAP_H_ +#define NCCL_BOOTSTRAP_H_ + +#include "nccl.h" +#include "comm.h" + +struct ncclBootstrapHandle { + uint64_t magic; + union ncclSocketAddress addr; +}; +static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); + +ncclResult_t bootstrapNetInit(); +ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); +ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); +ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); +ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); +ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); +ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); +ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); +ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); +ncclResult_t bootstrapClose(void* commState); +ncclResult_t bootstrapAbort(void* commState); +#endif diff --git a/nvls/channel.h b/nvls/channel.h new file mode 100644 index 000000000..adc38749a --- /dev/null +++ b/nvls/channel.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "comm.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); +static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { + int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; + int peerNode = comm->rankToNode[peer]; + int peerIndex = comm->rankToLocalRank[peer]; + int nsteps = comm->maxLocalRanks; + int rankIndex = comm->rankToLocalRank[comm->rank]; + int step, delta; + if (coll == ncclFuncSend) { + step = (nsteps + peerIndex - rankIndex)%nsteps; + delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; + } else if (coll == ncclFuncRecv) { + step = (nsteps + rankIndex - peerIndex)%nsteps; + delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; + } else { + return ncclInternalError; + } + *channelBase = comm->nNodes > 1 ? 
delta+(step/p2pGroupSize) : step; + return ncclSuccess; +} + +static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { + //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; + *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; + return ncclSuccess; +} + +static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { + int base; + NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); + NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); + return ncclSuccess; +} + +#endif diff --git a/nvls/checks.h b/nvls/checks.h new file mode 100644 index 000000000..c9fd16176 --- /dev/null +++ b/nvls/checks.h @@ -0,0 +1,160 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ + +#include "debug.h" + +// Check CUDA RT calls +#define CUDACHECK(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, RES, label) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ + RES = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUDACHECKIGNORE(cmd) do { \ + cudaError_t err = cmd; \ + if( err != cudaSuccess ) { \ + INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ + (void) cudaGetLastError(); \ + } \ +} while(false) + +#include +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +#define SYSCHECKGOTO(statement, RES, label) do { \ + if ((statement) == -1) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +#define NEQCHECK(statement, value) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (0); + +#define NEQCHECKGOTO(statement, value, RES, label) do { \ + if ((statement) != value) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +#define EQCHECK(statement, value) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ + return ncclSystemError; \ + 
} \ +} while (0); + +#define EQCHECKGOTO(statement, value, RES, label) do { \ + if ((statement) == value) { \ + /* Print the back trace*/ \ + RES = ncclSystemError; \ + INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ + goto label; \ + } \ +} while (0); + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + /* Print the back trace*/ \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + return RES; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, RES, label) do { \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + /* Print the back trace*/ \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + goto label; \ + } \ +} while (0); + +#define NCCLWAIT(call, cond, abortFlagPtr) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + ncclResult_t RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + return ncclInternalError; \ + } \ + if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ +} while (!(cond)); + +#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ + volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ + RES = call; \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ + goto label; \ + } \ + if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ +} while (!(cond)); + +#define NCCLCHECKTHREAD(a, args) do { \ + if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ + return args; \ + } \ +} while(0) + +#define CUDACHECKTHREAD(a) do { \ + if ((a) != cudaSuccess) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#endif diff --git a/nvls/coll_net.h b/nvls/coll_net.h new file mode 100644 index 000000000..f4b540866 --- /dev/null +++ b/nvls/coll_net.h @@ -0,0 +1,35 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef COLL_NET_H_ +#define COLL_NET_H_ + +#include "nccl.h" +#include "nccl_net.h" + +typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; + +// Translation to external API +static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } +static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } +static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } +static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } +static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } +/* DMA-BUF support */ +static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } +static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } +static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { + NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } +static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } +static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } + +static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } + +#endif diff --git a/nvls/collectives.h b/nvls/collectives.h new file mode 100644 index 000000000..0f965276a --- /dev/null +++ b/nvls/collectives.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COLLECTIVES_H_ +#define NCCL_COLLECTIVES_H_ + +#include "nccl.h" + +// CHUNKSIZE must be a multiple of SLICESIZE +#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) +#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) +#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) +#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) +#define BROADCAST_SLICESTEPS 1 +#define BROADCAST_CHUNKSTEPS 1 +#define REDUCE_SLICESTEPS 1 +#define REDUCE_CHUNKSTEPS 1 +#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above + +inline int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} + +#endif diff --git a/nvls/comm.h b/nvls/comm.h new file mode 100644 index 000000000..328ffef3b --- /dev/null +++ b/nvls/comm.h @@ -0,0 +1,473 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#include "transport.h" +#include "p2p.h" +#include "collectives.h" +#include "nccl_tuner.h" +#include "proxy.h" +#include "strongstream.h" +#include "nccl_net.h" + +#if CUDART_VERSION < 9000 +struct cudaLaunchParams { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +}; +#endif + +#define CACHE_LINE_SIZE 128 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL + +// Channels / LL tuning +#define NCCL_LL_THREAD_THRESHOLD 8 +#define NCCL_LL128_THREAD_THRESHOLD 8 +#define NCCL_SIMPLE_THREAD_THRESHOLD 64 + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + uint64_t redOpArgExchange[2]; + char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; + int offsFifo[NCCL_STEPS]; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + int offsFifo[NCCL_STEPS]; + int flush; // For GDRCopy-based flush + }; + char pad4[MEM_ALIGN]; + }; +}; + +enum helperThreadState {ThreadStart, ThreadStop}; + +#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) + +struct ncclGraphHelperResources { + ncclComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + enum helperThreadState threadState; + void* ipcBases[NCCL_IPC_POOL_SIZE]; + int ipcTail; + int ipcHead; +}; + +struct ncclUserRedOp { + int freeNext; // -1=allocated, otherwise index of next free entry in array + ncclDataType_t datatype; + ncclDevRedOpFull opFull; +}; + +struct ncclNodeRanks { + int localRanks; + int* localRankToRank; +}; + +struct ncclDestructor { + struct ncclDestructor* next; + void* obj; + ncclResult_t(*fn)(struct ncclDestructor* me); +}; + +struct ncclCommCallback { + struct ncclCommCallback* next; + ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); +}; + +struct 
ncclSharedResources { + int refCount; + struct ncclComm* owner; /* comm which creates this shared res. */ + struct ncclChannelPeer* peers[MAXCHANNELS]; + struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; + /* P2P operation counter, one per channel */ + uint64_t p2pOpCount[MAXCHANNELS]; + /* Collective operation counter */ + uint64_t collOpCount; + int tpNRanks; + int tpNLocalRanks; + int tpNChannels; + int tpP2pNChannels; + int tpP2pChunkSize; + uint64_t magic; + + // top parent rank to localRank translation table + int* tpRankToLocalRank; + // Internal streams + struct ncclStrongStream deviceStream, hostStream; + + /* proxy related shared res */ + struct ncclProxyState* proxyState; +}; + +struct ncclChannel { + struct ncclChannelPeer** peers; + struct ncclDevChannelPeer** devPeers; + /* devPeer pointer array used for host side access */ + struct ncclDevChannelPeer** devPeersHostPtr; + struct ncclRing ring; + int* devRingUserRanks; + struct ncclTree tree; + + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; + + struct ncclNvls nvls; + + int id; // index of this channel + uint32_t workFifoSent; // last used work index+1 + + /* comm split sharable resources */ + struct ncclChannelPeer* collnetPeers; + struct ncclDevChannelPeer* collnetDevPeers; + struct ncclChannelPeer* nvlsPeers; + struct ncclDevChannelPeer* nvlsDevPeers; +}; + +struct ncclWorkList { + struct ncclWorkList* next; + struct ncclWork work; +}; + +struct ncclPointerList { + struct ncclPointerList* next; + void *ptr; +}; + +struct ncclNvlsMcHandleList { + struct ncclNvlsMcHandleList *next; + CUmemGenericAllocationHandle mcHandle; + CUdeviceptr ptr; + int dev; + size_t size; +}; + +struct ncclKernelPlan { + // A kernel plan is also a callback that reclaims itself. Hence this must + // be the first member. + struct ncclCommCallback reclaimer; + struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup + + struct ncclComm* comm; + struct ncclKernelPlan* next; + + bool persistent; // aka captured in a graph + bool kernelSpecialized; + void *kernelFn; + int channelUbound; // only channels c < channelUbound are present + int channelCount; // number of channels present + uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) + bool hasProxyOps; // does any channel have a non-empty proxyOpQueue + int threadPerBlock; + // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() + struct ncclWork* workHead; + + int collOpCount; // zero based for this plan + + struct ncclIntruQueue ipcMemQueue; + struct ncclIntruQueue nvlsMcHandleQueue; + + struct Channel { + int nWork; + union { + int nWorkElem; // used for coll and reg coll + int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 + }; + size_t collBytes; + struct ncclIntruQueue workQueue; + struct ncclIntruQueue proxyOpQueue; + } channels[MAXCHANNELS]; +}; + +struct ncclRegRequest { + uintptr_t buff; + size_t size; + struct ncclRegRequest *next; +}; + +struct ncclRegRecord { + uintptr_t buff; + size_t size; + CUdeviceptr regAddr; + size_t regSize; + int dev; + CUmemGenericAllocationHandle mcHandle; + uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */ + struct ncclRegRecord *next; +}; + +struct ncclComm { + struct ncclMemoryStack memPermanent, memScoped; + // List of destructors to run when comm is destructed + struct ncclDestructor* destructorHead; + + struct ncclSharedResources* sharedRes; + /* map to top parent ranks. 
*/ + int* topParentRanks; + int* topParentLocalRanks; + struct ncclChannel channels[MAXCHANNELS]; + struct ncclPeerInfo* peerInfo; + struct ncclTopoSystem* topo; + + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; + void* bootstrap; + // Bitmasks for ncclTransportP2pSetup + uint64_t* connectSend; + uint64_t* connectRecv; + + uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. + + uint64_t commHash; + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int nvmlDev; // my nvml device index + int compCap; // compute capability of the GPU + int minCompCap, maxCompCap; // min/max compute capability in the communicator + int64_t busId; // my PCI bus ID in int format + cpu_set_t cpuAffinity; // CPU affinity of the GPU + int cudaArch; // matches __CUDA_ARCH__ of device + + int node; + int nNodes; + int localRank; + int localRanks; + int maxLocalRanks; + int* rankToNode; + int* rankToLocalRank; + int* localRankToRank; + // localRanks and localRanktoRank for all nodes + struct ncclNodeRanks* nodeRanks; + + bool checkPointers; + bool dmaBufSupport; + + // Counter for tracking CUDA launches (P2P and collectives included) + uint64_t opCount; + + // Channels for collectives + int nChannels; + int nvlsChannels; + int collNetChannels; + // Channels (per peer) for p2p + int p2pnChannels; + int p2pnChannelsPerPeer; + int p2pChannels[MAXCHANNELS]; + + // Should this comm allocate LL buffers for network P2P connections? + bool allocP2pNetLLBuffers; + + // Buffer sizes + int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pChunkSize; + + // Algorithm/Protocols thresholds + ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; + int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + + /* This attribute can indicate the states of communicators and return code of + * asynchronous NCCL operations. */ + ncclResult_t asyncResult; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile uint32_t *childAbortFlag; + uint32_t *abortFlagRefCount; + + // Device side of the communicator (for cudaFree's) + struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm + + // Operation pool. + int workFifoDepth; // size of workFifoHeap[], power of 2 + struct ncclWork* workFifoHeap; + struct ncclWork* devWorkFifoHeap; + void* workFifoHeapGdrHandle; + + // Work completion notificaion + uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory + uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. + uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
+ + // Intra-process sync + struct ncclComm* intraComm0; // leader of intra-process comms (self possible) + struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head + int intraRank; + int intraRanks; + uint32_t intraBarrierPhase; + char intraPad1[64 - sizeof(uint64_t)]; + uint64_t intraBarrierCounter; // only used if this is intraComm0 + char intraPad2[64 - sizeof(uint64_t)]; + uint64_t intraBarrierGate; // only used if this is intraComm0 + + struct ncclProxyState* proxyState; + int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ + // Whether this communicator uses collNet + int collNetSupport; + uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; + int intraHighestTransportType; + int* collNetHeads; + int collNetHeadsNum; + /* sharable collNet proxy progress resource. */ + struct ncclCollNetSharedRes* collNetSharedRes; + + // NVLink SHARP (NVLS) support + int nvlsSupport; + int nvlsRegSupport; + /* sharable NVLS resource. */ + struct ncclNvlsSharedRes* nvlsResources; + + ssize_t channelSize; // User requested work size (bytes) for channel partitions + + // pools backed by comm->memPermanent + struct ncclMemoryPool memPool_ncclProxyOp; + struct ncclMemoryPool memPool_ncclKernelPlan; + struct ncclMemoryPool memPool_ncclPointerList; + struct ncclMemoryPool memPool_ncclNvlsHandleList; + // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when + // this comm is not yet in a group. + struct ncclComm* groupNext; + // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. + struct ncclComm* preconnectNext; + int persistentRefs; // number of persistent plan-lists capturing this comm + struct ncclTasks tasks; + + // user-created reduction ops + int userRedOpCapacity, userRedOpFreeHead; + ncclUserRedOp *userRedOps; + + // Queue of things for the main thread to do + struct ncclIntruQueueMpsc callbackQueue; + + // List of kernel plans built form tasks. + struct ncclIntruQueue planQueue; + // First of the unlaunched kernels in `planQueue` + struct ncclKernelPlan* unlaunchedPlansHead; + + ncclConfig_t config; + // initState is to more conveniently reclaim resources when errors happen. 
+ ncclResult_t initState; + // flag to indicate if ncclCommFinalize() is called + bool finalizeCalled; + // shared structures for finalization + int finalizeRankCnt; + // group job to support multi-thread FT + struct ncclGroupJob *groupJob; + + /* store to buffer register request */ + struct ncclIntruQueue regRequestQueue; + /* store registered buffer */ + struct ncclIntruQueue regRecordQueue; + + // Tuning plugin + ncclTuner_t* tuner; +}; + +enum ncclLaunchMode { + ncclLaunchModeInvalid=0, + ncclLaunchModeParallel, + ncclLaunchModeGroup +}; +extern enum ncclLaunchMode ncclParamLaunchMode; + +void ncclCommPushFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); +void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); + +inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { + ncclResult_t result = ncclSuccess; + struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); + while (cb != nullptr) { + struct ncclCommCallback* next = cb->next; + ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb + if (res1 != ncclSuccess) result = res1; + cb = next; + } + NCCLCHECK(result); + return ncclSuccess; +} + +inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { + int phase = comm->intraBarrierPhase; + if (comm->intraRanks == 1) { + // Release everyone (just me). + comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); + } else { + struct ncclComm* comm0 = comm->intraComm0; + uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); + if (uint32_t(count) == uint32_t(comm->intraRanks)) { + // Reset. + __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); + // Release everyone. + __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); + } + } +} + +// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) +inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { + struct ncclComm* comm0 = comm->intraComm0; + comm->intraBarrierPhase ^= 1; + uint32_t phase = comm->intraBarrierPhase; + uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + if ((gate & 1) != phase) { + uint64_t t0 = clockNano(); + do { + // Spin vigorously for first 5us. + if (clockNano()-t0 >= 5*1000) sched_yield(); + gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); + } while ((gate & 1) != phase); + } + if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); + return gate>>32; +} + +// Scrambles the bits of non-builtin values of ncclRedOp_t according to the +// communicator memory address. Used to catch bugs so that integer handles +// associated with this communicator won't collide with handles of other +// communicatrs. This function is its own inverse. +static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { + // Preserve the built-in values. + if(int(op) < int(ncclNumOps)) + return op; + uint64_t h = reinterpret_cast(comm); + h ^= h >> 32; + h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant + h >>= 32; // h is now an excellent 32-bit hash of the comm pointer + h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 + int op1 = int(h) ^ int(op); + // Since builtin values are preserved, we also have to preserve their preimage. + return op1 < int(ncclNumOps) ? 
op : ncclRedOp_t(op1); +} + +ncclResult_t ncclCommEnsureReady(ncclComm_t comm); +ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); + +#endif diff --git a/nvls/core.h b/nvls/core.h new file mode 100644 index 000000000..a1754beeb --- /dev/null +++ b/nvls/core.h @@ -0,0 +1,41 @@ +/************************************************************************* + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CORE_H_ +#define NCCL_CORE_H_ + +#include +#include +#include +#include +#include // For std::min/std::max +#include "nccl.h" + +#ifdef PROFAPI +#define NCCL_API(ret, func, args...) \ + __attribute__ ((visibility("default"))) \ + __attribute__ ((alias(#func))) \ + ret p##func (args); \ + extern "C" \ + __attribute__ ((visibility("default"))) \ + __attribute__ ((weak)) \ + ret func(args) +#else +#define NCCL_API(ret, func, args...) \ + extern "C" \ + __attribute__ ((visibility("default"))) \ + ret func(args) +#endif // end PROFAPI + +#include "debug.h" +#include "checks.h" +#include "cudawrap.h" +#include "alloc.h" +#include "utils.h" +#include "param.h" +#include "nvtx.h" + +#endif // end include guard diff --git a/nvls/cpuset.h b/nvls/cpuset.h new file mode 100644 index 000000000..ec55cbc54 --- /dev/null +++ b/nvls/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/nvls/cudawrap.h b/nvls/cudawrap.h new file mode 100644 index 000000000..cc363c1ac --- /dev/null +++ b/nvls/cudawrap.h @@ -0,0 +1,129 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CUDAWRAP_H_ +#define NCCL_CUDAWRAP_H_ + +#include +#include +#include "checks.h" + +// Is cuMem API usage enabled +extern int ncclCuMemEnable(); + +#if CUDART_VERSION >= 11030 +#include +#else +typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +#endif + +#define CUPFN(symbol) pfn_##symbol + +// Check CUDA PFN driver calls +#define CUCHECK(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUCHECKGOTO(cmd, res, label) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + WARN("Cuda failure %d '%s'", err, errStr); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +// Report failure but clear error and continue +#define CUCHECKIGNORE(cmd) do { \ + CUresult err = pfn_##cmd; \ + if( err != CUDA_SUCCESS ) { \ + const char *errStr; \ + (void) pfn_cuGetErrorString(err, &errStr); \ + INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ + } \ +} while(false) + +#define CUCHECKTHREAD(cmd, args) do { \ + CUresult err = pfn_##cmd; \ + if (err != CUDA_SUCCESS) { \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ + args->ret = ncclUnhandledCudaError; \ + return args; \ + } \ +} while(0) + +#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol + +#if CUDART_VERSION >= 11030 +/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ +DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); +DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); +DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); +DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); +DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); +// cuMem API support +DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); +DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); +DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); +#if CUDA_VERSION >= 11070 +DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support +#endif +#if CUDA_VERSION >= 12010 +/* NVSwitch Multicast support */ +DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); 
+DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); +DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); +#endif +#endif + +/* CUDA Driver functions loaded with dlsym() */ +DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); +DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); +DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); + + +ncclResult_t ncclCudaLibraryInit(void); + +extern int ncclCudaDriverVersionCache; +extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() + +inline ncclResult_t ncclCudaDriverVersion(int* driver) { + int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); + if (version == -1) { + CUDACHECK(cudaDriverGetVersion(&version)); + __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); + } + *driver = version; + return ncclSuccess; +} +#endif diff --git a/nvls/debug.h b/nvls/debug.h new file mode 100644 index 000000000..d10217856 --- /dev/null +++ b/nvls/debug.h @@ -0,0 +1,48 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_DEBUG_H_ +#define NCCL_INT_DEBUG_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include +#include +#include + +#include +#include +#include + +// Conform to pthread and NVTX standard +#define NCCL_THREAD_NAMELEN 16 + +extern int ncclDebugLevel; +extern uint64_t ncclDebugMask; +extern pthread_mutex_t ncclDebugLock; +extern FILE *ncclDebugFile; +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); + +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); + +// Let code temporarily downgrade WARN into INFO +extern thread_local int ncclDebugNoWarn; +extern char ncclLastError[]; + +#define WARN(...) printf(__VA_ARGS__) +#define INFO(FLAGS, ...) printf(__VA_ARGS__) +#define TRACE_CALL(...) printf(__VA_ARGS__) + +#ifdef ENABLE_TRACE +#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) +extern std::chrono::steady_clock::time_point ncclEpoch; +#else +#define TRACE(...) +#endif + +void ncclSetThreadName(pthread_t thread, const char *fmt, ...); + +#endif diff --git a/nvls/device.h b/nvls/device.h new file mode 100644 index 000000000..56f8039f3 --- /dev/null +++ b/nvls/device.h @@ -0,0 +1,463 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "align.h" +#include + +extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; + +extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; + +extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +#include "net_device.h" + +enum ncclDevRedOp_t { + ncclDevSum, ncclDevProd, ncclDevMinMax, + ncclDevPreMulSum, ncclDevSumPostDiv, + ncclNumDevRedOps +}; +struct ncclDevRedOpFull { + ncclDevRedOp_t op; + ncclRedOp_t proxyOp; + bool scalarArgIsPtr; + uint64_t scalarArg; +}; + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define WARP_SIZE 32 +#define MAXCHANNELS 32 +#define NCCL_MAX_NTHREADS 640 +#define NCCL_SIMPLE_MAX_NTHREADS 512 +#define NCCL_LL_MAX_NTHREADS 512 +#define NCCL_LL_LINES_PER_THREAD 8 +#ifdef TEST_LL_CLEANUP +#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup +#define NCCL_LL_FLAG_MAX 0x100 +#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_NSTEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +#define NCCL_LL128_LINESIZE 128 +#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) +#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) + +#define NCCL_LL128_MAX_NTHREADS 640 +#define NCCL_LL128_ELEMS_PER_THREAD 120 + +#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 +#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) + +#define NCCL_DIRECT_WRITE 0x01 +#define NCCL_DIRECT_READ 0x02 +#define NCCL_DIRECT_NIC 0x04 +#define NCCL_IPC_WRITE 0x08 +#define NCCL_IPC_READ 0x10 +#define NCCL_NVLS_MIN_POLL 0x20 + +struct ncclConnInfo { + // Regular comm mechanism + char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send + void* mhandles[NCCL_NUM_PROTOCOLS]; + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + + int flags; // Direct communication / other flags + int shared; // Buffers are shared + void **ptrExchange; // Pointer exchange for direct communication + uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case + + int *sizesFifo; // Sizes fifo from GPU to proxy + int *offsFifo; // Buffer fifo from proxy to GPU + + uint64_t step; // Keep where we are + uint64_t llLastCleaning; + ncclNetDeviceHandle_t netDeviceHandle; +}; + +struct ncclProxyConnector { + int tpRank; + int tpLocalRank; + int sameProcess; + struct ncclProxyConnection* connection; + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary +}; + +struct ncclConnector { + int connected; + struct ncclProxyConnector proxyConn; + struct ncclTransportComm* transportComm; + void* transportResources; + struct ncclConnInfo conn; +}; + +struct ncclRing { + // Shortcuts for 
userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + + int index; // This rank's index in the ring +}; + + +// The root of each tree only has one node down (+1 intra-node). +#define NCCL_MAX_TREE_ARITY_TOP 2 +// Nodes inside the binary tree can have to two nodes down (+1 intra-node). +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +#define NCCL_MAX_DIRECT_ARITY 7 +struct ncclDirect { + int depth; + int out; + int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down + int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) + int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads + int up[NCCL_MAX_DIRECT_ARITY]; + int down[NCCL_MAX_DIRECT_ARITY]; +}; + +#define NCCL_MAX_NVLS_ARITY 8 +#define NCCL_MAX_NVLS_TREE_ARITY 3 +struct ncclNvls { + int out; + int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down + int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) + int up[NCCL_MAX_NVLS_ARITY]; + int down; + int treeUp; + int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; + int node; + int nNodes; +}; + +#define NCCL_MAX_CONNS 2 +struct ncclChannelPeer { + struct ncclConnector send[NCCL_MAX_CONNS]; + struct ncclConnector recv[NCCL_MAX_CONNS]; + int refCount; +}; + +struct ncclDevComm; + +/* ncclWork is to be a power of two, currently 8x64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclWorkElem. */ +#define NCCL_WORK_SIZE 512 + +enum ncclWorkType : uint8_t { + ncclWorkTypeUnused=0, + ncclWorkTypeColl=1, + ncclWorkTypeP2p=2, + ncclWorkTypeRegColl=3 +}; +enum ncclWorkP2PType : uint8_t { + ncclWorkP2pTypeUnused=0, + ncclWorkP2pTypeSend, + ncclWorkP2pTypeRecv +}; + +struct ncclWorkHeader { + union { + int32_t workNext; // when isLast=0: Offset from kernel argument workHead + uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. + }; + uint16_t funcIndex; + uint8_t isLast:1; // last work for this kernel + uint8_t inFifo:1; // is this work in the fifo + enum ncclWorkType type; +}; + +struct ncclWorkElem { + union { + uint8_t flagBits; + struct { + uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; + }; + }; + uint8_t nWarps; + uint8_t direct; + + const void * sendbuff; + void * recvbuff; + + size_t count; + size_t lastChunkSize; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint64_t redOpArg; +}; + +#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) +static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); + +struct ncclWorkElemP2p { + int peer : 30; + int proto : 2; + + enum ncclWorkP2PType p2pType; + uint8_t nWarps; + uint8_t warpStart; + uint8_t ngroups; + // Important not to use any fields with greater than 4-byte alignment since + // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if + // there were 8-byte fields. 
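+  // As an illustrative sketch (not verbatim from this patch), the host-side enqueue
+  // code is expected to split the 64-bit pointer and count into these halves, e.g.:
+  //   elem->buffLo32  = (uint32_t)(uintptr_t)buff;
+  //   elem->buffHi32  = (uint32_t)((uintptr_t)buff >> 32);
+  //   elem->countLo32 = (uint32_t)count;
+  //   elem->countHi32 = (uint32_t)(count >> 32);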
+ //void* buff; + uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; + //size_t count; + uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; + int chunkSize; +}; + +static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); +#define NCCL_MAX_WORK_ELEMENTS_P2P 16 + +struct ncclWorkElemReg { + struct ncclWorkElem elem; + void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; + void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; + void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; +}; + +#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) +static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); + +// Number of named barriers supported by CUDA +#define NCCL_MAX_GROUPS 16 + +struct ncclWork { + struct ncclWorkHeader header; + union { + char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; + struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; + struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; + struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; + }; +}; +static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); +static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); + +struct ncclDevChannelPeer { + // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo + // instead of the full ncclConnector. + struct ncclConnInfo send[NCCL_MAX_CONNS]; + struct ncclConnInfo recv[NCCL_MAX_CONNS]; +}; + +struct alignas(16) ncclDevChannel { + struct ncclDevChannelPeer** peers; + struct ncclRing ring; + struct ncclTree tree; + struct ncclTree collnetChain; + struct ncclDirect collnetDirect; + struct ncclNvls nvls; + uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed +}; + +struct ncclDevComm { + int rank; + int nRanks; + int buffSizes[NCCL_NUM_PROTOCOLS]; + int p2pChunkSize; + + // Operation list for aggregation + int workFifoDepth; + struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory + + // Flag to ask NCCL kernels to abort + volatile uint32_t* abortFlag; + + // Channels, device side + struct ncclDevChannel* channels/*[MAXCHANNELS]*/; +}; + +struct alignas(16) ncclDevCommAndChannels { + struct ncclDevComm comm; + struct ncclDevChannel channels[MAXCHANNELS]; +}; + +#ifdef __CUDA_ARCH__ + #define NCCL_CUDA_ARCH __CUDA_ARCH__ +#else + #define NCCL_CUDA_ARCH 0 +#endif + +template +__host__ __device__ constexpr T min_constexpr(T a) { return a; } +template +__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { + return min_constexpr((a < b ? a : b), c...); +} + +template +__host__ __device__ constexpr T max_constexpr(T a) { return a; } +template +__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { + return max_constexpr((a > b ? 
a : b), c...); +} + +// Calculate the unroll factor given: +// * bytePerPack: number of bytes accessed per instruction +// * insns: max permissible unroll value +// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) +__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { + return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); +} + +// Note that all unroll value logic should depend on a given cudaArch argument +// and not __CUDA_ARCH__ since these need to be host-side executable where the +// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device +// side code can elide passing the arch for brevity. + +__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { + // Our collective unroll should move to the same bytes&insns model as NVLS. + return cudaArch >= 800 ? 8 : 4; +} + +__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } +__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } + +__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { + return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); +} + +// The amount of dynamic shmem per warp +__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { + return (max_constexpr( + /*LL */0, + /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), + /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, + // NVLS needs an extra 16B to read unaligned data. + /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 + ) + 15) & -16; // pad to 16 bytes +} + +// The amount of dynamic shmem per block +__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { + return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); +} + +// Host-side table of kernel function pointers. +extern int const ncclDevKernelCount; +extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; + +// Table of most specialized kernel function to run given func index. +extern int const ncclDevFuncRowToId[]; +extern void* const ncclDevKernelForFunc[/*funcIndex*/]; +extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; + +// Launch a one-rank reduction on stream. 
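+// A hypothetical call (buffer and count names are illustrative) for a one-rank float sum:
+//   struct ncclDevRedOpFull op = { ncclDevSum, ncclSum, /*scalarArgIsPtr=*/false, /*scalarArg=*/0 };
+//   NCCLCHECK(ncclLaunchOneRank(recvbuff, sendbuff, count, op, ncclFloat, stream));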
+ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); + +// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" +inline bool ncclNvlsSupported(int devRedOp, int type) { + switch (type) { + case ncclInt32: + case ncclUint32: + case ncclInt64: + case ncclUint64: + case ncclFloat16: + #if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: + #endif + return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; + case ncclFloat: + case ncclDouble: + return devRedOp == ncclDevSum; + default: + return false; + } +} + +// `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" +inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { + #if defined(__CUDA_BF16_TYPES_EXIST__) + constexpr int NumTypes = ncclNumTypes; + #else + constexpr int NumTypes = ncclNumTypes + 1; + #endif + + int row = 0; // ncclDevFuncIndex_P2p + if (coll == ncclFuncSendRecv) goto have_row; + row += 1; + + if (coll == ncclFuncAllGather) { + int algo1 = algo == NCCL_ALGO_RING ? 0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += algo1*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncBroadcast) { + row += proto; + goto have_row; + } + row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncAllReduce) { + row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduce) { + row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; + + if (coll == ncclFuncReduceScatter) { + int algo1 = algo == NCCL_ALGO_RING ? 0 : + /*algo == NCCL_ALGO_NVLS*/ 1; + row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; + goto have_row; + } + row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; + +have_row: + return ncclDevFuncRowToId[row]; +} + +inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } + +#endif diff --git a/nvls/enqueue.h b/nvls/enqueue.h new file mode 100644 index 000000000..634f037cb --- /dev/null +++ b/nvls/enqueue.h @@ -0,0 +1,26 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ENQUEUE_H_ +#define NCCL_ENQUEUE_H_ + +#include "comm.h" +#include "group.h" +#include "collectives.h" +#include "utils.h" + +#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) +#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ + +ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); +ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); +ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); +ncclResult_t ncclLaunchFinish(struct ncclComm* comm); + +#endif // End include guard diff --git a/nvls/gdrwrap.h b/nvls/gdrwrap.h new file mode 100644 index 000000000..a64674cc5 --- /dev/null +++ b/nvls/gdrwrap.h @@ -0,0 +1,252 @@ +/************************************************************************* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GDRWRAP_H_ +#define NCCL_GDRWRAP_H_ + +#include "nccl.h" +#include // for standard [u]intX_t types +#include +#include + +// These can be used if the GDR library isn't thread safe +#include +extern pthread_mutex_t gdrLock; +#define GDRLOCK() pthread_mutex_lock(&gdrLock) +#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) +#define GDRLOCKCALL(cmd, ret) do { \ + GDRLOCK(); \ + ret = cmd; \ + GDRUNLOCK(); \ +} while(false) + +#define GDRCHECK(cmd) do { \ + int e; \ + /* GDRLOCKCALL(cmd, e); */ \ + e = cmd; \ + if( e != 0 ) { \ + WARN("GDRCOPY failure %d", e); \ + return ncclSystemError; \ + } \ +} while(false) + +// This is required as the GDR memory is mapped WC +#if !defined(__NVCC__) +#if defined(__PPC__) +static inline void wc_store_fence(void) { asm volatile("sync") ; } +#elif defined(__x86_64__) +#include +static inline void wc_store_fence(void) { _mm_sfence(); } +#elif defined(__aarch64__) +#ifdef __cplusplus +#include +static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } +#else +#include +static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } +#endif +#endif +#endif + +//#define GDR_DIRECT 1 +#ifdef GDR_DIRECT +// Call the GDR API library code directly rather than via +// dlopen() wrappers +#include + +static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } +static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } +static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } +static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { + GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { + GDRCHECK(gdr_unpin_buffer(g, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { + GDRCHECK(gdr_get_info(g, handle, info)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, 
size_t size) {
+  GDRCHECK(gdr_map(g, handle, va, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
+  GDRCHECK(gdr_unmap(g, handle, va, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) {
+  gdr_runtime_get_version(major, minor);
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
+  gdr_driver_get_version(g, major, minor);
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
+  GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
+  return ncclSuccess;
+}
+static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
+  GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
+  return ncclSuccess;
+}
+
+#else
+// Dynamically handle dependency on the GDR API library
+
+/* Extracted from gdrapi.h (v2.1 Nov 2020) */
+
+#define GPU_PAGE_SHIFT   16
+#define GPU_PAGE_SIZE    (1UL << GPU_PAGE_SHIFT)
+#define GPU_PAGE_OFFSET  (GPU_PAGE_SIZE-1)
+#define GPU_PAGE_MASK    (~GPU_PAGE_OFFSET)
+
+struct gdr;
+typedef struct gdr *gdr_t;
+
+typedef struct gdr_mh_s {
+  unsigned long h;
+} gdr_mh_t;
+
+struct gdr_info {
+  uint64_t va;
+  uint64_t mapped_size;
+  uint32_t page_size;
+  uint64_t tm_cycles;
+  uint32_t cycles_per_ms;
+  unsigned mapped:1;
+  unsigned wc_mapping:1;
+};
+typedef struct gdr_info gdr_info_t;
+
+/* End of gdrapi.h */
+
+ncclResult_t wrap_gdr_symbols(void);
+
+gdr_t wrap_gdr_open(void);
+ncclResult_t wrap_gdr_close(gdr_t g);
+ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
+ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
+ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
+ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
+ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
+ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
+ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
+
+#endif // GDR_DIRECT
+
+// Global GDR driver handle
+extern gdr_t ncclGdrCopy;
+
+#include "alloc.h"
+
+typedef struct gdr_mem_desc {
+  void *gdrDevMem;
+  void *gdrMap;
+  size_t gdrOffset;
+  size_t gdrMapSize;
+  gdr_mh_t gdrMh;
+} gdr_mem_desc_t;
+
+static gdr_t ncclGdrInit() {
+  int libMajor, libMinor, drvMajor, drvMinor;
+  gdr_t handle = NULL;
+  // Dynamically load the GDRAPI library symbols
+  if (wrap_gdr_symbols() == ncclSuccess) {
+    handle = wrap_gdr_open();
+
+    if (handle != NULL) {
+      ncclResult_t res;
+
+      // Query the version of libgdrapi
+      NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
+
+      // Query the version of gdrdrv driver
+      NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
+
+      // Only support GDRAPI 2.1 and later
+      if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
+        goto error;
+      }
+      else
+        INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
+    }
+
} + return handle; +error: + if (handle != NULL) (void) wrap_gdr_close(handle); + return NULL; +} + +template +static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { + gdr_info_t info; + size_t mapSize; + gdr_mh_t mh; + char *devMem; + void *gdrMap; + + mapSize = sizeof(T)*nelem; + + // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE + ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); + // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); + uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; + size_t align = alignedAddr - (uint64_t)devMem; + + //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); + NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); + + NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); + //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); + + NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); + + // Will offset ever be non zero ? + ssize_t off = info.va - alignedAddr; + + gdr_mem_desc_t* md; + NCCLCHECK(ncclCalloc(&md, 1)); + md->gdrDevMem = devMem; + md->gdrMap = gdrMap; + md->gdrMapSize = mapSize; + md->gdrOffset = off+align; + md->gdrMh = mh; + *gdrHandle = md; + + *ptr = (T *)((char *)gdrMap+off); + if (devPtr) *devPtr = (T *)(devMem+off+align); + + TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", + md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); + + return ncclSuccess; +} + +template +static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); + return ncclSuccess; +} + +static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); + NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); + NCCLCHECK(ncclCudaFree(md->gdrDevMem)); + free(md); + + return ncclSuccess; +} + +#endif // End include guard diff --git a/nvls/graph.h b/nvls/graph.h new file mode 100644 index 000000000..fdd634894 --- /dev/null +++ b/nvls/graph.h @@ -0,0 +1,116 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GRAPH_H_ +#define NCCL_GRAPH_H_ + +#include "nccl.h" +#include "device.h" +#include +#include +#include +#include +#include + +ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); + +struct ncclTopoSystem; +// Build the topology +ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); +ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); +ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); + +ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); +void ncclTopoFree(struct ncclTopoSystem* system); +ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); +ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); +ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); +int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); + +// Query topology +ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); +ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); +ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); +ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); +int ncclPxnDisable(struct ncclComm* comm); +ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); + +// Find CPU affinity +ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); + +#define NCCL_TOPO_CPU_ARCH_X86 1 +#define NCCL_TOPO_CPU_ARCH_POWER 2 +#define NCCL_TOPO_CPU_ARCH_ARM 3 +#define NCCL_TOPO_CPU_VENDOR_INTEL 1 +#define NCCL_TOPO_CPU_VENDOR_AMD 2 +#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 +#define NCCL_TOPO_CPU_TYPE_BDW 1 +#define NCCL_TOPO_CPU_TYPE_SKL 2 +#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 +ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); +ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); +ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); +ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); + +#define NCCL_TOPO_MAX_NODES 256 + +// Init search. 
Needs to be done before calling ncclTopoCompute +ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); + +#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) +#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) +#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU +#define NCCL_TOPO_PATTERN_RING 4 // Ring +#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree +struct ncclTopoGraph { + // Input / output + int id; // ring : 0, tree : 1, collnet : 2 + int pattern; + int crossNic; + int collNet; + int minChannels; + int maxChannels; + // Output + int nChannels; + float bwIntra; + float bwInter; + float latencyInter; + int typeIntra; + int typeInter; + int sameChannels; + int nHops; + int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; + int inter[MAXCHANNELS*2]; +}; +ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); + +ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); +ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); + +struct ncclTopoRanks { + int ringRecv[MAXCHANNELS]; + int ringSend[MAXCHANNELS]; + int ringPrev[MAXCHANNELS]; + int ringNext[MAXCHANNELS]; + int treeToParent[MAXCHANNELS]; + int treeToChild0[MAXCHANNELS]; + int treeToChild1[MAXCHANNELS]; + int nvlsHeads[MAXCHANNELS]; +}; + +ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, + struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs); + +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); +#include "info.h" +ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); + +#endif diff --git a/nvls/group.h b/nvls/group.h new file mode 100644 index 000000000..72251147f --- /dev/null +++ b/nvls/group.h @@ -0,0 +1,137 @@ +/************************************************************************* + * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GROUP_H_ +#define NCCL_GROUP_H_ + +#include "nccl.h" +#include "comm.h" + +ncclResult_t ncclGroupErrCheck(ncclResult_t ret); +void ncclGroupCommJoin(struct ncclComm* comm); +void ncclGroupCommPreconnect(struct ncclComm* comm); +ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); +ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); +ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); + +typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); + +ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); + +typedef enum ncclGroupJobState { + ncclGroupJobRunning = 0, + ncclGroupJobDone = 1, + ncclGroupJobJoined = 2, +} ncclGroupJobState_t; + +struct ncclAsyncJob { + struct ncclAsyncJob* next; + pthread_t thread; + ncclResult_t result; + ncclResult_t(*func)(struct ncclAsyncJob*); + void(*undo)(struct ncclAsyncJob*); + void(*destructor)(void*); + ncclGroupJobState_t state; + volatile uint32_t *abortFlag; /* point to comm abortFlag */ + volatile uint32_t *childAbortFlag; /* point to child abortFlag */ + ncclComm_t comm; +}; + +ncclResult_t ncclAsyncLaunch( + struct ncclAsyncJob* job, + ncclResult_t(*func)(struct ncclAsyncJob*), + void(*undo)(struct ncclAsyncJob*), + void(*destructor)(void*), ncclComm_t comm +); + +struct ncclGroupJob { + struct ncclAsyncJob base; + struct ncclComm **groupCommHeadPtr; + struct ncclComm **groupCommPreconnectHeadPtr; + ncclResult_t *groupErrorPtr; + volatile bool *abortFlagPtr; + int *groupBlockingPtr; + struct ncclIntruQueue *asyncJobsPtr; + bool initialized; +}; + +ncclResult_t ncclGroupStartInternal(); +ncclResult_t ncclGroupEndInternal(); +ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); + +//////////////////////////////////////////////////////////////////////////////// + +extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting +extern __thread ncclResult_t ncclGroupError; +extern __thread struct ncclComm* ncclGroupCommHead; +extern __thread struct ncclComm* ncclGroupCommPreconnectHead; +extern __thread int ncclGroupBlocking; +extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; +extern __thread struct ncclGroupJob ncclGroupJobMain; + +static inline void groupResetJobState() { + ncclGroupBlocking = -1; + ncclGroupJobMainPtr = NULL; + memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); + return; +} + +static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { + ncclResult_t ret = ncclSuccess; + if (job) { + ret = ncclAsyncJobComplete(&job->base); + groupResetJobState(); + } + return ret; +} + +inline ncclResult_t ncclGroupStartInternal() { + ncclGroupDepth++; + return ncclSuccess; +} + +inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { + if (ncclGroupDepth > 0) { + if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; + } + return ret; +} + +// Add comm to this thread's group +inline void ncclGroupCommJoin(struct ncclComm* comm) { + if (comm->groupNext == reinterpret_cast(0x1)) { + // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves + // the users program order yet insures siblings occur consecutively. This + // is required by doLaunches() in "group.cc". 
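+    // Concretely, the loop below walks the list until it hits the first comm sharing
+    // this comm's intraComm0 (or reaches the end of the list) and splices the new comm
+    // in at that position, so sibling comms always end up adjacent.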
+ struct ncclComm** pp = &ncclGroupCommHead; + while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) + pp = &(*pp)->groupNext; + comm->groupNext = *pp; + *pp = comm; + // Comms gets a new memory stack scope upon joining. Each task batched for + // this comm is allocated there. + ncclMemoryStackPush(&comm->memScoped); + } + + ncclGroupBlocking = comm->config.blocking; +} + +// Add comm to this thread's group needing preconnect +inline void ncclGroupCommPreconnect(struct ncclComm* comm) { + if (comm->preconnectNext == reinterpret_cast(0x1)) { + comm->preconnectNext = ncclGroupCommPreconnectHead; + ncclGroupCommPreconnectHead = comm; + } +} + +// Comm has left group +inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { + comm->groupNext = reinterpret_cast(0x1); + ncclMemoryStackPop(&comm->memScoped); + return ncclSuccess; +} + +#endif diff --git a/nvls/ibvcore.h b/nvls/ibvcore.h new file mode 100644 index 000000000..8d8ecf1ec --- /dev/null +++ b/nvls/ibvcore.h @@ -0,0 +1,1058 @@ +#ifndef NCCL_IBV_CORE_H_ +#define NCCL_IBV_CORE_H_ + +/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without + * explicit including of IB verbs header. + */ + +#include +#include +#include +#include + +#if __GNUC__ >= 3 +# define __attribute_const __attribute__((const)) +#else +# define __attribute_const +#endif + +union ibv_gid { + uint8_t raw[16]; + struct { + uint64_t subnet_prefix; + uint64_t interface_id; + } global; +}; + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) \ + ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) +#endif + +#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; + +enum ibv_node_type { + IBV_NODE_UNKNOWN = -1, + IBV_NODE_CA = 1, + IBV_NODE_SWITCH, + IBV_NODE_ROUTER, + IBV_NODE_RNIC, + + /* Leave a gap for future node types before starting with + * experimental node types. + */ + IBV_EXP_NODE_TYPE_START = 32, + IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START +}; + +enum ibv_transport_type { + IBV_TRANSPORT_UNKNOWN = -1, + IBV_TRANSPORT_IB = 0, + IBV_TRANSPORT_IWARP, + + /* Leave a gap for future transport types before starting with + * experimental transport types. 
+ */ + IBV_EXP_TRANSPORT_TYPE_START = 32, + IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START +}; + +enum ibv_device_cap_flags { + IBV_DEVICE_RESIZE_MAX_WR = 1, + IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, + IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, + IBV_DEVICE_RAW_MULTI = 1 << 3, + IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, + IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, + IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, + IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, + IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, + IBV_DEVICE_INIT_TYPE = 1 << 9, + IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, + IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, + IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, + IBV_DEVICE_SRQ_RESIZE = 1 << 13, + IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, + IBV_DEVICE_XRC = 1 << 20, + IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 +}; + +enum ibv_atomic_cap { + IBV_ATOMIC_NONE, + IBV_ATOMIC_HCA, + IBV_ATOMIC_GLOB +}; + +struct ibv_device_attr { + char fw_ver[64]; + uint64_t node_guid; + uint64_t sys_image_guid; + uint64_t max_mr_size; + uint64_t page_size_cap; + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t hw_ver; + int max_qp; + int max_qp_wr; + int device_cap_flags; + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_qp_rd_atom; + int max_ee_rd_atom; + int max_res_rd_atom; + int max_qp_init_rd_atom; + int max_ee_init_rd_atom; + enum ibv_atomic_cap atomic_cap; + int max_ee; + int max_rdd; + int max_mw; + int max_raw_ipv6_qp; + int max_raw_ethy_qp; + int max_mcast_grp; + int max_mcast_qp_attach; + int max_total_mcast_qp_attach; + int max_ah; + int max_fmr; + int max_map_per_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; + uint16_t max_pkeys; + uint8_t local_ca_ack_delay; + uint8_t phys_port_cnt; +}; + +enum ibv_mtu { + IBV_MTU_256 = 1, + IBV_MTU_512 = 2, + IBV_MTU_1024 = 3, + IBV_MTU_2048 = 4, + IBV_MTU_4096 = 5 +}; + +enum ibv_port_state { + IBV_PORT_NOP = 0, + IBV_PORT_DOWN = 1, + IBV_PORT_INIT = 2, + IBV_PORT_ARMED = 3, + IBV_PORT_ACTIVE = 4, + IBV_PORT_ACTIVE_DEFER = 5 +}; + +enum { + IBV_LINK_LAYER_UNSPECIFIED, + IBV_LINK_LAYER_INFINIBAND, + IBV_LINK_LAYER_ETHERNET, + + /* Leave a gap for future link layer types before starting with + * experimental link layer. 
+ */ + IBV_EXP_LINK_LAYER_START = 32, + IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START +}; + +enum ibv_port_cap_flags { + IBV_PORT_SM = 1 << 1, + IBV_PORT_NOTICE_SUP = 1 << 2, + IBV_PORT_TRAP_SUP = 1 << 3, + IBV_PORT_OPT_IPD_SUP = 1 << 4, + IBV_PORT_AUTO_MIGR_SUP = 1 << 5, + IBV_PORT_SL_MAP_SUP = 1 << 6, + IBV_PORT_MKEY_NVRAM = 1 << 7, + IBV_PORT_PKEY_NVRAM = 1 << 8, + IBV_PORT_LED_INFO_SUP = 1 << 9, + IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, + IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, + IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, + IBV_PORT_CM_SUP = 1 << 16, + IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, + IBV_PORT_REINIT_SUP = 1 << 18, + IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, + IBV_PORT_VENDOR_CLASS = 1 << 24, + IBV_PORT_CLIENT_REG_SUP = 1 << 25, + IBV_PORT_IP_BASED_GIDS = 1 << 26, +}; + +struct ibv_port_attr { + enum ibv_port_state state; + enum ibv_mtu max_mtu; + enum ibv_mtu active_mtu; + int gid_tbl_len; + uint32_t port_cap_flags; + uint32_t max_msg_sz; + uint32_t bad_pkey_cntr; + uint32_t qkey_viol_cntr; + uint16_t pkey_tbl_len; + uint16_t lid; + uint16_t sm_lid; + uint8_t lmc; + uint8_t max_vl_num; + uint8_t sm_sl; + uint8_t subnet_timeout; + uint8_t init_type_reply; + uint8_t active_width; + uint8_t active_speed; + uint8_t phys_state; + uint8_t link_layer; + uint8_t reserved; +}; + +enum ibv_event_type { + IBV_EVENT_CQ_ERR, + IBV_EVENT_QP_FATAL, + IBV_EVENT_QP_REQ_ERR, + IBV_EVENT_QP_ACCESS_ERR, + IBV_EVENT_COMM_EST, + IBV_EVENT_SQ_DRAINED, + IBV_EVENT_PATH_MIG, + IBV_EVENT_PATH_MIG_ERR, + IBV_EVENT_DEVICE_FATAL, + IBV_EVENT_PORT_ACTIVE, + IBV_EVENT_PORT_ERR, + IBV_EVENT_LID_CHANGE, + IBV_EVENT_PKEY_CHANGE, + IBV_EVENT_SM_CHANGE, + IBV_EVENT_SRQ_ERR, + IBV_EVENT_SRQ_LIMIT_REACHED, + IBV_EVENT_QP_LAST_WQE_REACHED, + IBV_EVENT_CLIENT_REREGISTER, + IBV_EVENT_GID_CHANGE, + + /* new experimental events start here leaving enough + * room for 14 events which should be enough + */ + IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, + IBV_EXP_EVENT_DCT_ACCESS_ERR, + IBV_EXP_EVENT_DCT_REQ_ERR, +}; + +struct ibv_async_event { + union { + struct ibv_cq *cq; + struct ibv_qp *qp; + struct ibv_srq *srq; + struct ibv_exp_dct *dct; + int port_num; + /* For source compatible with Legacy API */ + uint32_t xrc_qp_num; + } element; + enum ibv_event_type event_type; +}; + +enum ibv_wc_status { + IBV_WC_SUCCESS, + IBV_WC_LOC_LEN_ERR, + IBV_WC_LOC_QP_OP_ERR, + IBV_WC_LOC_EEC_OP_ERR, + IBV_WC_LOC_PROT_ERR, + IBV_WC_WR_FLUSH_ERR, + IBV_WC_MW_BIND_ERR, + IBV_WC_BAD_RESP_ERR, + IBV_WC_LOC_ACCESS_ERR, + IBV_WC_REM_INV_REQ_ERR, + IBV_WC_REM_ACCESS_ERR, + IBV_WC_REM_OP_ERR, + IBV_WC_RETRY_EXC_ERR, + IBV_WC_RNR_RETRY_EXC_ERR, + IBV_WC_LOC_RDD_VIOL_ERR, + IBV_WC_REM_INV_RD_REQ_ERR, + IBV_WC_REM_ABORT_ERR, + IBV_WC_INV_EECN_ERR, + IBV_WC_INV_EEC_STATE_ERR, + IBV_WC_FATAL_ERR, + IBV_WC_RESP_TIMEOUT_ERR, + IBV_WC_GENERAL_ERR +}; +const char *ibv_wc_status_str(enum ibv_wc_status status); + +enum ibv_wc_opcode { + IBV_WC_SEND, + IBV_WC_RDMA_WRITE, + IBV_WC_RDMA_READ, + IBV_WC_COMP_SWAP, + IBV_WC_FETCH_ADD, + IBV_WC_BIND_MW, +/* + * Set value of IBV_WC_RECV so consumers can test if a completion is a + * receive by testing (opcode & IBV_WC_RECV). 
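+ * For example, both IBV_WC_RECV and IBV_WC_RECV_RDMA_WITH_IMM satisfy
+ * (opcode & IBV_WC_RECV) != 0, while the send-side opcodes above do not.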
+ */ + IBV_WC_RECV = 1 << 7, + IBV_WC_RECV_RDMA_WITH_IMM +}; + +enum ibv_wc_flags { + IBV_WC_GRH = 1 << 0, + IBV_WC_WITH_IMM = 1 << 1 +}; + +struct ibv_wc { + uint64_t wr_id; + enum ibv_wc_status status; + enum ibv_wc_opcode opcode; + uint32_t vendor_err; + uint32_t byte_len; + uint32_t imm_data; /* in network byte order */ + uint32_t qp_num; + uint32_t src_qp; + int wc_flags; + uint16_t pkey_index; + uint16_t slid; + uint8_t sl; + uint8_t dlid_path_bits; +}; + +enum ibv_access_flags { + IBV_ACCESS_LOCAL_WRITE = 1, + IBV_ACCESS_REMOTE_WRITE = (1<<1), + IBV_ACCESS_REMOTE_READ = (1<<2), + IBV_ACCESS_REMOTE_ATOMIC = (1<<3), + IBV_ACCESS_MW_BIND = (1<<4), + IBV_ACCESS_RELAXED_ORDERING = (1<<20), +}; + +struct ibv_pd { + struct ibv_context *context; + uint32_t handle; +}; + +enum ibv_xrcd_init_attr_mask { + IBV_XRCD_INIT_ATTR_FD = 1 << 0, + IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, + IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_xrcd_init_attr { + uint32_t comp_mask; + int fd; + int oflags; +}; + +struct ibv_xrcd { + struct ibv_context *context; +}; + +enum ibv_rereg_mr_flags { + IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), + IBV_REREG_MR_CHANGE_PD = (1 << 1), + IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), + IBV_REREG_MR_KEEP_VALID = (1 << 3) +}; + +struct ibv_mr { + struct ibv_context *context; + struct ibv_pd *pd; + void *addr; + size_t length; + uint32_t handle; + uint32_t lkey; + uint32_t rkey; +}; + +enum ibv_mw_type { + IBV_MW_TYPE_1 = 1, + IBV_MW_TYPE_2 = 2 +}; + +struct ibv_mw { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t rkey; +}; + +struct ibv_global_route { + union ibv_gid dgid; + uint32_t flow_label; + uint8_t sgid_index; + uint8_t hop_limit; + uint8_t traffic_class; +}; + +struct ibv_grh { + uint32_t version_tclass_flow; + uint16_t paylen; + uint8_t next_hdr; + uint8_t hop_limit; + union ibv_gid sgid; + union ibv_gid dgid; +}; + +enum ibv_rate { + IBV_RATE_MAX = 0, + IBV_RATE_2_5_GBPS = 2, + IBV_RATE_5_GBPS = 5, + IBV_RATE_10_GBPS = 3, + IBV_RATE_20_GBPS = 6, + IBV_RATE_30_GBPS = 4, + IBV_RATE_40_GBPS = 7, + IBV_RATE_60_GBPS = 8, + IBV_RATE_80_GBPS = 9, + IBV_RATE_120_GBPS = 10, + IBV_RATE_14_GBPS = 11, + IBV_RATE_56_GBPS = 12, + IBV_RATE_112_GBPS = 13, + IBV_RATE_168_GBPS = 14, + IBV_RATE_25_GBPS = 15, + IBV_RATE_100_GBPS = 16, + IBV_RATE_200_GBPS = 17, + IBV_RATE_300_GBPS = 18 +}; + +/** + * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the + * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be + * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. + * @rate: rate to convert. + */ +int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; + +/** + * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. + * @mult: multiple to convert. + */ +enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; + +/** + * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. + * For example, IBV_RATE_5_GBPS will return the value 5000. + * @rate: rate to convert. + */ +int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; + +/** + * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. + * @mbps: value to convert. 
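+ * For example, mbps_to_ibv_rate(5000) yields IBV_RATE_5_GBPS.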
+ */ +enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; + +struct ibv_ah_attr { + struct ibv_global_route grh; + uint16_t dlid; + uint8_t sl; + uint8_t src_path_bits; + uint8_t static_rate; + uint8_t is_global; + uint8_t port_num; +}; + +enum ibv_srq_attr_mask { + IBV_SRQ_MAX_WR = 1 << 0, + IBV_SRQ_LIMIT = 1 << 1 +}; + +struct ibv_srq_attr { + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_limit; +}; + +struct ibv_srq_init_attr { + void *srq_context; + struct ibv_srq_attr attr; +}; + +enum ibv_srq_type { + IBV_SRQT_BASIC, + IBV_SRQT_XRC +}; + +enum ibv_srq_init_attr_mask { + IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, + IBV_SRQ_INIT_ATTR_PD = 1 << 1, + IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, + IBV_SRQ_INIT_ATTR_CQ = 1 << 3, + IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_srq_init_attr_ex { + void *srq_context; + struct ibv_srq_attr attr; + + uint32_t comp_mask; + enum ibv_srq_type srq_type; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; + struct ibv_cq *cq; +}; + +enum ibv_qp_type { + IBV_QPT_RC = 2, + IBV_QPT_UC, + IBV_QPT_UD, + /* XRC compatible code */ + IBV_QPT_XRC, + IBV_QPT_RAW_PACKET = 8, + IBV_QPT_RAW_ETH = 8, + IBV_QPT_XRC_SEND = 9, + IBV_QPT_XRC_RECV, + + /* Leave a gap for future qp types before starting with + * experimental qp types. + */ + IBV_EXP_QP_TYPE_START = 32, + IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START +}; + +struct ibv_qp_cap { + uint32_t max_send_wr; + uint32_t max_recv_wr; + uint32_t max_send_sge; + uint32_t max_recv_sge; + uint32_t max_inline_data; +}; + +struct ibv_qp_init_attr { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + /* Below is needed for backwards compatabile */ + struct ibv_xrc_domain *xrc_domain; +}; + +enum ibv_qp_init_attr_mask { + IBV_QP_INIT_ATTR_PD = 1 << 0, + IBV_QP_INIT_ATTR_XRCD = 1 << 1, + IBV_QP_INIT_ATTR_RESERVED = 1 << 2 +}; + +struct ibv_qp_init_attr_ex { + void *qp_context; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + struct ibv_qp_cap cap; + enum ibv_qp_type qp_type; + int sq_sig_all; + + uint32_t comp_mask; + struct ibv_pd *pd; + struct ibv_xrcd *xrcd; +}; + +enum ibv_qp_open_attr_mask { + IBV_QP_OPEN_ATTR_NUM = 1 << 0, + IBV_QP_OPEN_ATTR_XRCD = 1 << 1, + IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, + IBV_QP_OPEN_ATTR_TYPE = 1 << 3, + IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 +}; + +struct ibv_qp_open_attr { + uint32_t comp_mask; + uint32_t qp_num; + struct ibv_xrcd *xrcd; + void *qp_context; + enum ibv_qp_type qp_type; +}; + +enum ibv_qp_attr_mask { + IBV_QP_STATE = 1 << 0, + IBV_QP_CUR_STATE = 1 << 1, + IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, + IBV_QP_ACCESS_FLAGS = 1 << 3, + IBV_QP_PKEY_INDEX = 1 << 4, + IBV_QP_PORT = 1 << 5, + IBV_QP_QKEY = 1 << 6, + IBV_QP_AV = 1 << 7, + IBV_QP_PATH_MTU = 1 << 8, + IBV_QP_TIMEOUT = 1 << 9, + IBV_QP_RETRY_CNT = 1 << 10, + IBV_QP_RNR_RETRY = 1 << 11, + IBV_QP_RQ_PSN = 1 << 12, + IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, + IBV_QP_ALT_PATH = 1 << 14, + IBV_QP_MIN_RNR_TIMER = 1 << 15, + IBV_QP_SQ_PSN = 1 << 16, + IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, + IBV_QP_PATH_MIG_STATE = 1 << 18, + IBV_QP_CAP = 1 << 19, + IBV_QP_DEST_QPN = 1 << 20 +}; + +enum ibv_qp_state { + IBV_QPS_RESET, + IBV_QPS_INIT, + IBV_QPS_RTR, + IBV_QPS_RTS, + IBV_QPS_SQD, + IBV_QPS_SQE, + IBV_QPS_ERR, + IBV_QPS_UNKNOWN +}; + +enum ibv_mig_state { + IBV_MIG_MIGRATED, + IBV_MIG_REARM, + IBV_MIG_ARMED +}; + +struct ibv_qp_attr { + enum ibv_qp_state qp_state; + enum ibv_qp_state cur_qp_state; + enum ibv_mtu 
path_mtu; + enum ibv_mig_state path_mig_state; + uint32_t qkey; + uint32_t rq_psn; + uint32_t sq_psn; + uint32_t dest_qp_num; + int qp_access_flags; + struct ibv_qp_cap cap; + struct ibv_ah_attr ah_attr; + struct ibv_ah_attr alt_ah_attr; + uint16_t pkey_index; + uint16_t alt_pkey_index; + uint8_t en_sqd_async_notify; + uint8_t sq_draining; + uint8_t max_rd_atomic; + uint8_t max_dest_rd_atomic; + uint8_t min_rnr_timer; + uint8_t port_num; + uint8_t timeout; + uint8_t retry_cnt; + uint8_t rnr_retry; + uint8_t alt_port_num; + uint8_t alt_timeout; +}; + +enum ibv_wr_opcode { + IBV_WR_RDMA_WRITE, + IBV_WR_RDMA_WRITE_WITH_IMM, + IBV_WR_SEND, + IBV_WR_SEND_WITH_IMM, + IBV_WR_RDMA_READ, + IBV_WR_ATOMIC_CMP_AND_SWP, + IBV_WR_ATOMIC_FETCH_AND_ADD +}; + +enum ibv_send_flags { + IBV_SEND_FENCE = 1 << 0, + IBV_SEND_SIGNALED = 1 << 1, + IBV_SEND_SOLICITED = 1 << 2, + IBV_SEND_INLINE = 1 << 3 +}; + +struct ibv_sge { + uint64_t addr; + uint32_t length; + uint32_t lkey; +}; + +struct ibv_send_wr { + uint64_t wr_id; + struct ibv_send_wr *next; + struct ibv_sge *sg_list; + int num_sge; + enum ibv_wr_opcode opcode; + int send_flags; + uint32_t imm_data; /* in network byte order */ + union { + struct { + uint64_t remote_addr; + uint32_t rkey; + } rdma; + struct { + uint64_t remote_addr; + uint64_t compare_add; + uint64_t swap; + uint32_t rkey; + } atomic; + struct { + struct ibv_ah *ah; + uint32_t remote_qpn; + uint32_t remote_qkey; + } ud; + } wr; + union { + union { + struct { + uint32_t remote_srqn; + } xrc; + } qp_type; + + uint32_t xrc_remote_srq_num; + }; +}; + +struct ibv_recv_wr { + uint64_t wr_id; + struct ibv_recv_wr *next; + struct ibv_sge *sg_list; + int num_sge; +}; + +struct ibv_mw_bind { + uint64_t wr_id; + struct ibv_mr *mr; + void *addr; + size_t length; + int send_flags; + int mw_access_flags; +}; + +struct ibv_srq { + struct ibv_context *context; + void *srq_context; + struct ibv_pd *pd; + uint32_t handle; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; + + /* below are for source compatabilty with legacy XRC, + * padding based on ibv_srq_legacy. 
+ */ + uint32_t xrc_srq_num_bin_compat_padding; + struct ibv_xrc_domain *xrc_domain_bin_compat_padding; + struct ibv_cq *xrc_cq_bin_compat_padding; + void *ibv_srq_padding; + + /* legacy fields */ + uint32_t xrc_srq_num; + struct ibv_xrc_domain *xrc_domain; + struct ibv_cq *xrc_cq; +}; + +/* Not in use in new API, needed for compilation as part of source compat layer */ +enum ibv_event_flags { + IBV_XRC_QP_EVENT_FLAG = 0x80000000, +}; + + + +struct ibv_qp { + struct ibv_context *context; + void *qp_context; + struct ibv_pd *pd; + struct ibv_cq *send_cq; + struct ibv_cq *recv_cq; + struct ibv_srq *srq; + uint32_t handle; + uint32_t qp_num; + enum ibv_qp_state state; + enum ibv_qp_type qp_type; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t events_completed; +}; + +struct ibv_comp_channel { + struct ibv_context *context; + int fd; + int refcnt; +}; + +struct ibv_cq { + struct ibv_context *context; + struct ibv_comp_channel *channel; + void *cq_context; + uint32_t handle; + int cqe; + + pthread_mutex_t mutex; + pthread_cond_t cond; + uint32_t comp_events_completed; + uint32_t async_events_completed; +}; + +struct ibv_ah { + struct ibv_context *context; + struct ibv_pd *pd; + uint32_t handle; +}; + +enum ibv_flow_flags { + IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, + IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, +}; + +enum ibv_flow_attr_type { + /* steering according to rule specifications */ + IBV_FLOW_ATTR_NORMAL = 0x0, + /* default unicast and multicast rule - + * receive all Eth traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, + /* default multicast rule - + * receive all Eth multicast traffic which isn't steered to any QP + */ + IBV_FLOW_ATTR_MC_DEFAULT = 0x2, +}; + +enum ibv_flow_spec_type { + IBV_FLOW_SPEC_ETH = 0x20, + IBV_FLOW_SPEC_IPV4 = 0x30, + IBV_FLOW_SPEC_TCP = 0x40, + IBV_FLOW_SPEC_UDP = 0x41, +}; + +struct ibv_flow_eth_filter { + uint8_t dst_mac[6]; + uint8_t src_mac[6]; + uint16_t ether_type; + /* + * same layout as 802.1q: prio 3, cfi 1, vlan id 12 + */ + uint16_t vlan_tag; +}; + +struct ibv_flow_spec_eth { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_eth_filter val; + struct ibv_flow_eth_filter mask; +}; + +struct ibv_flow_ipv4_filter { + uint32_t src_ip; + uint32_t dst_ip; +}; + +struct ibv_flow_spec_ipv4 { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_ipv4_filter val; + struct ibv_flow_ipv4_filter mask; +}; + +struct ibv_flow_tcp_udp_filter { + uint16_t dst_port; + uint16_t src_port; +}; + +struct ibv_flow_spec_tcp_udp { + enum ibv_flow_spec_type type; + uint16_t size; + struct ibv_flow_tcp_udp_filter val; + struct ibv_flow_tcp_udp_filter mask; +}; + +struct ibv_flow_spec { + union { + struct { + enum ibv_flow_spec_type type; + uint16_t size; + } hdr; + struct ibv_flow_spec_eth eth; + struct ibv_flow_spec_ipv4 ipv4; + struct ibv_flow_spec_tcp_udp tcp_udp; + }; +}; + +struct ibv_flow_attr { + uint32_t comp_mask; + enum ibv_flow_attr_type type; + uint16_t size; + uint16_t priority; + uint8_t num_of_specs; + uint8_t port; + uint32_t flags; + /* Following are the optional layers according to user request + * struct ibv_flow_spec_xxx [L2] + * struct ibv_flow_spec_yyy [L3/L4] + */ +}; + +struct ibv_flow { + uint32_t comp_mask; + struct ibv_context *context; + uint32_t handle; +}; + +struct ibv_device; +struct ibv_context; + +struct ibv_device_ops { + struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); + void (*free_context)(struct ibv_context *context); +}; + +enum { + 
IBV_SYSFS_NAME_MAX = 64, + IBV_SYSFS_PATH_MAX = 256 +}; + +struct ibv_device { + struct ibv_device_ops ops; + enum ibv_node_type node_type; + enum ibv_transport_type transport_type; + /* Name of underlying kernel IB device, eg "mthca0" */ + char name[IBV_SYSFS_NAME_MAX]; + /* Name of uverbs device, eg "uverbs0" */ + char dev_name[IBV_SYSFS_NAME_MAX]; + /* Path to infiniband_verbs class device in sysfs */ + char dev_path[IBV_SYSFS_PATH_MAX]; + /* Path to infiniband class device in sysfs */ + char ibdev_path[IBV_SYSFS_PATH_MAX]; +}; + +struct verbs_device { + struct ibv_device device; /* Must be first */ + size_t sz; + size_t size_of_context; + int (*init_context)(struct verbs_device *device, + struct ibv_context *ctx, int cmd_fd); + void (*uninit_context)(struct verbs_device *device, + struct ibv_context *ctx); + /* future fields added here */ +}; + +struct ibv_context_ops { + int (*query_device)(struct ibv_context *context, + struct ibv_device_attr *device_attr); + int (*query_port)(struct ibv_context *context, uint8_t port_num, + struct ibv_port_attr *port_attr); + struct ibv_pd * (*alloc_pd)(struct ibv_context *context); + int (*dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, + int access); + struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, void *addr, + size_t length, + int access); + int (*dereg_mr)(struct ibv_mr *mr); + struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); + int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind); + int (*dealloc_mw)(struct ibv_mw *mw); + struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector); + int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); + int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); + void (*cq_event)(struct ibv_cq *cq); + int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*destroy_cq)(struct ibv_cq *cq); + struct ibv_srq * (*create_srq)(struct ibv_pd *pd, + struct ibv_srq_init_attr *srq_init_attr); + int (*modify_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr, + int srq_attr_mask); + int (*query_srq)(struct ibv_srq *srq, + struct ibv_srq_attr *srq_attr); + int (*destroy_srq)(struct ibv_srq *srq); + int (*post_srq_recv)(struct ibv_srq *srq, + struct ibv_recv_wr *recv_wr, + struct ibv_recv_wr **bad_recv_wr); + struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); + int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask, + struct ibv_qp_init_attr *init_attr); + int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, + int attr_mask); + int (*destroy_qp)(struct ibv_qp *qp); + int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, + struct ibv_send_wr **bad_wr); + int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, + struct ibv_recv_wr **bad_wr); + struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); + int (*destroy_ah)(struct ibv_ah *ah); + int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid); + void (*async_event)(struct ibv_async_event *event); +}; + +struct ibv_context { + struct ibv_device *device; + struct ibv_context_ops ops; + int cmd_fd; + int async_fd; + int num_comp_vectors; + pthread_mutex_t mutex; + void *abi_compat; +}; + +enum verbs_context_mask { + VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, 
+ VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, + VERBS_CONTEXT_QP = (uint64_t)1 << 2, + VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, + VERBS_CONTEXT_EXP = (uint64_t)1 << 62 +}; + +struct verbs_context { + /* "grows up" - new fields go here */ + int (*_reserved_2) (void); + int (*destroy_flow) (struct ibv_flow *flow); + int (*_reserved_1) (void); + struct ibv_flow * (*create_flow) (struct ibv_qp *qp, + struct ibv_flow_attr *flow_attr); + struct ibv_qp * (*open_qp)(struct ibv_context *context, + struct ibv_qp_open_attr *attr); + struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_init_attr_ex); + int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, + struct ibv_srq_init_attr_ex *srq_init_attr_ex); + struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, + struct ibv_xrcd_init_attr *xrcd_init_attr); + int (*close_xrcd)(struct ibv_xrcd *xrcd); + uint64_t has_comp_mask; + size_t sz; /* Must be immediately before struct ibv_context */ + struct ibv_context context;/* Must be last field in the struct */ +}; + +/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ +/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) +{ + return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? + NULL : container_of(ctx, struct verbs_context, context); +} + +#define verbs_get_ctx_op(ctx, op) ({ \ + struct verbs_context *_vctx = verbs_get_ctx(ctx); \ + (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ + !_vctx->op) ? NULL : _vctx; })*/ + +#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ + struct verbs_context *vctx = _vctx; \ + if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ + vctx->op = ptr; }) + +static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) +{ + return (dev->ops.alloc_context) ? + NULL : container_of(dev, struct verbs_device, device); +} + +static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + return qp->context->ops.post_send(qp, wr, bad_wr); +} + +struct ibv_ece { + /* + * Unique identifier of the provider vendor on the network. + * The providers will set IEEE OUI here to distinguish + * itself in non-homogenius network. + */ + uint32_t vendor_id; + /* + * Provider specific attributes which are supported or + * needed to be enabled by ECE users. 
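+ * For example, a provider can use these bits to advertise and negotiate optional
+ * transport features between the two sides during connection establishment.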
+ */ + uint32_t options; + uint32_t comp_mask; +}; + +#endif // NCCL_IBV_CORE_H_ diff --git a/nvls/ibvsymbols.h b/nvls/ibvsymbols.h new file mode 100644 index 000000000..906b0df74 --- /dev/null +++ b/nvls/ibvsymbols.h @@ -0,0 +1,46 @@ +#ifndef NCCL_IBV_SYMBOLS_H_ +#define NCCL_IBV_SYMBOLS_H_ + +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else +#include "ibvcore.h" +#endif + +#include "nccl.h" + +/* IB Verbs Function Pointers*/ +struct ncclIbvSymbols { + int (*ibv_internal_fork_init)(void); + struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); + void (*ibv_internal_free_device_list)(struct ibv_device **list); + const char * (*ibv_internal_get_device_name)(struct ibv_device *device); + struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); + int (*ibv_internal_close_device)(struct ibv_context *context); + int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); + void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); + int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); + int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); + int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); + int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); + struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); + int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); + struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); + struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); + /* DMA-BUF support */ + struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); + int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); + struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); + int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); + struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); + int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); + int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); + const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); + int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); + int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); +}; + +/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ +ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); + +#endif // NCCL_IBV_SYMBOLS_H_ diff --git a/nvls/ibvwrap.h b/nvls/ibvwrap.h new file mode 100644 index 000000000..c3709584c --- /dev/null +++ b/nvls/ibvwrap.h @@ -0,0 +1,92 @@ +/************************************************************************* + * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. + * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. + * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2005 PathScale, Inc. All rights reserved. + * + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_IBVWRAP_H_ +#define NCCL_IBVWRAP_H_ + +#ifdef NCCL_BUILD_RDMA_CORE +#include +#else +#include "ibvcore.h" +#endif + +#include "core.h" +#include +#include + +typedef enum ibv_return_enum +{ + IBV_SUCCESS = 0, //!< The operation was successful +} ibv_return_t; + +ncclResult_t wrap_ibv_symbols(void); +/* NCCL wrappers of IB verbs functions */ +ncclResult_t wrap_ibv_fork_init(void); +ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); +ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); +const char *wrap_ibv_get_device_name(struct ibv_device *device); +ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); +ncclResult_t wrap_ibv_close_device(struct ibv_context *context); +ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); +ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); +ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); +ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); +ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); +ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); +ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); +ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); +ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); +struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); +ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); +/* DMA-BUF support */ +ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); +ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); +ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); +ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); +ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); +ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); +static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { + int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ + if (done < 0) { + WARN("Call to ibv_poll_cq() returned %d", done); + return ncclSystemError; + } + *num_done = done; + return ncclSuccess; +} +ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); +ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); +ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); +ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); +ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, 
struct ibv_ece *ece, int* supported); + +static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { + int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + if (ret != IBV_SUCCESS) { + WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); + return ncclSystemError; + } + return ncclSuccess; +} + +static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { + int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ + if (ret != IBV_SUCCESS) { + WARN("ibv_post_recv() failed with error %s", strerror(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); + +#endif //End include guard diff --git a/nvls/info.h b/nvls/info.h new file mode 100644 index 000000000..f65ed2e69 --- /dev/null +++ b/nvls/info.h @@ -0,0 +1,134 @@ +/************************************************************************* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" +#include "device.h" +#include "collectives.h" +#include "core.h" +#include "utils.h" +#include "strongstream.h" + +typedef enum : uint8_t { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown, + ncclPatternCollnetChain, + ncclPatternCollnetDirect, + ncclPatternNvls, + ncclPatternNvlsTree, + ncclPatternSend, + ncclPatternRecv +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclFunc_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; // peer for p2p operations + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclDevRedOpFull opFull; + int algorithm; + int protocol; + ncclPattern_t pattern; + int nChannels; + int nThreads; + size_t nBytes; + size_t sendbuffSize; + size_t recvbuffSize; + int nstepsPerLoop; + int nchunksPerLoop; + int chunkSize; + int channelId; +}; + +inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { + info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank + + /* compute buffer size for NVLS buffer registration */ + if (info->coll == ncclFuncAllGather) { + info->sendbuffSize = info->count * ncclTypeSize(info->datatype); + info->recvbuffSize = info->sendbuffSize * nRanks; + } else if (info->coll == ncclFuncReduceScatter) { + info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + info->sendbuffSize = info->recvbuffSize * nRanks; + } else { + info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype); + } + return ncclSuccess; +} + +struct 
ncclTaskColl { + struct ncclTaskColl* next; + ncclFunc_t func; + void const* sendbuff; + void* recvbuff; + size_t count; + int root; + ncclDataType_t datatype; + ncclDevRedOpFull op; + int chunkSteps, sliceSteps; +}; +struct ncclTaskP2p { + ncclTaskP2p *next; + void *buff; + size_t bytes; + // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track + // of where it left off. + int chunk; +}; + +struct ncclCudaStreamList { + struct ncclCudaStreamList *next; + cudaStream_t stream; +}; +struct ncclTasks { + struct Peer { + bool sendSeen, recvSeen; + struct ncclIntruQueue sendQueue; + struct ncclIntruQueue recvQueue; + }; + struct ncclIntruQueue collQueue; + size_t collBytesTotal; + struct Peer* peers/*[nRanks]*/; + int *p2pSendOrder, *p2pRecvOrder; + int p2pOrderSteps; + int nTasksColl, nTasksP2p; + + // The list of user streams aggregated over all tasks present. + struct ncclCudaStreamList* streams; + // The most recent user stream. Ignored if streams==nullptr + cudaStream_t streamRecent; + // The graph capturing all user streams or invalid if none. Thus we restrict the + // user that all streams must be captured in the same graph or not captured + // at all. Technically we could probably relax this, but that would mean + // collecting a different `ncclTasks` per graph and one for non-graph. + struct ncclCudaGraph capturingGraph; +}; + +#endif diff --git a/nvls/ipcsocket.cc b/nvls/ipcsocket.cc new file mode 100644 index 000000000..9d66ac719 --- /dev/null +++ b/nvls/ipcsocket.cc @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. + * + * See COPYRIGHT for license information + */ + +#include "ipcsocket.h" +#include "utils.h" +#include +#include +#include + +// Enable Linux abstract socket naming +#define USE_ABSTRACT_SOCKET + +#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" + +/* + * Create a Unix Domain Socket + */ +ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { + int fd = -1; + struct sockaddr_un cliaddr; + char temp[NCCL_IPC_SOCKNAME_LEN] = ""; + + if (handle == NULL) { + return ncclInternalError; + } + + handle->fd = -1; + handle->socketName[0] = '\0'; + if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { + WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); + return ncclSystemError; + } + + bzero(&cliaddr, sizeof(cliaddr)); + cliaddr.sun_family = AF_UNIX; + + // Create unique name for the socket. + int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); + if (len > (sizeof(cliaddr.sun_path) - 1)) { + WARN("UDS: Cannot bind provided name to socket. 
Name too large"); + return ncclInternalError; + } +#ifndef USE_ABSTRACT_SOCKET + unlink(temp); +#endif + + TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); + + strncpy(cliaddr.sun_path, temp, len); +#ifdef USE_ABSTRACT_SOCKET + cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick +#endif + if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { + WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); + close(fd); + return ncclSystemError; + } + + handle->fd = fd; + strcpy(handle->socketName, temp); + + handle->abortFlag = abortFlag; + // Mark socket as non-blocking + if (handle->abortFlag) { + int flags; + EQCHECK(flags = fcntl(fd, F_GETFL), -1); + SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { + if (handle == NULL) { + WARN("ncclSocketGetFd: pass NULL socket"); + return ncclInvalidArgument; + } + if (fd) *fd = handle->fd; + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { + if (handle == NULL) { + return ncclInternalError; + } + if (handle->fd <= 0) { + return ncclSuccess; + } +#ifndef USE_ABSTRACT_SOCKET + if (handle->socketName[0] != '\0') { + unlink(handle->socketName); + } +#endif + close(handle->fd); + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { + struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; + struct iovec iov[1]; + + // Union to guarantee alignment requirements for control array + union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; + } control_un; + + struct cmsghdr *cmptr; + char dummy_buffer[1]; + int ret; + + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } + + msg.msg_iov = iov; + msg.msg_iovlen = 1; + + while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + WARN("UDS: Receiving data over socket failed : %d", errno); + return ncclSystemError; + } + if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; + } + + if (recvFd != NULL) { + if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { + if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { + WARN("UDS: Receiving data over socket failed"); + return ncclSystemError; + } + + memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); + } else { + WARN("UDS: Receiving data over socket %s failed", handle->socketName); + return ncclSystemError; + } + TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { + return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); +} + +ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { + struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; + struct iovec iov[1]; + char temp[NCCL_IPC_SOCKNAME_LEN]; + + union { + struct cmsghdr cm; + char control[CMSG_SPACE(sizeof(int))]; + } control_un; + + struct cmsghdr *cmptr; + char dummy_buffer[1]; + struct sockaddr_un cliaddr; + + // Construct client address to send this shareable handle to + bzero(&cliaddr, sizeof(cliaddr)); + 
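+  // Build the peer's socket name from its rank and hash (the same scheme used
+  // in ncclIpcSocketInit above). If sendFd is valid it is attached below as
+  // SCM_RIGHTS ancillary data, so the kernel duplicates the descriptor into
+  // the receiving process.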
cliaddr.sun_family = AF_UNIX; + + int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); + if (len > (sizeof(cliaddr.sun_path) - 1)) { + WARN("UDS: Cannot connect to provided name for socket. Name too large"); + return ncclInternalError; + } + (void) strncpy(cliaddr.sun_path, temp, len); + +#ifdef USE_ABSTRACT_SOCKET + cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick +#endif + + TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); + + if (sendFd != -1) { + TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); + + msg.msg_control = control_un.control; + msg.msg_controllen = sizeof(control_un.control); + + cmptr = CMSG_FIRSTHDR(&msg); + cmptr->cmsg_len = CMSG_LEN(sizeof(int)); + cmptr->cmsg_level = SOL_SOCKET; + cmptr->cmsg_type = SCM_RIGHTS; + memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); + } + + msg.msg_name = (void *)&cliaddr; + msg.msg_namelen = sizeof(struct sockaddr_un); + + if (hdr == NULL) { + iov[0].iov_base = (void *)dummy_buffer; + iov[0].iov_len = sizeof(dummy_buffer); + } else { + iov[0].iov_base = hdr; + iov[0].iov_len = hdrLen; + } + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + + ssize_t sendResult; + while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { + WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); + return ncclSystemError; + } + if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; + } + + return ncclSuccess; +} + +ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { + return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); +} diff --git a/nvls/ipcsocket.h b/nvls/ipcsocket.h new file mode 100644 index 000000000..ccecde84c --- /dev/null +++ b/nvls/ipcsocket.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. + * + * See COPYRIGHT for license information + */ + +#ifndef NCCL_IPCSOCKET_H +#define NCCL_IPCSOCKET_H + +#include "nccl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NCCL_IPC_SOCKNAME_LEN 64 + +struct ncclIpcSocket { + int fd; + char socketName[NCCL_IPC_SOCKNAME_LEN]; + volatile uint32_t* abortFlag; +}; + +ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); +ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); +ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); + +ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); +ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); + +#endif /* NCCL_IPCSOCKET_H */ diff --git a/nvls/nccl_common.h b/nvls/nccl_common.h new file mode 100644 index 000000000..a37ac203e --- /dev/null +++ b/nvls/nccl_common.h @@ -0,0 +1,33 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEBUG_H_ +#define NCCL_DEBUG_H_ + +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); + +#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now +typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; + +#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* +#define NCCL_ALGO_UNDEF -1 +#define NCCL_ALGO_TREE 0 +#define NCCL_ALGO_RING 1 +#define NCCL_ALGO_COLLNET_DIRECT 2 +#define NCCL_ALGO_COLLNET_CHAIN 3 +#define NCCL_ALGO_NVLS 4 +#define NCCL_ALGO_NVLS_TREE 5 + +#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 +#define NCCL_PROTO_UNDEF -1 +#define NCCL_PROTO_LL 0 +#define NCCL_PROTO_LL128 1 +#define NCCL_PROTO_SIMPLE 2 + +#endif diff --git a/nvls/nccl_net.h b/nvls/nccl_net.h new file mode 100644 index 000000000..9b3e6719f --- /dev/null +++ b/nvls/nccl_net.h @@ -0,0 +1,333 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_H_ +#define NCCL_NET_H_ + +#include "nccl.h" +#include "nccl_common.h" +#include "net_device.h" +#include + +#define NCCL_NET_HANDLE_MAXSIZE 128 + +#define NCCL_PTR_HOST 0x1 +#define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 + +// Maximum number of requests per comm object +#define NCCL_NET_MAX_REQUESTS 32 + +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload +} ncclNetProperties_v7_t; + +typedef ncclNetProperties_v7_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. 
+ // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection + ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); + + // Copy the given mhandle to a dptr in a format usable by this plugin's device code + ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); + + // Notify the plugin that a recv has completed by the device + ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); +} ncclNet_v7_t; + +typedef ncclNet_v7_t ncclNet_t; + +#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7 + +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7 + +#define NCCL_NET_MAX_REQUESTS_V6 8 + +// v6 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] + int speed; // Port speed in Mbps. + int port; // Port number. + float latency; // Network latency + int maxComms; // Maximum number of comms we can create + int maxRecvs; // Maximum number of grouped receives. 
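+
+  // Illustrative sketch only (myGetProperties and its values are placeholders,
+  // not part of this header): a net plugin would fill these fields roughly as
+  //
+  //   static ncclResult_t myGetProperties(int dev, ncclNetProperties_v6_t* props) {
+  //     props->name = (char*)"example0";
+  //     props->pciPath = NULL;                          // unknown PCI location
+  //     props->guid = (uint64_t)dev;
+  //     props->ptrSupport = NCCL_PTR_HOST | NCCL_PTR_CUDA;
+  //     props->speed = 100000;                          // 100 Gbit/s, in Mbps
+  //     props->port = 1;
+  //     props->latency = 0;
+  //     props->maxComms = 1024;
+  //     props->maxRecvs = 1;
+  //     return ncclSuccess;
+  //   }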
+} ncclNetProperties_v6_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. 
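+  // (Typical flow, sketched for illustration: every rank calls listen(), the
+  // resulting handles are exchanged out of band, and each rank then passes the
+  // full handle array to connect() below to obtain its collective comm.)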
+ ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v7_t; + +typedef ncclCollNet_v7_t ncclCollNet_t; + +// v6 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. 
+ // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
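+  // (Illustrative caller pattern: post transfers with isend()/irecv(), treat a
+  // NULL request as "retry later", and poll test(request, &done, sizes) until
+  // done is non-zero before reusing the buffers.)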
+ ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +#endif // end include guard diff --git a/nvls/nccl_tuner.h b/nvls/nccl_tuner.h new file mode 100644 index 000000000..b4a696e38 --- /dev/null +++ b/nvls/nccl_tuner.h @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TUNER_H_ +#define NCCL_TUNER_H_ + +#include "nccl.h" +#include "nccl_common.h" + +// API to be implemented by external tuner +typedef struct { + // Name of the tuner + const char* name; + + // Initializes tuner states. + // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. + // nNodes: number of nodes in current communicator. 
+ // logFunction: a logFunction can be useful to integrate logging together with NCCL core. + ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); + + // Gets info (algo, protocol, number of ctas and threads) for a given collective. + // Inputs: + // - collType: collective type , e.g., allreduce, allgather… + // - nBytes: collective size in bytes + // - collNetTypeSupport: whether collnet supports this type + // - nvlsTypeSupport: whether nvlink sharp supports this time + // - numPipeOps: number of operations in the group + // + // Outputs: + // - algorithm: selected algorithm to be used for the given collective + // - protocol: selected protocol to be used for the given collective + // - nChannels: number of channels (hence SMs) to be used. + // + // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the + // default tuning for the given collective. + // Also, the plugin is allowed to not set any output, or set only the + // algorithm and protocol, but not only the algorithm or only the protocol. + // Unset fields will be set automatically by NCCL. + ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, + int collNetSupport, int nvlsSupport, int numPipeOps, + int *algorithm, int *protocol, int* nChannels); + + // Terminates the plugin and cleans up any resources that the plugin allocated. + ncclResult_t (*destroy)(); +} ncclTuner_v1_t; + +typedef ncclTuner_v1_t ncclTuner_t; + +#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" + +#endif diff --git a/nvls/net.h b/nvls/net.h new file mode 100644 index 000000000..b5df58968 --- /dev/null +++ b/nvls/net.h @@ -0,0 +1,27 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_NET_H_ +#define NCCL_INT_NET_H_ + +#include "nccl.h" +#include "nccl_net.h" +#include "comm.h" +#include "checks.h" + +typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; + +ncclResult_t ncclNetPluginInit(); +ncclResult_t ncclNetInit(struct ncclComm* comm); +int ncclNetVersion(struct ncclComm* comm); + +// Test whether the current GPU support GPU Direct RDMA. +ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); + +extern ncclNet_t ncclNetIb; +extern ncclNet_t ncclNetSocket; + +#endif diff --git a/nvls/net_device.h b/nvls/net_device.h new file mode 100644 index 000000000..8f7c0d6e1 --- /dev/null +++ b/nvls/net_device.h @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NET_DEVICE_H_ +#define NCCL_NET_DEVICE_H_ + +#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 +#define NCCL_NET_MTU_SIZE 4096 + +// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin +// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version. 
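+// (For example, a plugin offering the unpack offload would report
+// props.netDeviceType = NCCL_NET_DEVICE_UNPACK and props.netDeviceVersion =
+// NCCL_NET_DEVICE_UNPACK_VERSION from getProperties(); with any other version
+// NCCL cannot use the device offload path for that plugin.)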
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 + +typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; + +typedef struct { + ncclNetDeviceType netDeviceType; // Network offload type + int netDeviceVersion; // Version number for network offload + void* handle; + size_t size; + int needsProxyProgress; +} ncclNetDeviceHandle_v7_t; + +typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; + +#endif diff --git a/nvls/nvmlwrap.h b/nvls/nvmlwrap.h new file mode 100644 index 000000000..2ab8e3a2b --- /dev/null +++ b/nvls/nvmlwrap.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NVMLWRAP_H_ +#define NCCL_NVMLWRAP_H_ + +#include "nccl.h" + +//#define NCCL_NVML_DIRECT 1 +#ifndef NCCL_NVML_DIRECT +#define NCCL_NVML_DIRECT 0 +#endif + +#if NCCL_NVML_DIRECT +#include "nvml.h" +#else +// Dynamically handle dependencies on NVML + +/* Extracted from nvml.h */ +typedef struct nvmlDevice_st* nvmlDevice_t; +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 + +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +typedef enum nvmlReturn_enum +{ + NVML_SUCCESS = 0, //!< The operation was successful + NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() + NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid + NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device + NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation + NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting + NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful + NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough + NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached + NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded + NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed + NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU + NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded + NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function + NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted + NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible + NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again + NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups + NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18, 
//!< RM detects a driver/library version mismatch + NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use + NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred +} nvmlReturn_t; + +typedef struct nvmlPciInfo_st +{ + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + // NVIDIA reserved for internal use only + unsigned int reserved0; + unsigned int reserved1; + unsigned int reserved2; + unsigned int reserved3; +} nvmlPciInfo_t; + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +} nvmlGpuP2PCapsIndex_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Field Identifiers. + * + * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. + */ + +/* NVLink Speed */ +#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links +#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device + +/** + * Remote device NVLink ID + * + * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. 
+ */ +#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID + +/** + * NVSwitch: connected NVLink count + */ +#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch + +#define NVML_FI_DEV_NVLINK_GET_SPEED 164 +#define NVML_FI_DEV_NVLINK_GET_STATE 165 +#define NVML_FI_DEV_NVLINK_GET_VERSION 166 + +#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device +#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE +#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links + +#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above + +/** + * Information for a Field Value Sample + */ +typedef struct nvmlFieldValue_st +{ + unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. + unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. + long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 + long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. + nvmlValueType_t valueType; //!< Type of the value stored in value + nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS + nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS +} nvmlFieldValue_t; + +/* End of nvml.h */ +#endif // NCCL_NVML_DIRECT + +constexpr int ncclNvmlMaxDevices = 32; +struct ncclNvmlDeviceInfo { + nvmlDevice_t handle; + int computeCapabilityMajor, computeCapabilityMinor; +}; +struct ncclNvmlDevicePairInfo { + nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; +}; +extern int ncclNvmlDeviceCount; +extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; +extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; + +// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. +// Outsiders need only call it if they want to inspect the ncclNvml global +// tables above. 
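+//
+// For example (illustrative only):
+//
+//   NCCLCHECK(ncclNvmlEnsureInitialized());
+//   for (int i = 0; i < ncclNvmlDeviceCount; i++) {
+//     INFO(NCCL_INIT, "dev %d: sm_%d%d", i,
+//          ncclNvmlDevices[i].computeCapabilityMajor,
+//          ncclNvmlDevices[i].computeCapabilityMinor);
+//   }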
+ncclResult_t ncclNvmlEnsureInitialized(); + +ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); +ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); +ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); +ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); +ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); +ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); +ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); +ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); + +#endif // End include guard diff --git a/nvls/nvtx.h b/nvls/nvtx.h new file mode 100644 index 000000000..ab32ef27f --- /dev/null +++ b/nvls/nvtx.h @@ -0,0 +1,85 @@ +/************************************************************************* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_NVTX_H_ +#define NCCL_NVTX_H_ + +#include "nvtx3/nvtx3.hpp" + +#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) +#define NVTX3_CONSTEXPR_IF_CPP14 constexpr +#else +#define NVTX3_CONSTEXPR_IF_CPP14 +#endif + +// Define all NCCL-provided static schema IDs here (avoid duplicates). +#define NVTX_SID_CommInitRank 0 +#define NVTX_SID_CommInitAll 1 +#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank +#define NVTX_SID_AllGather 4 +#define NVTX_SID_AllReduce 5 +#define NVTX_SID_Broadcast 6 +#define NVTX_SID_ReduceScatter 7 +#define NVTX_SID_Reduce 8 +#define NVTX_SID_Send 9 +#define NVTX_SID_Recv 10 + +// Define static schema ID for the reduction operation. 
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + +extern const nvtxDomainHandle_t ncclNvtxDomainHandle; + +struct nccl_domain{static constexpr char const* name{"NCCL"};}; + +class payload_schema { + public: + explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept + { + schema_attr.name = schemaName; + schema_attr.entries = entries; + schema_attr.numEntries = numEntries; + schema_attr.schemaId = schemaId; + nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); + } + + payload_schema() = delete; + ~payload_schema() = default; + payload_schema(payload_schema const&) = default; + payload_schema& operator=(payload_schema const&) = default; + payload_schema(payload_schema&&) = default; + payload_schema& operator=(payload_schema&&) = default; + + private: + nvtxPayloadSchemaAttr_t schema_attr{ + NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | + NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | + NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | + NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, + nullptr, + NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, + NVTX_PAYLOAD_SCHEMA_FLAG_NONE, + nullptr, 0, 0, 0}; +}; + +// Create NVTX push/pop range with parameters +// @param name of the operation (see `NVTX_SID_*`) +// @param N schema name +// @param S schema (entries) +// @param P payload (struct) +#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ + static const payload_schema schema{S, std::extent::value, \ + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ + static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ + nvtxPayloadData_t nvtx3_bpl__[] = { \ + {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ + ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ + ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; + +extern void initNvtxRegisteredEnums(); + +#endif diff --git a/nvls/p2p.h b/nvls/p2p.h new file mode 100644 index 000000000..6ffba4b0e --- /dev/null +++ b/nvls/p2p.h @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include + +#ifndef NCCL_P2P_H_ +#define NCCL_P2P_H_ + +#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR + +typedef struct { + uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support +} ncclCuDesc; + +typedef union { + // Legacy CUDA IPC + cudaIpcMemHandle_t devIpc; + // cuMem API support + ncclCuDesc cuDesc; +} ncclIpcDesc; + +ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); +ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); +ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); + +#endif diff --git a/nvls/param.h b/nvls/param.h new file mode 100644 index 000000000..963da9d17 --- /dev/null +++ b/nvls/param.h @@ -0,0 +1,30 @@ +/************************************************************************* + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PARAM_H_ +#define NCCL_PARAM_H_ + +#include + +const char* userHomeDir(); +void setEnvFile(const char* fileName); +void initEnv(); +const char *ncclGetEnv(const char *name); + +void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); + +#define NCCL_PARAM(name, env, deftVal) \ + int64_t ncclParam##name() { \ + constexpr int64_t uninitialized = INT64_MIN; \ + static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ + static int64_t cache = uninitialized; \ + if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ + ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ + } \ + return cache; \ + } + +#endif diff --git a/nvls/profiler.h b/nvls/profiler.h new file mode 100644 index 000000000..103af99ad --- /dev/null +++ b/nvls/profiler.h @@ -0,0 +1,37 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROFILER_H_ +#define NCCL_PROFILER_H_ + +#include "proxy.h" + +enum ncclProxyProfileState { + ncclProxyProfileBegin = 0, + + ncclProxyProfileSendGPUWait = 1, + ncclProxyProfileSendWait = 2, + + ncclProxyProfileRecvWait = 1, + ncclProxyProfileRecvFlushWait = 2, + ncclProxyProfileRecvGPUWait = 3, + + ncclProxyProfileEnd = 4, + + ncclProxyProfileSleep = 8, + ncclProxyProfileWakeup = 9, + + ncclProxyProfileIdle = 16, + ncclProxyProfileActive = 17, + + ncclProxyProfileAppend = 24, + ncclProxyProfileAppendEnd = 25 +}; + +ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); +void ncclProfilingDump(); + +#endif diff --git a/nvls/proxy.h b/nvls/proxy.h new file mode 100644 index 000000000..8093c0ce6 --- /dev/null +++ b/nvls/proxy.h @@ -0,0 +1,296 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_PROXY_H_ +#define NCCL_PROXY_H_ + +#include "device.h" +#include "info.h" +#include "socket.h" +#include "ipcsocket.h" +#include "nccl_net.h" +#include +#include "shm.h" +#include "p2p.h" + +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); + +#define NCCL_PROXY_MAX_SUBS MAXCHANNELS +static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); + +struct ncclProxyOp { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int root; + int next; + + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; + uint8_t protocol; + + union { + uint64_t unused; + // For use by enqueue.cc + struct ncclProxyOp *enqNext; + }; +}; +static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); + +struct ncclProxySubArgs { + struct ncclProxyConnection* connection; + int channelId; + int nsteps; + ssize_t nbytes; + int peer; + + int groupSize; // Number of consecutive sub operations sharing the same recvComm + uint64_t base; + uint64_t posted; + uint64_t received; + uint64_t flushed; + uint64_t transmitted; + uint64_t done; + uint64_t end; + void* requests[NCCL_STEPS]; + void* profilingEvents[NCCL_STEPS]; + void* recvRequestsCache[NCCL_STEPS]; + int recvRequestsSubCount; +}; + +struct ncclProxyArgs { + struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; + proxyProgressFunc_t progress; + int nsubs; + int done; + uint64_t opCount; + int sliceSteps; + int chunkSteps; + int chunkSize; + uint8_t /*ncclDataType_t*/ dtype; + uint8_t /*ncclDevRedOp_t*/ redOp; + uint8_t /*ncclPattern_t*/ pattern; + uint8_t protocol; + int state; + char* sharedBuff[NCCL_STEPS]; + int sharedSize[NCCL_STEPS]; + + int idle; + + // Element linking + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; + struct ncclProxyArgs** proxyAppendPtr; +}; +#define NCCL_MAX_NETDEVS 128 + +// ProxyOps are used to communicate between main thread and service thread +// Make sure we have enough to store two full rounds of operations on all channels. +// Otherwise we'd be unable to post half of them to free new elements. 
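+// For a rough sense of scale: taking MAXCHANNELS = 32 and NCCL_MAX_WORK_ELEMENTS_P2P = 8
+// as illustrative values (the real definitions live in device.h), MAX_OPS_PER_PEER below
+// works out to 2*32*8 = 512, so the per-node ops[] pool holds 512 * NCCL_MAX_LOCAL_RANKS(64)
+// = 32768 entries; at the 64-byte ncclProxyOp size enforced by the static_assert above,
+// that is a 2 MiB shared-memory pool.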
+#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) +#define NCCL_MAX_LOCAL_RANKS 64 +struct ncclProxyOpsPool { + struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; + volatile int nextOps; + volatile int nextOpsEnd; + volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +struct ncclProxyOps { + ncclProxyOpsPool* pool; + ncclShmHandle_t handle; + int count; + int freeOp; + int nextOps; + int nextOpsEnd; +}; + +struct ncclProxySharedP2p { + int refcount; + int size; + char* cudaBuff; + char* hostBuff; + // CUDA IPC + ncclIpcDesc ipcDesc; + struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv +}; + +struct ncclProxyPeer { + struct ncclProxySharedP2p send; + struct ncclProxySharedP2p recv; +}; + +struct ncclSharedNetComms { + void* sendComm[MAXCHANNELS]; + void* recvComm[MAXCHANNELS]; + int sendRefCount[MAXCHANNELS]; + int recvRefCount[MAXCHANNELS]; +}; + +struct ncclProxyPool; +struct ncclProxyProgressState { + // Used by main threads to send work to progress thread + struct ncclProxyOpsPool* opsPool; + ncclShmHandle_t handle; + char opsPoolShmSuffix[6]; + + pthread_t thread; + volatile int stop; + struct ncclProxyPeer** localPeers; + struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; + struct ncclProxyArgs* active; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; + int nextOps; +}; + +// Expected proxy response fifo +struct ncclExpectedProxyResponse { + void* opId; + int respSize; + bool done; + void* respBuff; + ncclResult_t res; + struct ncclExpectedProxyResponse* next; +}; + +struct ncclProxyAsyncOp { + int type; + struct ncclProxyConnection* connection; + int reqSize, respSize; + char *reqBuff, *respBuff; + void* opId; + ncclProxyAsyncOp* next; +}; + +struct ncclProxyLocalPeer { + struct ncclSocket sock; + int tpRank; + int tpLocalRank; + ncclProxyAsyncOp* asyncOps; + int asyncOpCounter; +}; + +// Common response header for all proxyOps +// We pack this into a struct to reduce the number of blocking send and recv calls +struct ncclProxyRpcResponseHeader { + void* opId; + ncclResult_t res; + int respSize; +}; + +struct ncclProxyState { + int refCount; + int tpRank; + int tpnRanks; + int tpLocalnRanks; + int cudaDev; + int p2pnChannels; + int p2pChunkSize; + int nChannels; + int buffSizes[NCCL_NUM_PROTOCOLS]; + bool allocP2pNetLLBuffers; + bool dmaBufSupport; + ncclNet_t* ncclNet; + ncclCollNet_t* ncclCollNet; + volatile uint32_t* abortFlag; + // Service thread + pthread_t thread; + struct ncclSocket* listenSock; + int stop; + CUcontext cudaCtx; + ncclResult_t asyncResult; + + // Used by main thread + union ncclSocketAddress* peerAddresses; + struct ncclSocket* peerSocks; + struct ncclProxyOps* proxyOps; + void** sharedDevMems; + struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) + + // Progress thread + struct ncclProxyProgressState progressState; + + // Queue of expected responses from the proxy + struct ncclExpectedProxyResponse* expectedResponses; +}; + +enum proxyConnectState { + connUninitialized = 0, + connInitialized = 1, + connSharedInitialized = 2, + connSetupDone = 3, + connConnected = 4, + numConnStates = 5 +}; + +struct ncclProxyConnection { + int send, transport, shared; + int tpLocalRank, sameProcess; + struct ncclSocket* sock; + struct ncclTransportComm* tcomm; + struct ncclProxyArgs *proxyAppend; + struct ncclProxyArgs **proxyAppendPtr; + void* transportResources; + ncclNetDeviceHandle_t* netDeviceHandle; + void* mhandles[NCCL_NUM_PROTOCOLS]; + 
proxyConnectState state; + struct ncclCollNetSharedRes* collNet; + int needsProxyProgress; +}; + +typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); + +enum proxyMode { + proxyRing = 0, + proxyFrom = 1, + proxyTo = 2 +}; + +ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); +ncclResult_t ncclProxyStart(struct ncclComm* comm); +ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); +ncclResult_t ncclProxyCreate(struct ncclComm* comm); +ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); +enum ncclProxyMsgType { + ncclProxyMsgInit = 1, + ncclProxyMsgSharedInit = 2, + ncclProxyMsgSetup = 3, + ncclProxyMsgConnect = 4, + ncclProxyMsgStart = 5, + ncclProxyMsgClose = 6, + ncclProxyMsgAbort = 7, + ncclProxyMsgStop = 8, + ncclProxyMsgGetFd = 9, // cuMem API support (UDS) +}; + +// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types +// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of +// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed +ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); + +// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received +ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); +ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); + +ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd); + +ncclResult_t ncclProxyStop(struct ncclComm* comm); +ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); +ncclResult_t ncclProxyDestroy(struct ncclComm* comm); +#endif diff --git a/nvls/shm.h b/nvls/shm.h new file mode 100644 index 000000000..e75caa6a6 --- /dev/null +++ b/nvls/shm.h @@ -0,0 +1,25 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SHM_H_ +#define NCCL_SHM_H_ + +#include "nccl.h" + +typedef void* ncclShmHandle_t; +ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); +ncclResult_t ncclShmClose(ncclShmHandle_t handle); +ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); + +struct ncclShmemCollBuff { + volatile size_t *cnt[2]; + volatile void *ptr[2]; + int round; +}; + +ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); + +#endif diff --git a/nvls/socket.h b/nvls/socket.h new file mode 100644 index 000000000..9e5137289 --- /dev/null +++ b/nvls/socket.h @@ -0,0 +1,97 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_SOCKET_H_ +#define NCCL_SOCKET_H_ + +#include "nccl.h" +#include +#include +#include +#include +#include +#include + +#define MAX_IFS 16 +#define MAX_IF_NAME_SIZE 16 +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) +#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) +#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL + +/* Common socket address storage structure for IPv4/IPv6 */ +union ncclSocketAddress { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; + +enum ncclSocketState { + ncclSocketStateNone = 0, + ncclSocketStateInitialized = 1, + ncclSocketStateAccepting = 2, + ncclSocketStateAccepted = 3, + ncclSocketStateConnecting = 4, + ncclSocketStateConnectPolling = 5, + ncclSocketStateConnected = 6, + ncclSocketStateReady = 7, + ncclSocketStateClosed = 8, + ncclSocketStateError = 9, + ncclSocketStateNum = 10 +}; + +enum ncclSocketType { + ncclSocketTypeUnknown = 0, + ncclSocketTypeBootstrap = 1, + ncclSocketTypeProxy = 2, + ncclSocketTypeNetSocket = 3, + ncclSocketTypeNetIb = 4 +}; + +struct ncclSocket { + int fd; + int acceptFd; + int timedOutRetries; + int refusedRetries; + union ncclSocketAddress addr; + volatile uint32_t* abortFlag; + int asyncFlag; + enum ncclSocketState state; + int salen; + uint64_t magic; + enum ncclSocketType type; +}; + +const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); +ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); +int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); +int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); + +// Initialize a socket +ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); +// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call +ncclResult_t ncclSocketListen(struct ncclSocket* sock); +ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); +// Connect to sock->addr. sock->fd is set after a successful call. +ncclResult_t ncclSocketConnect(struct ncclSocket* sock); +// Return socket connection state. +ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); +// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
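+// Putting these declarations together, a minimal usage sketch (illustrative only; error
+// handling, abort flags and address setup are omitted, and the variable names are invented):
+//   struct ncclSocket listenSock, sock, peerSock;
+//   ncclSocketInit(&listenSock, &ifAddr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap);
+//   ncclSocketListen(&listenSock);                 // server side: listenSock.fd set on success
+//   ncclSocketInit(&sock, &listenSock.addr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap);
+//   ncclSocketConnect(&sock);                      // client side: connect to the listen address
+//   ncclSocketAccept(&peerSock, &listenSock);      // server side, declared just below
+//   ncclSocketSend(&sock, buf, size);              // client -> server
+//   ncclSocketRecv(&peerSock, buf, size);          // server <- client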
+ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); +ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); +ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); + +#define NCCL_SOCKET_SEND 0 +#define NCCL_SOCKET_RECV 1 + +ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); +ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); +ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); +ncclResult_t ncclSocketClose(struct ncclSocket* sock); +#endif diff --git a/nvls/strongstream.h b/nvls/strongstream.h new file mode 100644 index 000000000..0984dfe57 --- /dev/null +++ b/nvls/strongstream.h @@ -0,0 +1,140 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_STRONGSTREAM_H_ +#define NCCL_STRONGSTREAM_H_ + +#include "nccl.h" +#include "checks.h" + +#include + +/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes + * easily. + */ +struct ncclCudaGraph { +#if CUDART_VERSION >= 11030 + cudaGraph_t graph; + unsigned long long graphId; +#endif +}; + +inline struct ncclCudaGraph ncclCudaGraphNone() { + struct ncclCudaGraph tmp; + #if CUDART_VERSION >= 11030 + tmp.graph = nullptr; + tmp.graphId = ULLONG_MAX; + #endif + return tmp; +} + +inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { + #if CUDART_VERSION >= 11030 + return graph.graph != nullptr; + #else + return false; + #endif +} + +inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { + #if CUDART_VERSION >= 11030 + return a.graphId == b.graphId; + #else + return true; + #endif +} + +ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); +ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); + +/* ncclStrongStream: An abstraction over CUDA streams that do not lose their + * identity while being captured. Regular streams have the deficiency that the + * captured form of a stream in one graph launch has no relation to the + * uncaptured stream or to the captured form in other graph launches. This makes + * streams unfit for the use of serializing access to a persistent resource. + * Strong streams have been introduced to address this need. + * + * - All updates to a strong stream must be enclosed by a Acquire/Release pair. + * + * - The Acquire, Release, and all updates take a ncclCudaGraph parameter + * indicating the currently capturing graph (or none). This parameter must be + * the same for the entire sequence of {Acquire; ...; Release}. + * + * - An {Acquire; ...; Release} sequence must not be concurrent with any + * other operations against the strong stream including graph launches which + * reference this stream. + */ +struct ncclStrongStream; + +ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); +ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); + +// Acquire-fence the strong stream. 
+ncclResult_t ncclStrongStreamAcquire( + struct ncclCudaGraph graph, struct ncclStrongStream* ss +); + +// Acquire-fence the strong stream assuming no graph is capturing. This permits +// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA +// calls. Strong stream still must be released via: +// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); +ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); + +// Release-fence of the strong stream. +ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); + +// Add a host launch to the stream. +ncclResult_t ncclStrongStreamLaunchHost( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + cudaHostFn_t fn, void* arg +); +// Add a kernel launch to the stream. +ncclResult_t ncclStrongStreamLaunchKernel( + struct ncclCudaGraph graph, struct ncclStrongStream* ss, + void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes +); + +// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. +// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus +// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the +// implementation to induce few graph dependencies. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false +); +// `b` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false +); +// `a` must be capturing within `graph`. +ncclResult_t ncclStrongStreamWaitStream( + struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false +); + +// Synchrnoization does not need the strong stream to be acquired. +ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclStrongStreamGraph; // internal to ncclStrongStream + +struct ncclStrongStream { + // Used when not graph capturing. + cudaStream_t cudaStream; +#if CUDART_VERSION >= 11030 + // The event used to establish order between graphs and streams. During acquire + // this event is waited on, during release it is recorded to. + cudaEvent_t serialEvent; + // This stream ever appeared in a graph capture. + bool everCaptured; + // Tracks whether serialEvent needs to be recorded to upon Release(). + bool serialEventNeedsRecord; + struct ncclStrongStreamGraph* graphHead; +#else + cudaEvent_t scratchEvent; +#endif +}; + +#endif diff --git a/nvls/test.cu b/nvls/test.cu new file mode 100644 index 000000000..6e4d39bc8 --- /dev/null +++ b/nvls/test.cu @@ -0,0 +1,172 @@ +#include +#include +#include +#include +// #include +#include +#include +#include "ipcsocket.cc" + +#define CUCHECK(cmd) do { \ + auto err = cmd; \ + if( err != 0 ) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + } \ +} while(false) + +//AR kernel snippet for sm_90 only + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ + : "memory"); +//specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + +__global__ void testing2(float* uc_ptr){ + uc_ptr[0] = 1.0; + printf("ptr -> %f\n", uc_ptr[0]); +} + + +__global__ void testing(float* mc_ptr, int numlines, int myrank, int RANKS){ + //for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + //line is assumed to be 16B 4 ints of 8 halves + const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); + const int end_elem = max(start_elem, numlines); + const int lineoffset = (blockIdx.x * blockDim.x + threadIdx.x) * 4; + const int loop_step0 = (blockDim.x * gridDim.x) * 4; + __syncthreads(); + printf("start %d, end %d step %d\n", start_elem, end_elem, loop_step0); + for (int line = start_elem; line < end_elem; line += loop_step0) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); + printf("val %f\n", *(float*)&(val.x)); + MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); + } + __syncthreads(); +} + +int main(){ + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + CUresult res; + + size_t size = 1024*1024*512*3; + CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + gran = 0; + minGran = 0; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + // printf("gran = %lld, minGrad = %lld\n", gran, minGran); + size_t mcSize = ((size+gran-1)/gran)*gran; + mcProp.size = mcSize; + + CUmemGenericAllocationHandle handle; + //only one rank creates the multicast object + if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd, peerfd; + fd = 0; + peerfd = 0; + if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + //some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + volatile uint32_t abortFlag = 0; + struct ncclIpcSocket ipcSock = { 0 }; + uint64_t opId=0xdeadcafebeef; + // ncclResult_t ret = ncclSuccess; + + ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); + MPI_Barrier(MPI_COMM_WORLD); + if(!myrank) { + for(int p=1;p>>((float*)mc_va); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<<1, 1>>>((float*)mc_va, 1, myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +} +//........ 
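+// A hedged sketch of the CUDA 12.4 alternative mentioned in the comments above: with the
+// fabric handle type, the exported handle is plain bytes, so the UDS exchange can collapse
+// into a single MPI broadcast. CU_MEM_HANDLE_TYPE_FABRIC and CUmemFabricHandle are
+// assumptions taken from the 12.4 driver API and are not exercised anywhere in this patch:
+//   CUmemFabricHandle fh = {};
+//   mcProp.handleTypes = CU_MEM_HANDLE_TYPE_FABRIC;  // set before cuMulticastCreate
+//   if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fh, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0));
+//   MPI_Bcast(&fh, sizeof(fh), MPI_BYTE, 0, MPI_COMM_WORLD);
+//   if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, &fh, CU_MEM_HANDLE_TYPE_FABRIC));
+//   // every rank then calls cuMulticastAddDevice()/cuMulticastBindMem() exactly as above.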
+ + diff --git a/nvls/test2.cpp b/nvls/test2.cpp new file mode 100644 index 000000000..400d566ae --- /dev/null +++ b/nvls/test2.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include +#include + +#define CUCHECK(cmd) do { \ + auto err = cmd; \ + if( err != 0 ) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + } \ +} while(false) + +int main(){ + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + CUresult res; + + +CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + size_t mcSize = ((size+gran-1)/gran)*gran; + mcProp.size = mcSize; + + //only one rank creates the multicast object + if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd, peerfd; + if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + //some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + volatile uint32_t abortFlag = 0; + struct ncclIpcSocket ipcSock = { 0 }; + uint64_t opId=0xdeadcafebeef; + ncclResult_t ret = ncclSuccess; + + NCCLCHECK(ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag)); + MPI_Barrier(MPI_COMM_WORLD); + if(!myrank) + for(int p=1;p= 900 +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ + "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ + : "memory"); +//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); +#endif + +//for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction +//line is assumed to be 16B 4 ints of 8 halves +const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); +const int end_elem = max(start_elem, numlines); +__syncthreads(); + for (int line = start_elem; line < end_elem; line += loop_step0) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)) + MULTIMEM_ST(val, mc_ptr + (lineoffset + line)) + } +__syncthreads(); + +*/ diff --git a/nvls/timer.h b/nvls/timer.h new file mode 100644 index 000000000..284fec6e0 --- /dev/null +++ b/nvls/timer.h @@ -0,0 +1,60 @@ +/************************************************************************* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TIMER_H_ +#define NCCL_TIMER_H_ +#if ENABLE_TIMER +#include +#include +#include +static double freq = -1; +static void calibrate() { + struct timeval tv; + gettimeofday(&tv, NULL); + uint64_t timeCycles = __rdtsc(); + double time = - tv.tv_sec*1E6 - tv.tv_usec; + uint64_t total = 0ULL; + for (int i=0; i<10000; i++) total += __rdtsc(); + gettimeofday(&tv, NULL); + timeCycles = __rdtsc() - timeCycles; + time += tv.tv_sec*1E6 + tv.tv_usec; + freq = timeCycles/time; +} +static inline double gettime() { + if (freq == -1) calibrate(); + return __rdtsc()/freq; +} +static uint64_t counts[8]; +static double times[8]; +static double startTimes[8]; +#define TIME_START(index) do { \ + counts[index]++; \ + startTimes[index] = gettime(); \ +} while (0); + +#define TIME_STOP(index) do { \ + times[index] += gettime() - startTimes[index]; \ +} while (0); + +#define TIME_CANCEL(index) do { \ + counts[index]--; \ +} while (0); + +#define TIME_PRINT(name) do { \ + printf("%s stats", name); \ + for (int i=0; i<8; i++) { \ + if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ + counts[i] = 0; \ + } \ + printf("\n"); \ +} while (0); +#else +#define TIME_START(index) while(0); +#define TIME_STOP(index) while(0); +#define TIME_CANCEL(index) while(0); +#define TIME_PRINT(name) +#endif +#endif diff --git a/nvls/transport.h b/nvls/transport.h new file mode 100644 index 000000000..27529df5e --- /dev/null +++ b/nvls/transport.h @@ -0,0 +1,128 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TRANSPORT_H_ +#define NCCL_TRANSPORT_H_ + +#include "device.h" +#include "graph.h" +#include "nvmlwrap.h" +#include "core.h" + +#define NTRANSPORTS 4 +#define TRANSPORT_P2P 0 +#define TRANSPORT_SHM 1 +#define TRANSPORT_NET 2 +#define TRANSPORT_COLLNET 3 + +#include "proxy.h" + +extern struct ncclTransport p2pTransport; +extern struct ncclTransport shmTransport; +extern struct ncclTransport netTransport; +extern struct ncclTransport collNetTransport; + +extern struct ncclTransport* ncclTransports[]; + +// Forward declarations +struct ncclRing; +struct ncclConnector; +struct ncclComm; + +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + int gdrSupport; + uint64_t hostHash; + uint64_t pidHash; + dev_t shmDev; + int64_t busId; + struct ncclComm* comm; + int cudaCompCap; +}; + +#define CONNECT_SIZE 128 +struct ncclConnect { + char data[CONNECT_SIZE]; +}; + +#if CUDART_VERSION >= 12010 + +#define NVLS_HANDLE_SIZE 64 +struct ncclNvlsSharedRes { + int refCount; + CUmulticastObjectProp properties; + CUmemAccessDesc accessDesc; + int dev; + size_t size; + size_t granularity; + CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer + char* mcBuff; // Multicast NVLS buffer address + CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer + char* ucBuff; // Unicast NVLS buffer address + char shareableHandle[NVLS_HANDLE_SIZE]; + size_t ucGran; + int nChannels; + struct ncclShmemCollBuff nvlsShmem; + void *nvlsShmemHandle; +}; + +#endif /* CUDART_VERSION >= 12010 */ + +struct ncclCollNetSharedRes { + int refCount; + int size; + char* cudaBuff; + char* hostBuff; + struct ncclProxyArgs* 
proxyAppend[2*NCCL_MAX_NETDEVS]; + void* resources; + int nChannels; + size_t buffSize; +}; + +struct ncclTransportComm { + ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); + ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); + ncclResult_t (*free)(struct ncclConnector*); + ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); + ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); + ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); + ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); +}; + +struct ncclTransport { + const char name[8]; + ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); + struct ncclTransportComm send; + struct ncclTransportComm recv; +}; + +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); + +// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange +#define USE_POSIX_FD 1 + +#if USE_POSIX_FD +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR +#else +#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE +#endif + +ncclResult_t ncclNvlsInit(struct ncclComm* comm); +ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); +ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); +ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); +ncclResult_t ncclNvlsFree(struct ncclComm* comm); + +enum { collNetRecv=0, collNetSend=1 }; +int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); +ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); +ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); +#endif diff --git a/nvls/trees.h b/nvls/trees.h new file mode 100644 index 000000000..ded84a667 --- /dev/null +++ b/nvls/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); + +#endif diff --git a/nvls/tuner.h b/nvls/tuner.h new file mode 100644 index 000000000..d8b275017 --- /dev/null +++ b/nvls/tuner.h @@ -0,0 +1,22 @@ +/************************************************************************* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_TUNER_H_ +#define NCCL_INT_TUNER_H_ + +#include "nccl_tuner.h" + +// Tuning plugin to override NCCL's default algorithm/protocol tuning. + +// Attempts to load NCCL tuner from environmental variable. +// Returns ncclSuccess if the correct tuner symbol has been found and +// successully loaded. Otherwise returns an error and also logs the error. +ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); + +// Cleans up NCCL tuner plugin. +ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); +#endif diff --git a/nvls/utils.h b/nvls/utils.h new file mode 100644 index 000000000..60f6efb5f --- /dev/null +++ b/nvls/utils.h @@ -0,0 +1,524 @@ +/************************************************************************* + * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_UTILS_H_ +#define NCCL_UTILS_H_ + +#include "nccl.h" +#include "alloc.h" +#include "checks.h" +#include +#include +#include +#include +#include + +int ncclCudaCompCap(); + +// PCI Bus ID <-> int64 conversion functions +ncclResult_t int64ToBusId(int64_t id, char* busId); +ncclResult_t busIdToInt64(const char* busId, int64_t* id); + +ncclResult_t getBusId(int cudaDev, int64_t *busId); + +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); +uint64_t getHash(const char* string, int n); +uint64_t getHostHash(); +uint64_t getPidHash(); +ncclResult_t getRandomData(void* buffer, size_t bytes); + +struct netIf { + char prefix[64]; + int port; +}; + +int parseStringList(const char* string, struct netIf* ifList, int maxList); +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); + +static long log2i(long n) { + long l = 0; + while (n>>=1) l++; + return l; +} + +inline uint64_t clockNano() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; +} + +/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else + * return -1 */ +inline ncclResult_t getRandomData(void* buffer, size_t bytes) { + ncclResult_t ret = ncclSuccess; + if (bytes > 0) { + const size_t one = 1UL; + FILE* fp = fopen("/dev/urandom", "r"); + if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; + if (fp) fclose(fp); + } + return ret; +} + +//////////////////////////////////////////////////////////////////////////////// + +template +inline void ncclAtomicRefCountIncrement(Int* refs) { + __atomic_fetch_add(refs, 1, 
__ATOMIC_RELAXED); +} + +template +inline Int ncclAtomicRefCountDecrement(Int* refs) { + return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); +} + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that + * granularity of LIFO is not per object, instead frames containing many objects + * are pushed and popped. Therefor deallocation is extremely cheap since its + * done at the frame granularity. + * + * The initial state of the stack is with one frame, the "nil" frame, which + * cannot be popped. Therefor objects allocated in the nil frame cannot be + * deallocated sooner than stack destruction. + */ +struct ncclMemoryStack; + +void ncclMemoryStackConstruct(struct ncclMemoryStack* me); +void ncclMemoryStackDestruct(struct ncclMemoryStack* me); +void ncclMemoryStackPush(struct ncclMemoryStack* me); +void ncclMemoryStackPop(struct ncclMemoryStack* me); +template +T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclMemoryPool: A free-list of same-sized allocations. It is an invalid for + * a pool instance to ever hold objects whose type have differing + * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by + * a backing `ncclMemoryStack` passed during Alloc(). If memory + * backing any currently held object is deallocated then it is an error to do + * anything other than reconstruct it, after which it is a valid empty pool. + */ +struct ncclMemoryPool; + +// Equivalent to zero-initialization +void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); +template +T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); +template +void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); +void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer + * field is given via the `next` template argument. + * + * Example: + * struct Foo { + * struct Foo *next1, *next2; // can be a member of two lists at once + * }; + * ncclIntruQueue list1; + * ncclIntruQueue list2; + */ +template +struct ncclIntruQueue; + +template +void ncclIntruQueueConstruct(ncclIntruQueue *me); +template +bool ncclIntruQueueEmpty(ncclIntruQueue *me); +template +T* ncclIntruQueueHead(ncclIntruQueue *me); +template +void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x); +template +T* ncclIntruQueueDequeue(ncclIntruQueue *me); +template +T* ncclIntruQueueTryDequeue(ncclIntruQueue *me); +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *memPool); + +//////////////////////////////////////////////////////////////////////////////// +/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" + * and "cond" fields are part of the public interface. + */ +struct ncclThreadSignal { + pthread_mutex_t mutex; + pthread_cond_t cond; +}; + +// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); + +void ncclThreadSignalConstruct(struct ncclThreadSignal* me); +void ncclThreadSignalDestruct(struct ncclThreadSignal* me); + +// A convenience instance per-thread. 
+extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc; + +template +void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); +// Enqueue element. Returns true if queue is not abandoned. Even if queue is +// abandoned the element enqueued, so the caller needs to make arrangements for +// the queue to be tended. +template +bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); +// Dequeue all elements at a glance. If there aren't any and `waitSome` is +// true then this call will wait until it can return a non empty list. +template +T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); +// Dequeue all elements and set queue to abandoned state. +template +T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryStack { + struct Hunk { + struct Hunk* above; // reverse stack pointer + size_t size; // size of this allocation (including this header struct) + }; + struct Unhunk { // proxy header for objects allocated out-of-hunk + struct Unhunk* next; + void* obj; + }; + struct Frame { + struct Hunk* hunk; // top of non-empty hunks + uintptr_t bumper, end; // points into top hunk + struct Unhunk* unhunks; + struct Frame* below; + }; + + static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); + static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); + + struct Hunk stub; + struct Frame topFrame; +}; + +inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { + me->stub.above = nullptr; + me->stub.size = 0; + me->topFrame.hunk = &me->stub; + me->topFrame.bumper = 0; + me->topFrame.end = 0; + me->topFrame.unhunks = nullptr; + me->topFrame.below = nullptr; +} + +inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { + uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); + void* obj; + if (__builtin_expect(o + size <= me->topFrame.end, true)) { + me->topFrame.bumper = o + size; + obj = reinterpret_cast(o); + } else { + obj = allocateSpilled(me, size, align); + } + return obj; +} + +template +inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { + void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); + memset(obj, 0, n*sizeof(T)); + return (T*)obj; +} + +inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { + using Frame = ncclMemoryStack::Frame; + Frame tmp = me->topFrame; + Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); + *snapshot = tmp; // C++ struct assignment + me->topFrame.unhunks = nullptr; + me->topFrame.below = snapshot; +} + +inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { + ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; + while (un != nullptr) { + free(un->obj); + un = un->next; + } + me->topFrame = *me->topFrame.below; // C++ struct assignment +} + + +//////////////////////////////////////////////////////////////////////////////// + +struct ncclMemoryPool { + struct Cell { + Cell *next; + }; + struct Cell* head; + struct Cell* tail; // meaningful only when head != nullptr +}; + +inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { + me->head = nullptr; +} + +template +inline T* ncclMemoryPoolAlloc(struct 
ncclMemoryPool* me, struct ncclMemoryStack* backing) { + using Cell = ncclMemoryPool::Cell; + Cell* cell; + if (__builtin_expect(me->head != nullptr, true)) { + cell = me->head; + me->head = cell->next; + } else { + // Use the internal allocate() since it doesn't memset to 0 yet. + size_t cellSize = std::max(sizeof(Cell), sizeof(T)); + size_t cellAlign = std::max(alignof(Cell), alignof(T)); + cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); + } + memset(cell, 0, sizeof(T)); + return reinterpret_cast(cell); +} + +template +inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { + using Cell = ncclMemoryPool::Cell; + Cell* cell = reinterpret_cast(obj); + cell->next = me->head; + if (me->head == nullptr) me->tail = cell; + me->head = cell; +} + +inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { + if (from->head != nullptr) { + from->tail->next = me->head; + if (me->head == nullptr) me->tail = from->tail; + me->head = from->head; + from->head = nullptr; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueue { + T *head, *tail; +}; + +template +inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { + me->head = nullptr; + me->tail = nullptr; +} + +template +inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { + return me->head == nullptr; +} + +template +inline T* ncclIntruQueueHead(ncclIntruQueue *me) { + return me->head; +} + +template +inline T* ncclIntruQueueTail(ncclIntruQueue *me) { + return me->tail; +} + +template +inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { + x->*next = nullptr; + (me->head ? me->tail->*next : me->head) = x; + me->tail = x; +} + +template +inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { + T *ans = me->head; + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + return ans; +} + +template +inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { + T *prev = nullptr; + T *cur = me->head; + bool found = false; + + while (cur) { + if (cur == x) { + found = true; + break; + } + prev = cur; + cur = cur->*next; + } + + if (found) { + if (prev == nullptr) + me->head = cur->*next; + else + prev->*next = cur->*next; + if (cur == me->tail) + me->tail = prev; + } + return found; +} + +template +inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { + T *ans = me->head; + if (ans != nullptr) { + me->head = ans->*next; + if (me->head == nullptr) me->tail = nullptr; + } + return ans; +} + +template +void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { + T *head = me->head; + me->head = nullptr; + me->tail = nullptr; + while (head != nullptr) { + T *tmp = head->*next; + ncclMemoryPoolFree(pool, tmp); + head = tmp; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { + return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; +} + +inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { + pthread_mutex_init(&me->mutex, nullptr); + pthread_cond_init(&me->cond, nullptr); +} + +inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { + pthread_mutex_destroy(&me->mutex); + pthread_cond_destroy(&me->cond); +} + +//////////////////////////////////////////////////////////////////////////////// + +template +struct ncclIntruQueueMpsc { + T* head; + uintptr_t tail; + struct ncclThreadSignal* waiting; +}; + +template +void 
ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { + me->head = nullptr; + me->tail = 0x0; + me->waiting = nullptr; +} + +template +bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { + return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; +} + +template +bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { + __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); + T* prev = reinterpret_cast(utail); + T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); + __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); + if (utail == 0x1) { // waiting + __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting + // This lock/unlock is essential to ensure we don't race ahead of the consumer + // and signal the cond before they begin waiting on it. + struct ncclThreadSignal* waiting = me->waiting; + pthread_mutex_lock(&waiting->mutex); + pthread_mutex_unlock(&waiting->mutex); + pthread_cond_broadcast(&waiting->cond); + } + return utail != 0x2; // not abandoned +} + +template +T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { + T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head == nullptr) { + if (!waitSome) return nullptr; + uint64_t t0 = clockNano(); + bool sleeping = false; + do { + if (clockNano()-t0 >= 10*1000) { // spin for first 10us + struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; + pthread_mutex_lock(&waitSignal->mutex); + uintptr_t expected = sleeping ? 0x1 : 0x0; + uintptr_t desired = 0x1; + me->waiting = waitSignal; // release done by successful compare exchange + if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { + sleeping = true; + pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); + } + pthread_mutex_unlock(&waitSignal->mutex); + } + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + } while (head == nullptr); + } + + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + int spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; +} + +template +T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { + uintptr_t expected = 0x0; + if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { + return nullptr; + } else { + int spins = 0; + T* head; + while (true) { + head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); + if (head != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); + uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); + T* tail = utail <= 0x2 ? 
nullptr : reinterpret_cast(utail); + T *x = head; + while (x != tail) { + T *x1; + spins = 0; + while (true) { + x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); + if (x1 != nullptr) break; + if (++spins == 1024) { spins = 1024-1; sched_yield(); } + } + x = x1; + } + return head; + } +} +#endif From a7b627a7165747cf092d428ddf89790c3e9b9a35 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 22:47:53 +0000 Subject: [PATCH 02/67] a bit cleaned up --- nvls/test.cu | 198 +++++++++++++++++++++++++++------------------------ 1 file changed, 103 insertions(+), 95 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 6e4d39bc8..08a156dfc 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -1,172 +1,180 @@ -#include #include +#include #include #include // #include -#include #include +#include + #include "ipcsocket.cc" -#define CUCHECK(cmd) do { \ - auto err = cmd; \ - if( err != 0 ) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - } \ -} while(false) +#define CUCHECK(cmd) \ + do { \ + auto err = cmd; \ + if (err != 0) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + exit(-1); \ + } \ + } while (false) -//AR kernel snippet for sm_90 only +// AR kernel snippet for sm_90 only -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ - "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ : "memory"); -//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ : "memory"); -__global__ void testing2(float* uc_ptr){ - uc_ptr[0] = 1.0; - printf("ptr -> %f\n", uc_ptr[0]); -} +__global__ void testing2(float* uc_ptr) { uc_ptr[0] = 1.0; } +#define UNROLL 8 +__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { + // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + // line is assumed to be 16B 4 ints of 8 halves + int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; + int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; -__global__ void testing(float* mc_ptr, int numlines, int myrank, int RANKS){ - //for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction - //line is assumed to be 16B 4 ints of 8 halves - const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); - const int end_elem = max(start_elem, numlines); - const int lineoffset = (blockIdx.x * blockDim.x + threadIdx.x) * 4; - const int loop_step0 = (blockDim.x * gridDim.x) * 4; - __syncthreads(); - printf("start %d, end %d step %d\n", start_elem, end_elem, loop_step0); - for (int line = start_elem; line < end_elem; line += loop_step0) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); - printf("val %f\n", *(float*)&(val.x)); - MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); - } - __syncthreads(); + int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } } -int main(){ +int main() { int myrank, nranks; MPI_Init(NULL, NULL); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); MPI_Comm_size(MPI_COMM_WORLD, &nranks); - + cudaSetDevice(myrank); CUresult res; - size_t size = 1024*1024*512*3; + size_t size = 1024 * 1024 * 512; CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - + CUmulticastObjectProp mcProp = {}; mcProp.numDevices = nranks; mcProp.size = size; mcProp.handleTypes = handleType; - + size_t minGran, gran; gran = 0; minGran = 0; CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - // printf("gran = %lld, minGrad = %lld\n", gran, minGran); - size_t mcSize = ((size+gran-1)/gran)*gran; + if (!myrank) printf("gran = %lu, minGrad = %lu\n", gran, minGran); + size_t mcSize = ((size + gran - 1) / gran) * gran; mcProp.size = mcSize; CUmemGenericAllocationHandle handle; - //only one rank creates the multicast object - if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - + // only one rank creates the multicast object + if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + int fd, peerfd; fd = 0; peerfd = 0; - if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - //some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - + if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + // some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the + // exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = { 0 }; - uint64_t opId=0xdeadcafebeef; + struct ncclIpcSocket ipcSock = {0}; + uint64_t opId = 0xdeadcafebeef; // ncclResult_t ret = ncclSuccess; ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); MPI_Barrier(MPI_COMM_WORLD); - if(!myrank) { - for(int p=1;p>>((float*)mc_va); cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - testing<<<1, 1>>>((float*)mc_va, 1, myrank, nranks); + int rept = 10; + int nblocks = 16; + int blocksize = 1024; + // warmup + for (int 
i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + } cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) printf("Time = %f, bw = %f\n", time, size / 1024. / 1024. / 1024. / time); + MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); -} +} //........ - - From 4ff8a8954cb36db089222717e327b273cc6c25a1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Tue, 2 Jan 2024 23:13:02 +0000 Subject: [PATCH 03/67] clean up --- nvls/test.cu | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 08a156dfc..effd7653b 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -158,22 +158,24 @@ int main() { cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); int rept = 10; - int nblocks = 16; - int blocksize = 1024; - // warmup - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - double st = MPI_Wtime(); - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + for (int input_size = 1024*1024*8; input_size <= size; input_size *= 2){ + int block_size = 1024; + int nblocks = 16; + // warmup + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) printf("input_size %d | Time = %f, alg_bw = %f\n", input_size, time, input_size / 1024. / 1024. / 1024. / time); } - cudaDeviceSynchronize(); - double en = MPI_Wtime(); - double time = (en - st) / rept; - if (!myrank) printf("Time = %f, bw = %f\n", time, size / 1024. / 1024. / 1024. / time); MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); } From dfab9fe9e7c42a6baf2450c82c894f37b357d5a1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 3 Jan 2024 03:53:32 +0000 Subject: [PATCH 04/67] nvls connection wip --- include/mscclpp/core.hpp | 6 ++++-- nvls/test.cu | 42 ++++++++++++++++++++++++++++++-------- src/connection.cc | 35 +++++++++++++++++++++++++++++++ src/include/connection.hpp | 15 ++++++++++++++ 4 files changed, 87 insertions(+), 11 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 60494a099..24b54fd33 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -125,6 +125,7 @@ class TcpBootstrap : public Bootstrap { enum class Transport { Unknown, // Unknown transport type. CudaIpc, // CUDA IPC transport type. + Nvls, // NVLS transport type. IB0, // InfiniBand device 0 transport type. IB1, // InfiniBand device 1 transport type. IB2, // InfiniBand device 2 transport type. @@ -136,10 +137,11 @@ enum class Transport { NumTransports // The number of transports. 
}; -const std::string TransportNames[] = {"UNK", "IPC", "IB0", "IB1", "IB2", "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; +const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 10; +const size_t TransportFlagsSize = 11; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. diff --git a/nvls/test.cu b/nvls/test.cu index effd7653b..7bf12a699 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -30,9 +30,22 @@ : "l"(ptr) \ : "memory"); -__global__ void testing2(float* uc_ptr) { uc_ptr[0] = 1.0; } +__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ + uc_ptr[idx] = myrank + idx; + } +} + +__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ + float expected = (float)((nranks * (nranks-1)) / 2 + nranks * idx); + if (abs(uc_ptr[idx] - expected) > 0.01 * expected){ + printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); + } + } +} + -#define UNROLL 8 __global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction // line is assumed to be 16B 4 ints of 8 halves @@ -72,7 +85,7 @@ int main() { CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - if (!myrank) printf("gran = %lu, minGrad = %lu\n", gran, minGran); + if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); size_t mcSize = ((size + gran - 1) / gran) * gran; mcProp.size = mcSize; @@ -154,13 +167,24 @@ int main() { CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); // set access on MC address CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); - testing2<<<1, 1>>>((float*)mc_va); + + int rept = 10; + int block_size = 1024; + int nblocks = 16; + cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - int rept = 10; - for (int input_size = 1024*1024*8; input_size <= size; input_size *= 2){ - int block_size = 1024; - int nblocks = 16; + init_kernel<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + check_correctness<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + + for (int input_size = 1024*3; input_size <= size; input_size *= 2){ // warmup for (int i = 0; i < rept; i++) { testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); @@ -174,7 +198,7 @@ int main() { cudaDeviceSynchronize(); double en = MPI_Wtime(); double time = (en - st) / rept; - if (!myrank) printf("input_size %d | Time = %f, alg_bw = %f\n", input_size, time, input_size / 1024. / 1024. / 1024. 
/ time); + if (!myrank) printf("input_size %d | Time = %f us, alg_bw = %f (GBps)\n", input_size, time*1e6, input_size / 1e9 / time); } MPI_Barrier(MPI_COMM_WORLD); MPI_Finalize(); diff --git a/src/connection.cc b/src/connection.cc index 834a1456c..f1331e679 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -92,6 +92,41 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { INFO(MSCCLPP_P2P, "CudaIpcConnection flushing connection"); } +// NVLS + +NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints) { + if (localEndpoint.transport() != Transport::Nvls) { + throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); + } + for (auto remoteEndpoint : remoteEndpoints) { + if (remoteEndpoint.transport() != Transport::Nvls) { + throw mscclpp::Error("NVLS connection can only be made to a NVLS endpoint", ErrorCode::InvalidUsage); + } + // sanity check: make sure the IPC connection is being made within a node + if (getImpl(remoteEndpoint)->hostHash_ != getImpl(localEndpoint)->hostHash_) { + std::stringstream ss; + ss << "NVLS connection can only be made within a node: " << std::hex << getImpl(remoteEndpoint)->hostHash_ + << " != " << std::hex << getImpl(localEndpoint)->hostHash_; + throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); + } + } + INFO(MSCCLPP_P2P, "NVLS connection created"); +} + +Transport NvlsConnection::transport() { return Transport::Nvls; } + +Transport NvlsConnection::remoteTransport() { return Transport::Nvls; } + +void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { + throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); +} + +void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64_t) { + throw Error("NVLS does not have a CPU updateAndSync API", ErrorCode::InvalidUsage); +} + +void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 47b154758..5f9108753 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -31,6 +31,21 @@ class CudaIpcConnection : public Connection { void flush(int64_t timeoutUsec) override; }; +class NvlsConnection : public Connection { + public: + NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints); + + Transport transport() override; + + Transport remoteTransport() override; + + void write(RegisteredMemory dst, uint64_t dstOffset, RegisteredMemory src, uint64_t srcOffset, + uint64_t size) override; + void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override; + + void flush(int64_t timeoutUsec) override; +}; + class IBConnection : public Connection { Transport transport_; Transport remoteTransport_; From 643f124e4b648f48b2cd9ff9e8a67cbcf1ca48ef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 5 Jan 2024 00:55:39 +0000 Subject: [PATCH 05/67] wip --- src/connection.cc | 47 +++++++++++++++++++++++++++++++++++++- src/include/connection.hpp | 10 +++++++- src/registered_memory.cc | 7 ++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index f1331e679..a2a2f12f9 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,7 +94,9 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS 
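// The rework of this constructor (below) follows the same CUDA multicast recipe
// exercised in nvls/test.cu: query the multicast granularity, have the root rank
// create the multicast object with cuMulticastCreate, back it with a pinned
// physical allocation from cuMemCreate, and map that allocation to a per-rank
// unicast VA via cuMemAddressReserve/cuMemMap/cuMemSetAccess. Exporting the
// multicast handle to the non-root ranks and binding each rank's allocation to
// it (e.g. with cuMulticastBindMem) are not wired up in this patch and are
// assumed to come in later changes.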
-NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints) { +NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, + bool isRoot) + : isRoot_(isRoot) { if (localEndpoint.transport() != Transport::Nvls) { throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); } @@ -110,6 +112,47 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector rem throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); } } + int nDevices = 1 + remoteEndpoints.size(); + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId_)); + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nDevices; + mcProp.size = bufferSize; + mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + size_t minGran = 0; + size_t gran = 0; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + // only root needs to create the multicast handle + if (isRoot_) { + size_t mcSize = ((bufferSize + gran - 1) / gran) * gran; + mcProp.size = mcSize; + + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp)); + } + + // Allocate physical memory + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = cudaDeviceId_; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + // allocate physical memory (data buffer) + MSCCLPP_CUTHROW(cuMemCreate(&memHandle_, bufferSize, &prop, 0 /*flags*/)); + + // usual VA business: map both MC and PA to two different VA addresses + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = cudaDeviceId_; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + // Map a VA to UC space + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, minGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); + // set access on UC address + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); + INFO(MSCCLPP_P2P, "NVLS connection created"); } @@ -127,6 +170,8 @@ void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64 void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } +void* NvlsConnection::getDevicePointer() { return deviceBuffer_; } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 5f9108753..f15283b28 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,8 +32,14 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { + int cudaDeviceId_; + bool isRoot_; + CUmemGenericAllocationHandle mcHandle_; + CUmemGenericAllocationHandle memHandle_; + void* deviceBuffer_; + public: - NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints); + NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); Transport transport() override; @@ -44,6 +50,8 @@ class NvlsConnection : public Connection { void updateAndSync(RegisteredMemory dst, uint64_t dstOffset, uint64_t* src, uint64_t newValue) override; void flush(int64_t timeoutUsec) override; + + 
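// Returns the per-rank unicast VA (deviceBuffer_) that the constructor mapped
// over the physical allocation; the multicast mapping itself is not exposed
// through this accessor in this patch.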
void* getDevicePointer(); }; class IBConnection : public Connection { diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 6d5fd79f5..043268821 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -20,6 +20,7 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, hostHash(getHostHash()), pidHash(getPidHash()), transports(transports) { + // CUDA IPC if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; @@ -34,6 +35,8 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr; this->transportInfos.push_back(transportInfo); } + + // IB if ((transports & AllIBTransports).any()) { auto addIb = [&](Transport ibTransport) { TransportInfo transportInfo; @@ -54,6 +57,10 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, if (transports.has(Transport::IB6)) addIb(Transport::IB6); if (transports.has(Transport::IB7)) addIb(Transport::IB7); } + + // NVLS + // if ((transports.has(Transport::NVLS))) { + // } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} From be04f9e72c665013f600f654d63a617fc767146d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 5 Jan 2024 02:27:45 +0000 Subject: [PATCH 06/67] wip --- src/include/registered_memory.hpp | 3 +++ src/registered_memory.cc | 30 ++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 11cd30231..62594a855 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -27,6 +27,9 @@ struct TransportInfo { const IbMr* ibMr; IbMrInfo ibMrInfo; }; + struct { + int fileDesciptor; + }; }; }; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 043268821..f41cfda0a 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -59,8 +59,19 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, } // NVLS - // if ((transports.has(Transport::NVLS))) { - // } + if ((transports.has(Transport::Nvls))) { + if (size != sizeof(CUmemGenericAllocationHandle)) { + throw mscclpp::Error("data must be an element of type CUmemGenericAllocationHandle", ErrorCode::InvalidUsage); + } + if ((transports & AllIBTransports).any() || (transports.has(Transport::CudaIpc))) { + throw mscclpp::Error("NVLS transport can only be used by itself", ErrorCode::InvalidUsage); + } + TransportInfo transportInfo; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesciptor, + *reinterpret_cast(data), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + this->transportInfos.push_back(transportInfo); + } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} @@ -95,6 +106,9 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); + } else if (entry.transport == Transport::Nvls) { + std::copy_n(reinterpret_cast(&entry.fileDesciptor), sizeof(entry.fileDesciptor), + std::back_inserter(result)); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -136,6 +150,9 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { 
std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; + } else if (transportInfo.transport == Transport::Nvls) { + std::copy_n(it, sizeof(transportInfo.fileDesciptor), reinterpret_cast(&transportInfo.fileDesciptor)); + it += sizeof(transportInfo.fileDesciptor); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -156,6 +173,12 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); this->data = static_cast(base) + entry.cudaIpcOffsetFromBase; INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data); + } else if (transports.has(Transport::Nvls) && getHostHash() == this->hostHash) { + auto entry = getTransportInfo(Transport::Nvls); + this->data = new CUmemGenericAllocationHandle; + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(reinterpret_cast(this->data), + reinterpret_cast(entry.fileDesciptor), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); } else { // No valid data pointer can be set this->data = nullptr; @@ -174,6 +197,9 @@ RegisteredMemory::Impl::~Impl() { } data = nullptr; } + if (data && transports.has(Transport::Nvls)) { + delete reinterpret_cast(this->data); + } } const TransportInfo& RegisteredMemory::Impl::getTransportInfo(Transport transport) const { From 2211a1456ffac05d3dd13176fec1a1d853a236bf Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 5 Jan 2024 11:44:13 -0800 Subject: [PATCH 07/67] restoring registered memory changes --- src/include/registered_memory.hpp | 3 --- src/registered_memory.cc | 33 ------------------------------- 2 files changed, 36 deletions(-) diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 62594a855..11cd30231 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -27,9 +27,6 @@ struct TransportInfo { const IbMr* ibMr; IbMrInfo ibMrInfo; }; - struct { - int fileDesciptor; - }; }; }; diff --git a/src/registered_memory.cc b/src/registered_memory.cc index f41cfda0a..6d5fd79f5 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -20,7 +20,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, hostHash(getHostHash()), pidHash(getPidHash()), transports(transports) { - // CUDA IPC if (transports.has(Transport::CudaIpc)) { TransportInfo transportInfo; transportInfo.transport = Transport::CudaIpc; @@ -35,8 +34,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, transportInfo.cudaIpcOffsetFromBase = (char*)data - (char*)baseDataPtr; this->transportInfos.push_back(transportInfo); } - - // IB if ((transports & AllIBTransports).any()) { auto addIb = [&](Transport ibTransport) { TransportInfo transportInfo; @@ -57,21 +54,6 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, if (transports.has(Transport::IB6)) addIb(Transport::IB6); if (transports.has(Transport::IB7)) addIb(Transport::IB7); } - - // NVLS - if ((transports.has(Transport::Nvls))) { - if (size != sizeof(CUmemGenericAllocationHandle)) { - throw mscclpp::Error("data must be an element of type CUmemGenericAllocationHandle", ErrorCode::InvalidUsage); - } - if ((transports & AllIBTransports).any() || (transports.has(Transport::CudaIpc))) { - throw mscclpp::Error("NVLS transport can only be used by itself", ErrorCode::InvalidUsage); - } - TransportInfo 
transportInfo; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesciptor, - *reinterpret_cast(data), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - this->transportInfos.push_back(transportInfo); - } } MSCCLPP_API_CPP RegisteredMemory::RegisteredMemory(std::shared_ptr pimpl) : pimpl_(pimpl) {} @@ -106,9 +88,6 @@ MSCCLPP_API_CPP std::vector RegisteredMemory::serialize() { std::back_inserter(result)); } else if (AllIBTransports.has(entry.transport)) { std::copy_n(reinterpret_cast(&entry.ibMrInfo), sizeof(entry.ibMrInfo), std::back_inserter(result)); - } else if (entry.transport == Transport::Nvls) { - std::copy_n(reinterpret_cast(&entry.fileDesciptor), sizeof(entry.fileDesciptor), - std::back_inserter(result)); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -150,9 +129,6 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(transportInfo.ibMrInfo), reinterpret_cast(&transportInfo.ibMrInfo)); it += sizeof(transportInfo.ibMrInfo); transportInfo.ibLocal = false; - } else if (transportInfo.transport == Transport::Nvls) { - std::copy_n(it, sizeof(transportInfo.fileDesciptor), reinterpret_cast(&transportInfo.fileDesciptor)); - it += sizeof(transportInfo.fileDesciptor); } else { throw mscclpp::Error("Unknown transport", ErrorCode::InternalError); } @@ -173,12 +149,6 @@ RegisteredMemory::Impl::Impl(const std::vector& serialization) { MSCCLPP_CUDATHROW(cudaIpcOpenMemHandle(&base, entry.cudaIpcBaseHandle, cudaIpcMemLazyEnablePeerAccess)); this->data = static_cast(base) + entry.cudaIpcOffsetFromBase; INFO(MSCCLPP_P2P, "Opened CUDA IPC handle at pointer %p", this->data); - } else if (transports.has(Transport::Nvls) && getHostHash() == this->hostHash) { - auto entry = getTransportInfo(Transport::Nvls); - this->data = new CUmemGenericAllocationHandle; - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(reinterpret_cast(this->data), - reinterpret_cast(entry.fileDesciptor), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); } else { // No valid data pointer can be set this->data = nullptr; @@ -197,9 +167,6 @@ RegisteredMemory::Impl::~Impl() { } data = nullptr; } - if (data && transports.has(Transport::Nvls)) { - delete reinterpret_cast(this->data); - } } const TransportInfo& RegisteredMemory::Impl::getTransportInfo(Transport transport) const { From f5acce87f81b32c09f8611ee9809dada3598f27b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 10 Jan 2024 20:15:03 -0800 Subject: [PATCH 08/67] wip --- include/mscclpp/core.hpp | 21 ++++++++++++++++++--- src/core.cc | 2 ++ src/endpoint.cc | 27 +++++++++++++++++++++++++++ src/include/endpoint.hpp | 7 +++++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 24b54fd33..1d12a4083 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -125,7 +125,8 @@ class TcpBootstrap : public Bootstrap { enum class Transport { Unknown, // Unknown transport type. CudaIpc, // CUDA IPC transport type. - Nvls, // NVLS transport type. + NvlsRoot, // NVLS for root transport type. + NvlsNonRoot, // NVLS for non-root transport type. IB0, // InfiniBand device 0 transport type. IB1, // InfiniBand device 1 transport type. IB2, // InfiniBand device 2 transport type. @@ -137,11 +138,11 @@ enum class Transport { NumTransports // The number of transports. 
}; -const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", +const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", "IB0", "IB1", "IB2", "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 11; +const size_t TransportFlagsSize = 13; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. @@ -460,6 +461,8 @@ struct EndpointConfig { int ibMaxSendWr = DefaultMaxSendWr; int ibMaxWrPerSend = DefaultMaxWrPerSend; + size_t nvlsBufferSize; + /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -467,6 +470,15 @@ struct EndpointConfig { /// /// @param transport The transport to use. EndpointConfig(Transport transport) : transport(transport) {} + + /// Constructor for NVLS explicitly + /// @param transport must be either NvlsRoot or NvlsNonRoot + /// @param nvlsBufferSize is the buffer to be alloced on each device + EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) { + if (!AllNvlsTransports.has(transport)) { + throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); + } + } }; /// Represents a context for communication. This provides a low-level interface for forming connections in use-cases @@ -688,6 +700,9 @@ extern const TransportFlags NoTransports; /// A constant TransportFlags object representing all InfiniBand transports. extern const TransportFlags AllIBTransports; +/// A constant TransportFlags object representing all NVLS transports. +extern const TransportFlags AllNvlsTransports; + /// A constant TransportFlags object representing all transports. 
extern const TransportFlags AllTransports; diff --git a/src/core.cc b/src/core.cc index 4d89250d0..84faf4783 100644 --- a/src/core.cc +++ b/src/core.cc @@ -87,6 +87,8 @@ const TransportFlags NoTransports = TransportFlags(); const TransportFlags AllIBTransports = Transport::IB0 | Transport::IB1 | Transport::IB2 | Transport::IB3 | Transport::IB4 | Transport::IB5 | Transport::IB6 | Transport::IB7; +const TransportFlags AllNvlsTransports = Transport::NvlsNonRoot | Transport::NvlsRoot; + const TransportFlags AllTransports = AllIBTransports | Transport::CudaIpc; void Setuppable::beginSetup(std::shared_ptr) {} diff --git a/src/endpoint.cc b/src/endpoint.cc index dbc773898..350cba07e 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -16,6 +16,23 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) ->createQp(config.ibMaxCqSize, config.ibMaxCqPollNum, config.ibMaxSendWr, 0, config.ibMaxWrPerSend); ibQpInfo_ = ibQp_->getInfo(); } + + if (AllNvlsTransports.has(transport_)) { + minMcGran_ = 0; + mcGran_ = 0; + mcProp_.size = config.nvlsBufferSize; + mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + // create the mc handle now only on the root + if (transport_ == Transport::NvlsRoot){ + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); + + fileDesc_ = 0; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&fileDesc_, handle, handleType, 0 /*flags*/)); + } + } } MSCCLPP_API_CPP Transport Endpoint::transport() { return pimpl_->transport_; } @@ -27,6 +44,10 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { if (AllIBTransports.has(pimpl_->transport_)) { std::copy_n(reinterpret_cast(&pimpl_->ibQpInfo_), sizeof(pimpl_->ibQpInfo_), std::back_inserter(data)); } + + if (transport_ == Transport::NvlsRoot) { + std::copy_n(reinterpret_cast(&pimpl_->fileDesc_), sizeof(pimpl_->fileDesc_), std::back_inserter(data)); + } return data; } @@ -45,6 +66,12 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(ibQpInfo_), reinterpret_cast(&ibQpInfo_)); it += sizeof(ibQpInfo_); } + if (transport_ == Transport::NvlsNonRoot) { + fileDesc_ = 0; + std::copy_n(it, sizeof(fileDesc_), reinterpret_cast(&fileDesc_)); + it += sizeof(fileDesc_); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)fileDesc_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + } } MSCCLPP_API_CPP Endpoint::Endpoint(std::shared_ptr pimpl) : pimpl_(pimpl) {} diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 311fa9982..00322674e 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -22,6 +22,13 @@ struct Endpoint::Impl { bool ibLocal_; IbQp* ibQp_; IbQpInfo ibQpInfo_; + + // These are only defined for multicast (NVLS) capability + CUmulticastObjectProp mcProp_; + CUmemGenericAllocationHandle mcHandle_; + size_t minMcGran_; + size_t mcGran_; + int fileDesc_; }; } // namespace mscclpp From 985d8c78aa024655f2cf47492a94956c08cd765b Mon Sep 17 00:00:00 2001 From: "Saeed Maleki (saemal)" Date: Sun, 14 Jan 2024 17:20:53 -0800 Subject: [PATCH 09/67] testing pid_getfd --- nvls/test.cu | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nvls/test.cu b/nvls/test.cu index 7bf12a699..48da87eef 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -120,9 +120,16 @@ 
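// The hunk below experiments with sharing the exported POSIX fd without a Unix
// domain socket: rank 0 broadcasts its pid and fd over MPI and the other ranks
// duplicate that fd into their own process before importing the multicast
// handle. Note that pidfd_getfd() expects a pidfd obtained from pidfd_open(),
// not a raw pid, so the intended sequence (assuming Linux >= 5.6 and ptrace
// permission over rank 0's process) is roughly:
//   int pidFd  = syscall(SYS_pidfd_open, currentPid, 0);
//   int peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0);
//   cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType);
// which is what the follow-up "removing ipc dep" patch switches to.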
int main() { } ncclIpcSocketClose(&ipcSock); + pid_t currentPid = getpid(); + MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); // everyone else would now have same multicast object if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerfd, handleType)); + int peerFd = 0; + if (myrank) peerFd = pidfd_getfd(currendPid, fd, 0); + printf("peerFd = %d\n", peerFd); // if(myrank) // close(peerfd); From 5cc805ba980acf0661cf397e1b20e93108c464e1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 15 Jan 2024 02:12:21 +0000 Subject: [PATCH 10/67] removing ipc dep --- nvls/test.cu | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/nvls/test.cu b/nvls/test.cu index 48da87eef..bbbc3e391 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -2,11 +2,10 @@ #include #include #include -// #include #include #include - -#include "ipcsocket.cc" +#include +#include #define CUCHECK(cmd) \ do { \ @@ -104,32 +103,17 @@ int main() { // exported handles // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = {0}; - uint64_t opId = 0xdeadcafebeef; - // ncclResult_t ret = ncclSuccess; - - ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag); - MPI_Barrier(MPI_COMM_WORLD); - if (!myrank) { - for (int p = 1; p < nranks; p++) { - ncclIpcSocketSendFd(&ipcSock, fd, p, (uint64_t)opId); - } - } else { - ncclIpcSocketRecvFd(&ipcSock, &peerfd); - } - ncclIpcSocketClose(&ipcSock); - pid_t currentPid = getpid(); MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + int pidFd = syscall(SYS_pidfd_open, currentPid, 0); // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); // everyone else would now have same multicast object - if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerfd, handleType)); int peerFd = 0; - if (myrank) peerFd = pidfd_getfd(currendPid, fd, 0); - printf("peerFd = %d\n", peerFd); + peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); + if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType)); + MPI_Barrier(MPI_COMM_WORLD); // if(myrank) // close(peerfd); From abfba07519b4df93414b154ac384c8ec1a28f8b0 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 15 Jan 2024 02:12:48 +0000 Subject: [PATCH 11/67] clean up --- nvls/align.h | 47 -- nvls/alloc.h | 270 ----------- nvls/argcheck.h | 16 - nvls/bootstrap.h | 32 -- nvls/channel.h | 48 -- nvls/checks.h | 160 ------- nvls/coll_net.h | 35 -- nvls/collectives.h | 48 -- nvls/comm.h | 473 ------------------- nvls/core.h | 41 -- nvls/cpuset.h | 61 --- nvls/cudawrap.h | 129 ------ nvls/debug.h | 48 -- nvls/device.h | 463 ------------------- nvls/enqueue.h | 26 -- nvls/gdrwrap.h | 252 ----------- nvls/graph.h | 116 ----- nvls/group.h | 137 ------ nvls/ibvcore.h | 1058 ------------------------------------------- nvls/ibvsymbols.h | 46 -- nvls/ibvwrap.h | 92 ---- nvls/info.h | 134 ------ nvls/ipcsocket.cc | 232 ---------- nvls/ipcsocket.h | 38 -- nvls/nccl_common.h | 33 -- nvls/nccl_net.h | 333 -------------- nvls/nccl_tuner.h | 55 --- nvls/net.h | 27 -- nvls/net_device.h | 29 -- nvls/nvmlwrap.h | 214 --------- nvls/nvtx.h | 85 ---- nvls/p2p.h | 29 -- nvls/param.h | 30 -- nvls/profiler.h | 37 -- nvls/proxy.h | 296 ------------ 
nvls/shm.h | 25 - nvls/socket.h | 97 ---- nvls/strongstream.h | 140 ------ nvls/timer.h | 60 --- nvls/transport.h | 128 ------ nvls/trees.h | 13 - nvls/tuner.h | 22 - nvls/utils.h | 524 --------------------- 43 files changed, 6179 deletions(-) delete mode 100644 nvls/align.h delete mode 100644 nvls/alloc.h delete mode 100644 nvls/argcheck.h delete mode 100644 nvls/bootstrap.h delete mode 100644 nvls/channel.h delete mode 100644 nvls/checks.h delete mode 100644 nvls/coll_net.h delete mode 100644 nvls/collectives.h delete mode 100644 nvls/comm.h delete mode 100644 nvls/core.h delete mode 100644 nvls/cpuset.h delete mode 100644 nvls/cudawrap.h delete mode 100644 nvls/debug.h delete mode 100644 nvls/device.h delete mode 100644 nvls/enqueue.h delete mode 100644 nvls/gdrwrap.h delete mode 100644 nvls/graph.h delete mode 100644 nvls/group.h delete mode 100644 nvls/ibvcore.h delete mode 100644 nvls/ibvsymbols.h delete mode 100644 nvls/ibvwrap.h delete mode 100644 nvls/info.h delete mode 100644 nvls/ipcsocket.cc delete mode 100644 nvls/ipcsocket.h delete mode 100644 nvls/nccl_common.h delete mode 100644 nvls/nccl_net.h delete mode 100644 nvls/nccl_tuner.h delete mode 100644 nvls/net.h delete mode 100644 nvls/net_device.h delete mode 100644 nvls/nvmlwrap.h delete mode 100644 nvls/nvtx.h delete mode 100644 nvls/p2p.h delete mode 100644 nvls/param.h delete mode 100644 nvls/profiler.h delete mode 100644 nvls/proxy.h delete mode 100644 nvls/shm.h delete mode 100644 nvls/socket.h delete mode 100644 nvls/strongstream.h delete mode 100644 nvls/timer.h delete mode 100644 nvls/transport.h delete mode 100644 nvls/trees.h delete mode 100644 nvls/tuner.h delete mode 100644 nvls/utils.h diff --git a/nvls/align.h b/nvls/align.h deleted file mode 100644 index 2a71dd1bc..000000000 --- a/nvls/align.h +++ /dev/null @@ -1,47 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALIGN_H_ -#define NCCL_ALIGN_H_ - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) - -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_POWER(x, y) \ - ((x) > (y) ? ROUNDUP(x, y) : ((y)/((y)/(x)))) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -#if !__CUDA_ARCH__ - #ifndef __host__ - #define __host__ - #endif - #ifndef __device__ - #define __device__ - #endif -#endif - -template -__host__ __device__ constexpr Z divUp(X x, Y y) { - return (x+y-1)/y; -} - -template -__host__ __device__ constexpr Z roundUp(X x, Y y) { - return (x+y-1) - (x+y-1)%y; -} - -// assumes second argument is a power of 2 -template -__host__ __device__ constexpr Z alignUp(X x, int a) { - return (x+a-1) & Z(-a); -} - -#endif diff --git a/nvls/alloc.h b/nvls/alloc.h deleted file mode 100644 index f8d954469..000000000 --- a/nvls/alloc.h +++ /dev/null @@ -1,270 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ALLOC_H_ -#define NCCL_ALLOC_H_ - -#include "nccl.h" -#include "checks.h" -#include "align.h" -#include "utils.h" -#include "p2p.h" -#include -#include -#include -#include - -uint64_t clockNano(); // from utils.h with which we have a circular dependency - -template -ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaHostAlloc(ptr, nelem*sizeof(T), cudaHostAllocMapped), result, finish); - memset(*ptr, 0, nelem*sizeof(T)); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA host alloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} - -template -ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - //INFO(NCCL_ALLOC, "%s:%d malloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), p); - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} -#define ncclCalloc(...) ncclCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclRealloc(T** ptr, size_t oldNelem, size_t nelem) { - if (nelem < oldNelem) return ncclInternalError; - if (nelem == oldNelem) return ncclSuccess; - - T* oldp = *ptr; - T* p = (T*)malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memcpy(p, oldp, oldNelem*sizeof(T)); - free(oldp); - memset(p+oldNelem, 0, (nelem-oldNelem)*sizeof(T)); - *ptr = (T*)p; - INFO(NCCL_ALLOC, "Mem Realloc old size %ld, new size %ld pointer %p", oldNelem*sizeof(T), nelem*sizeof(T), *ptr); - return ncclSuccess; -} - -#if CUDART_VERSION >= 11030 - -#include -#include "cudawrap.h" - -static inline ncclResult_t ncclCuMemAlloc(void **ptr, CUmemGenericAllocationHandle *handlep, size_t size) { - ncclResult_t result = ncclSuccess; - size_t granularity = 0; - CUdevice currentDev; - CUmemAllocationProp prop = {}; - CUmemAccessDesc accessDesc = {}; - CUmemGenericAllocationHandle handle; - int cudaDev; - int flag = 0; - CUDACHECK(cudaGetDevice(&cudaDev)); - CUCHECK(cuDeviceGet(¤tDev, cudaDev)); - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = NCCL_P2P_HANDLE_TYPE; // So it can be exported - prop.location.id = currentDev; - // Query device to see if RDMA support is available - CUCHECK(cuDeviceGetAttribute(&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED, currentDev)); - if (flag) prop.allocFlags.gpuDirectRDMACapable = 1; - CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); - ALIGN_SIZE(size, granularity); - /* Allocate the physical memory on the device */ - CUCHECK(cuMemCreate(&handle, size, &prop, 0)); - /* Reserve a virtual address range */ - 
CUCHECK(cuMemAddressReserve((CUdeviceptr *)ptr, size, granularity, 0, 0)); - /* Map the virtual address range to the physical allocation */ - CUCHECK(cuMemMap((CUdeviceptr)*ptr, size, 0, handle, 0)); - /* Now allow RW access to the newly mapped memory */ - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = currentDev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CUCHECK(cuMemSetAccess((CUdeviceptr)*ptr, size, &accessDesc, 1)); - if (handlep) *handlep = handle; - TRACE(NCCL_ALLOC, "CuMem Alloc Size %zi pointer %p handle %llx", size, *ptr, handle); - return result; -} - -static inline ncclResult_t ncclCuMemFree(void *ptr) { - if (ptr == NULL) return ncclSuccess; - ncclResult_t result = ncclSuccess; - CUmemGenericAllocationHandle handle; - size_t size = 0; - CUCHECK(cuMemRetainAllocationHandle(&handle, ptr)); - CUCHECK(cuMemRelease(handle)); - CUCHECK(cuMemGetAddressRange(NULL, &size, (CUdeviceptr)ptr)); - TRACE(NCCL_ALLOC, "CuMem Free Size %zi pointer %p handle 0x%llx", size, ptr, handle); - CUCHECK(cuMemUnmap((CUdeviceptr)ptr, size)); - CUCHECK(cuMemRelease(handle)); - CUCHECK(cuMemAddressFree((CUdeviceptr)ptr, size)); - return result; -} - -#else - -extern int ncclCuMemEnable(); - -static inline ncclResult_t ncclCuMemAlloc(void **ptr, void *handlep, size_t size) { - WARN("CUMEM not supported prior to CUDA 11.3"); - return ncclInternalError; -} -static inline ncclResult_t ncclCuMemFree(void *ptr) { - WARN("CUMEM not supported prior to CUDA 11.3"); - return ncclInternalError; -} - -#endif - -template -ncclResult_t ncclCudaMallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaMalloc(...) ncclCudaMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaCalloc(...) 
ncclCudaCallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaCallocAsyncDebug(T** ptr, size_t nelem, cudaStream_t stream, const char *filefunc, int line) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - *ptr = nullptr; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemAlloc((void **)ptr, NULL, nelem*sizeof(T)), result, finish); - } else { - CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish); - } - CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (*ptr == nullptr) WARN("Failed to CUDA calloc async %ld bytes", nelem*sizeof(T)); - INFO(NCCL_ALLOC, "%s:%d Cuda Alloc Size %ld pointer %p", filefunc, line, nelem*sizeof(T), *ptr); - return result; -} -#define ncclCudaCallocAsync(...) ncclCudaCallocAsyncDebug(__VA_ARGS__, __FILE__, __LINE__) - -template -ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - // Need a side stream so as not to interfere with graph capture. - cudaStream_t stream; - CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), result, finish); - NCCLCHECKGOTO(ncclCudaMemcpyAsync(dst, src, nelem, stream), result, finish); - CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish); - CUDACHECKGOTO(cudaStreamDestroy(stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -ncclResult_t ncclCudaMemcpyAsync(T* dst, T* src, size_t nelem, cudaStream_t stream) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - CUDACHECKGOTO(cudaMemcpyAsync(dst, src, nelem*sizeof(T), cudaMemcpyDefault, stream), result, finish); -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -template -ncclResult_t ncclCudaFree(T* ptr) { - ncclResult_t result = ncclSuccess; - cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed; - TRACE(NCCL_ALLOC, "Cuda Free pointer %p", ptr); - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - if (ncclCuMemEnable()) { - NCCLCHECKGOTO(ncclCuMemFree((void *)ptr), result, finish); - } else { - CUDACHECKGOTO(cudaFree(ptr), result, finish); - } -finish: - CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode)); - return result; -} - -// Allocate memory to be potentially ibv_reg_mr'd. This needs to be -// allocated on separate pages as those pages will be marked DONTFORK -// and if they are shared, that could cause a crash in a child process -inline ncclResult_t ncclIbMallocDebug(void** ptr, size_t size, const char *filefunc, int line) { - size_t page_size = sysconf(_SC_PAGESIZE); - void* p; - int size_aligned = ROUNDUP(size, page_size); - int ret = posix_memalign(&p, page_size, size_aligned); - if (ret != 0) return ncclSystemError; - memset(p, 0, size); - *ptr = p; - INFO(NCCL_ALLOC, "%s:%d Ib Alloc Size %ld pointer %p", filefunc, line, size, *ptr); - return ncclSuccess; -} -#define ncclIbMalloc(...) 
ncclIbMallocDebug(__VA_ARGS__, __FILE__, __LINE__) - -#endif diff --git a/nvls/argcheck.h b/nvls/argcheck.h deleted file mode 100644 index 8d8b74e8e..000000000 --- a/nvls/argcheck.h +++ /dev/null @@ -1,16 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ARGCHECK_H_ -#define NCCL_ARGCHECK_H_ - -#include "core.h" -#include "info.h" - -ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); -ncclResult_t ArgsCheck(struct ncclInfo* info); - -#endif diff --git a/nvls/bootstrap.h b/nvls/bootstrap.h deleted file mode 100644 index 400a479fb..000000000 --- a/nvls/bootstrap.h +++ /dev/null @@ -1,32 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_BOOTSTRAP_H_ -#define NCCL_BOOTSTRAP_H_ - -#include "nccl.h" -#include "comm.h" - -struct ncclBootstrapHandle { - uint64_t magic; - union ncclSocketAddress addr; -}; -static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID"); - -ncclResult_t bootstrapNetInit(); -ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv); -ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle); -ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm); -ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks); -ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); -ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); -ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); -ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag); -ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size); -ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size); -ncclResult_t bootstrapClose(void* commState); -ncclResult_t bootstrapAbort(void* commState); -#endif diff --git a/nvls/channel.h b/nvls/channel.h deleted file mode 100644 index adc38749a..000000000 --- a/nvls/channel.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CHANNEL_H_ -#define NCCL_CHANNEL_H_ -#include "comm.h" - -ncclResult_t initChannel(struct ncclComm* comm, int channelid); -ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); -ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share); -ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks); -static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) { - int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2; - int peerNode = comm->rankToNode[peer]; - int peerIndex = comm->rankToLocalRank[peer]; - int nsteps = comm->maxLocalRanks; - int rankIndex = comm->rankToLocalRank[comm->rank]; - int step, delta; - if (coll == ncclFuncSend) { - step = (nsteps + peerIndex - rankIndex)%nsteps; - delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes; - } else if (coll == ncclFuncRecv) { - step = (nsteps + rankIndex - peerIndex)%nsteps; - delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes; - } else { - return ncclInternalError; - } - *channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step; - return ncclSuccess; -} - -static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) { - //*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels; - *channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels; - return ncclSuccess; -} - -static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) { - int base; - NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base)); - NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId)); - return ncclSuccess; -} - -#endif diff --git a/nvls/checks.h b/nvls/checks.h deleted file mode 100644 index c9fd16176..000000000 --- a/nvls/checks.h +++ /dev/null @@ -1,160 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CHECKS_H_ -#define NCCL_CHECKS_H_ - -#include "debug.h" - -// Check CUDA RT calls -#define CUDACHECK(cmd) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, RES, label) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(err)); \ - RES = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -// Report failure but clear error and continue -#define CUDACHECKIGNORE(cmd) do { \ - cudaError_t err = cmd; \ - if( err != cudaSuccess ) { \ - INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \ - (void) cudaGetLastError(); \ - } \ -} while(false) - -#include -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -#define SYSCHECKGOTO(statement, RES, label) do { \ - if ((statement) == -1) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -#define NEQCHECK(statement, value) do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (0); - -#define NEQCHECKGOTO(statement, value, RES, label) do { \ - if ((statement) != value) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -#define EQCHECK(statement, value) do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (0); - -#define EQCHECKGOTO(statement, value, RES, label) do { \ - if ((statement) == value) { \ - /* Print the back trace*/ \ - RES = ncclSystemError; \ - INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \ - goto label; \ - } \ -} while (0); - -// Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - return RES; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, RES, label) do { \ - RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - /* Print the back trace*/ \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - goto label; \ - } \ -} while (0); - -#define NCCLWAIT(call, cond, abortFlagPtr) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - 
ncclResult_t RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - return ncclInternalError; \ - } \ - if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \ -} while (!(cond)); - -#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \ - volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \ - RES = call; \ - if (RES != ncclSuccess && RES != ncclInProgress) { \ - if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \ - goto label; \ - } \ - if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \ -} while (!(cond)); - -#define NCCLCHECKTHREAD(a, args) do { \ - if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \ - return args; \ - } \ -} while(0) - -#define CUDACHECKTHREAD(a) do { \ - if ((a) != cudaSuccess) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - -#endif diff --git a/nvls/coll_net.h b/nvls/coll_net.h deleted file mode 100644 index f4b540866..000000000 --- a/nvls/coll_net.h +++ /dev/null @@ -1,35 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COLL_NET_H_ -#define COLL_NET_H_ - -#include "nccl.h" -#include "nccl_net.h" - -typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; - -// Translation to external API -static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; } -static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; } -static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; } -static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; } -static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; } -static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; } -/* DMA-BUF support */ -static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; } -static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; } -static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* 
collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) { - NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; } -static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; } -static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; } -static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; } - -static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; } - -#endif diff --git a/nvls/collectives.h b/nvls/collectives.h deleted file mode 100644 index 0f965276a..000000000 --- a/nvls/collectives.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_COLLECTIVES_H_ -#define NCCL_COLLECTIVES_H_ - -#include "nccl.h" - -// CHUNKSIZE must be a multiple of SLICESIZE -#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) -#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) -#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) -#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) -#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) -#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) -#define BROADCAST_SLICESTEPS 1 -#define BROADCAST_CHUNKSTEPS 1 -#define REDUCE_SLICESTEPS 1 -#define REDUCE_CHUNKSTEPS 1 -#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value for CHUNKSTEPS/SLICESTEPS, must accord with above - -inline int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - #endif - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -#endif diff --git a/nvls/comm.h b/nvls/comm.h deleted file mode 100644 index 328ffef3b..000000000 --- a/nvls/comm.h +++ /dev/null @@ -1,473 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_COMM_H_ -#define NCCL_COMM_H_ - -#include "transport.h" -#include "p2p.h" -#include "collectives.h" -#include "nccl_tuner.h" -#include "proxy.h" -#include "strongstream.h" -#include "nccl_net.h" - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL - -// Channels / LL tuning -#define NCCL_LL_THREAD_THRESHOLD 8 -#define NCCL_LL128_THREAD_THRESHOLD 8 -#define NCCL_SIMPLE_THREAD_THRESHOLD 64 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - uint64_t redOpArgExchange[2]; - char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)]; - int offsFifo[NCCL_STEPS]; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - int offsFifo[NCCL_STEPS]; - int flush; // For GDRCopy-based flush - }; - char pad4[MEM_ALIGN]; - }; -}; - -enum helperThreadState {ThreadStart, ThreadStop}; - -#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS) - -struct ncclGraphHelperResources { - ncclComm* comm; - pthread_mutex_t threadLock; - pthread_cond_t threadCond; - enum helperThreadState threadState; - void* ipcBases[NCCL_IPC_POOL_SIZE]; - int ipcTail; - int ipcHead; -}; - -struct ncclUserRedOp { - int freeNext; // -1=allocated, otherwise index of next free entry in array - ncclDataType_t datatype; - ncclDevRedOpFull opFull; -}; - -struct ncclNodeRanks { - int localRanks; - int* localRankToRank; -}; - -struct ncclDestructor { - struct ncclDestructor* next; - void* obj; - ncclResult_t(*fn)(struct ncclDestructor* me); -}; - -struct ncclCommCallback { - struct ncclCommCallback* next; - ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb); -}; - -struct ncclSharedResources { - int refCount; - struct ncclComm* owner; /* comm which creates this shared res. 
*/ - struct ncclChannelPeer* peers[MAXCHANNELS]; - struct ncclDevChannelPeer* devPeers[MAXCHANNELS]; - /* P2P operation counter, one per channel */ - uint64_t p2pOpCount[MAXCHANNELS]; - /* Collective operation counter */ - uint64_t collOpCount; - int tpNRanks; - int tpNLocalRanks; - int tpNChannels; - int tpP2pNChannels; - int tpP2pChunkSize; - uint64_t magic; - - // top parent rank to localRank translation table - int* tpRankToLocalRank; - // Internal streams - struct ncclStrongStream deviceStream, hostStream; - - /* proxy related shared res */ - struct ncclProxyState* proxyState; -}; - -struct ncclChannel { - struct ncclChannelPeer** peers; - struct ncclDevChannelPeer** devPeers; - /* devPeer pointer array used for host side access */ - struct ncclDevChannelPeer** devPeersHostPtr; - struct ncclRing ring; - int* devRingUserRanks; - struct ncclTree tree; - - struct ncclTree collnetChain; - struct ncclDirect collnetDirect; - - struct ncclNvls nvls; - - int id; // index of this channel - uint32_t workFifoSent; // last used work index+1 - - /* comm split sharable resources */ - struct ncclChannelPeer* collnetPeers; - struct ncclDevChannelPeer* collnetDevPeers; - struct ncclChannelPeer* nvlsPeers; - struct ncclDevChannelPeer* nvlsDevPeers; -}; - -struct ncclWorkList { - struct ncclWorkList* next; - struct ncclWork work; -}; - -struct ncclPointerList { - struct ncclPointerList* next; - void *ptr; -}; - -struct ncclNvlsMcHandleList { - struct ncclNvlsMcHandleList *next; - CUmemGenericAllocationHandle mcHandle; - CUdeviceptr ptr; - int dev; - size_t size; -}; - -struct ncclKernelPlan { - // A kernel plan is also a callback that reclaims itself. Hence this must - // be the first member. - struct ncclCommCallback reclaimer; - struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup - - struct ncclComm* comm; - struct ncclKernelPlan* next; - - bool persistent; // aka captured in a graph - bool kernelSpecialized; - void *kernelFn; - int channelUbound; // only channels c < channelUbound are present - int channelCount; // number of channels present - uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask) - bool hasProxyOps; // does any channel have a non-empty proxyOpQueue - int threadPerBlock; - // workHeap fields are null until uploadWorkFifo() or preparePersistentKernel() - struct ncclWork* workHead; - - int collOpCount; // zero based for this plan - - struct ncclIntruQueue ipcMemQueue; - struct ncclIntruQueue nvlsMcHandleQueue; - - struct Channel { - int nWork; - union { - int nWorkElem; // used for coll and reg coll - int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1 - }; - size_t collBytes; - struct ncclIntruQueue workQueue; - struct ncclIntruQueue proxyOpQueue; - } channels[MAXCHANNELS]; -}; - -struct ncclRegRequest { - uintptr_t buff; - size_t size; - struct ncclRegRequest *next; -}; - -struct ncclRegRecord { - uintptr_t buff; - size_t size; - CUdeviceptr regAddr; - size_t regSize; - int dev; - CUmemGenericAllocationHandle mcHandle; - uintptr_t *addrs; /* use to check if NVLS buffers match among intra-node ranks */ - struct ncclRegRecord *next; -}; - -struct ncclComm { - struct ncclMemoryStack memPermanent, memScoped; - // List of destructors to run when comm is destructed - struct ncclDestructor* destructorHead; - - struct ncclSharedResources* sharedRes; - /* map to top parent ranks. 
*/ - int* topParentRanks; - int* topParentLocalRanks; - struct ncclChannel channels[MAXCHANNELS]; - struct ncclPeerInfo* peerInfo; - struct ncclTopoSystem* topo; - - ncclNet_t* ncclNet; - ncclCollNet_t* ncclCollNet; - void* bootstrap; - // Bitmasks for ncclTransportP2pSetup - uint64_t* connectSend; - uint64_t* connectRecv; - - uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. - - uint64_t commHash; - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my nvml device index - int compCap; // compute capability of the GPU - int minCompCap, maxCompCap; // min/max compute capability in the communicator - int64_t busId; // my PCI bus ID in int format - cpu_set_t cpuAffinity; // CPU affinity of the GPU - int cudaArch; // matches __CUDA_ARCH__ of device - - int node; - int nNodes; - int localRank; - int localRanks; - int maxLocalRanks; - int* rankToNode; - int* rankToLocalRank; - int* localRankToRank; - // localRanks and localRanktoRank for all nodes - struct ncclNodeRanks* nodeRanks; - - bool checkPointers; - bool dmaBufSupport; - - // Counter for tracking CUDA launches (P2P and collectives included) - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nvlsChannels; - int collNetChannels; - // Channels (per peer) for p2p - int p2pnChannels; - int p2pnChannelsPerPeer; - int p2pChannels[MAXCHANNELS]; - - // Should this comm allocate LL buffers for network P2P connections? - bool allocP2pNetLLBuffers; - - // Buffer sizes - int buffSizes[NCCL_NUM_PROTOCOLS]; - int p2pChunkSize; - - // Algorithm/Protocols thresholds - ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - float ringbdw[NCCL_NUM_FUNCTIONS][NCCL_NUM_PROTOCOLS]; - int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; - - /* This attribute can indicate the states of communicators and return code of - * asynchronous NCCL operations. */ - ncclResult_t asyncResult; - - // Flag to ask NCCL kernels to abort - volatile uint32_t *abortFlag; - volatile uint32_t *childAbortFlag; - uint32_t *abortFlagRefCount; - - // Device side of the communicator (for cudaFree's) - struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm - - // Operation pool. - int workFifoDepth; // size of workFifoHeap[], power of 2 - struct ncclWork* workFifoHeap; - struct ncclWork* devWorkFifoHeap; - void* workFifoHeapGdrHandle; - - // Work completion notificaion - uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory - uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot. - uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels. 
- - // Intra-process sync - struct ncclComm* intraComm0; // leader of intra-process comms (self possible) - struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head - int intraRank; - int intraRanks; - uint32_t intraBarrierPhase; - char intraPad1[64 - sizeof(uint64_t)]; - uint64_t intraBarrierCounter; // only used if this is intraComm0 - char intraPad2[64 - sizeof(uint64_t)]; - uint64_t intraBarrierGate; // only used if this is intraComm0 - - struct ncclProxyState* proxyState; - int proxyRefCountOld; /* store proxy post-atomic-sub refcount */ - // Whether this communicator uses collNet - int collNetSupport; - uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes]; - int intraHighestTransportType; - int* collNetHeads; - int collNetHeadsNum; - /* sharable collNet proxy progress resource. */ - struct ncclCollNetSharedRes* collNetSharedRes; - - // NVLink SHARP (NVLS) support - int nvlsSupport; - int nvlsRegSupport; - /* sharable NVLS resource. */ - struct ncclNvlsSharedRes* nvlsResources; - - ssize_t channelSize; // User requested work size (bytes) for channel partitions - - // pools backed by comm->memPermanent - struct ncclMemoryPool memPool_ncclProxyOp; - struct ncclMemoryPool memPool_ncclKernelPlan; - struct ncclMemoryPool memPool_ncclPointerList; - struct ncclMemoryPool memPool_ncclNvlsHandleList; - // Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when - // this comm is not yet in a group. - struct ncclComm* groupNext; - // Subset of those in groupNext list. Holds 0x1 if not needing preconnect. - struct ncclComm* preconnectNext; - int persistentRefs; // number of persistent plan-lists capturing this comm - struct ncclTasks tasks; - - // user-created reduction ops - int userRedOpCapacity, userRedOpFreeHead; - ncclUserRedOp *userRedOps; - - // Queue of things for the main thread to do - struct ncclIntruQueueMpsc callbackQueue; - - // List of kernel plans built form tasks. - struct ncclIntruQueue planQueue; - // First of the unlaunched kernels in `planQueue` - struct ncclKernelPlan* unlaunchedPlansHead; - - ncclConfig_t config; - // initState is to more conveniently reclaim resources when errors happen. 
- ncclResult_t initState; - // flag to indicate if ncclCommFinalize() is called - bool finalizeCalled; - // shared structures for finalization - int finalizeRankCnt; - // group job to support multi-thread FT - struct ncclGroupJob *groupJob; - - /* store to buffer register request */ - struct ncclIntruQueue regRequestQueue; - /* store registered buffer */ - struct ncclIntruQueue regRecordQueue; - - // Tuning plugin - ncclTuner_t* tuner; -}; - -enum ncclLaunchMode { - ncclLaunchModeInvalid=0, - ncclLaunchModeParallel, - ncclLaunchModeGroup -}; -extern enum ncclLaunchMode ncclParamLaunchMode; - -void ncclCommPushFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf); -void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle); - -inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) { - ncclResult_t result = ncclSuccess; - struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome); - while (cb != nullptr) { - struct ncclCommCallback* next = cb->next; - ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb - if (res1 != ncclSuccess) result = res1; - cb = next; - } - NCCLCHECK(result); - return ncclSuccess; -} - -inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) { - int phase = comm->intraBarrierPhase; - if (comm->intraRanks == 1) { - // Release everyone (just me). - comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1); - } else { - struct ncclComm* comm0 = comm->intraComm0; - uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE); - if (uint32_t(count) == uint32_t(comm->intraRanks)) { - // Reset. - __atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED); - // Release everyone. - __atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE); - } - } -} - -// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x) -inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) { - struct ncclComm* comm0 = comm->intraComm0; - comm->intraBarrierPhase ^= 1; - uint32_t phase = comm->intraBarrierPhase; - uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); - if ((gate & 1) != phase) { - uint64_t t0 = clockNano(); - do { - // Spin vigorously for first 5us. - if (clockNano()-t0 >= 5*1000) sched_yield(); - gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED); - } while ((gate & 1) != phase); - } - if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE); - return gate>>32; -} - -// Scrambles the bits of non-builtin values of ncclRedOp_t according to the -// communicator memory address. Used to catch bugs so that integer handles -// associated with this communicator won't collide with handles of other -// communicatrs. This function is its own inverse. -static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) { - // Preserve the built-in values. - if(int(op) < int(ncclNumOps)) - return op; - uint64_t h = reinterpret_cast(comm); - h ^= h >> 32; - h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant - h >>= 32; // h is now an excellent 32-bit hash of the comm pointer - h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1 - int op1 = int(h) ^ int(op); - // Since builtin values are preserved, we also have to preserve their preimage. - return op1 < int(ncclNumOps) ? 
op : ncclRedOp_t(op1); -} - -ncclResult_t ncclCommEnsureReady(ncclComm_t comm); -ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState); - -#endif diff --git a/nvls/core.h b/nvls/core.h deleted file mode 100644 index a1754beeb..000000000 --- a/nvls/core.h +++ /dev/null @@ -1,41 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CORE_H_ -#define NCCL_CORE_H_ - -#include -#include -#include -#include -#include // For std::min/std::max -#include "nccl.h" - -#ifdef PROFAPI -#define NCCL_API(ret, func, args...) \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((alias(#func))) \ - ret p##func (args); \ - extern "C" \ - __attribute__ ((visibility("default"))) \ - __attribute__ ((weak)) \ - ret func(args) -#else -#define NCCL_API(ret, func, args...) \ - extern "C" \ - __attribute__ ((visibility("default"))) \ - ret func(args) -#endif // end PROFAPI - -#include "debug.h" -#include "checks.h" -#include "cudawrap.h" -#include "alloc.h" -#include "utils.h" -#include "param.h" -#include "nvtx.h" - -#endif // end include guard diff --git a/nvls/cpuset.h b/nvls/cpuset.h deleted file mode 100644 index ec55cbc54..000000000 --- a/nvls/cpuset.h +++ /dev/null @@ -1,61 +0,0 @@ -/************************************************************************* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CPUSET_H_ -#define NCCL_CPUSET_H_ - -// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t - -static int hexToInt(char c) { - int v = c - '0'; - if (v < 0) return -1; - if (v > 9) v = 10 + c - 'a'; - if ((v < 0) || (v > 15)) return -1; - return v; -} - -#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) - -static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) { - uint32_t cpumasks[CPU_SET_N_U32]; - int m = CPU_SET_N_U32-1; - cpumasks[m] = 0; - for (int o=0; o<strlen(str); o++) { - char c = str[o]; - if (c == ',') { - m--; - cpumasks[m] = 0; - } else { - int v = hexToInt(c); - if (v == -1) break; - cpumasks[m] <<= 4; - cpumasks[m] += v; - } - } - // Copy cpumasks to mask - for (int a=0; m<CPU_SET_N_U32; a++,m++) { - memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t)); - } - return ncclSuccess; -} - -static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) { - int c = 0; - uint8_t* m8 = (uint8_t*)mask; - for (int o=sizeof(cpu_set_t)-1; o>=0; o--) { - if (c == 0 && m8[o] == 0) continue; - sprintf(str+c, "%02x", m8[o]); - c+=2; - if (o && o%4 == 0) { - sprintf(str+c, ","); - c++; - } - } - str[c] = '\0'; - return ncclSuccess; -} - -#endif diff --git a/nvls/cudawrap.h b/nvls/cudawrap.h deleted file mode 100644 index cc363c1ac..000000000 --- a/nvls/cudawrap.h +++ /dev/null @@ -1,129 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
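For reference, a minimal standalone sketch of the hex-mask parsing idea used by ncclStrToCpuset above, assuming a single comma-separated group and a plain uint32_t rather than a full cpu_set_t (hexDigit and the sample string are illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

// Same digit conversion as hexToInt() above (lowercase hex only).
static int hexDigit(char c) {
  int v = c - '0';
  if (v < 0) return -1;
  if (v > 9) v = 10 + c - 'a';
  return (v < 0 || v > 15) ? -1 : v;
}

int main(void) {
  const char* str = "0003ff";   // one comma-separated group of local_cpus
  uint32_t mask = 0;
  for (int o = 0; str[o] != '\0'; o++) {
    int v = hexDigit(str[o]);
    if (v == -1) break;
    mask = (mask << 4) + v;     // shift in one nibble per hex character
  }
  printf("0x%x\n", mask);       // prints 0x3ff: CPUs 0..9 allowed
  return 0;
}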
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_CUDAWRAP_H_ -#define NCCL_CUDAWRAP_H_ - -#include -#include -#include "checks.h" - -// Is cuMem API usage enabled -extern int ncclCuMemEnable(); - -#if CUDART_VERSION >= 11030 -#include -#else -typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); -typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); -typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); -#endif - -#define CUPFN(symbol) pfn_##symbol - -// Check CUDA PFN driver calls -#define CUCHECK(cmd) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure %d '%s'", err, errStr); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUCHECKGOTO(cmd, res, label) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - WARN("Cuda failure %d '%s'", err, errStr); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -// Report failure but clear error and continue -#define CUCHECKIGNORE(cmd) do { \ - CUresult err = pfn_##cmd; \ - if( err != CUDA_SUCCESS ) { \ - const char *errStr; \ - (void) pfn_cuGetErrorString(err, &errStr); \ - INFO(NCCL_ALL,"%s:%d Cuda failure %d '%s'", __FILE__, __LINE__, err, errStr); \ - } \ -} while(false) - -#define CUCHECKTHREAD(cmd, args) do { \ - CUresult err = pfn_##cmd; \ - if (err != CUDA_SUCCESS) { \ - INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \ - args->ret = ncclUnhandledCudaError; \ - return args; \ - } \ -} while(0) - -#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol - -#if CUDART_VERSION >= 11030 -/* CUDA Driver functions loaded with cuGetProcAddress for versioning */ -DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000); -DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000); -DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020); -DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000); -DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000); -DECLARE_CUDA_PFN_EXTERN(cuPointerGetAttribute, 4000); -// cuMem API support -DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000); -DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020); -DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020); -#if CUDA_VERSION >= 11070 -DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support -#endif -#if CUDA_VERSION >= 12010 -/* NVSwitch Multicast support */ -DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010); 
-DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010); -DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010); -#endif -#endif - -/* CUDA Driver functions loaded with dlsym() */ -DECLARE_CUDA_PFN_EXTERN(cuInit, 2000); -DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020); -DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030); - - -ncclResult_t ncclCudaLibraryInit(void); - -extern int ncclCudaDriverVersionCache; -extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit() - -inline ncclResult_t ncclCudaDriverVersion(int* driver) { - int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED); - if (version == -1) { - CUDACHECK(cudaDriverGetVersion(&version)); - __atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED); - } - *driver = version; - return ncclSuccess; -} -#endif diff --git a/nvls/debug.h b/nvls/debug.h deleted file mode 100644 index d10217856..000000000 --- a/nvls/debug.h +++ /dev/null @@ -1,48 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_DEBUG_H_ -#define NCCL_INT_DEBUG_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include -#include -#include - -#include -#include -#include - -// Conform to pthread and NVTX standard -#define NCCL_THREAD_NAMELEN 16 - -extern int ncclDebugLevel; -extern uint64_t ncclDebugMask; -extern pthread_mutex_t ncclDebugLock; -extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); - -void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); - -// Let code temporarily downgrade WARN into INFO -extern thread_local int ncclDebugNoWarn; -extern char ncclLastError[]; - -#define WARN(...) printf(__VA_ARGS__) -#define INFO(FLAGS, ...) printf(__VA_ARGS__) -#define TRACE_CALL(...) printf(__VA_ARGS__) - -#ifdef ENABLE_TRACE -#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) -extern std::chrono::steady_clock::time_point ncclEpoch; -#else -#define TRACE(...) -#endif - -void ncclSetThreadName(pthread_t thread, const char *fmt, ...); - -#endif diff --git a/nvls/device.h b/nvls/device.h deleted file mode 100644 index 56f8039f3..000000000 --- a/nvls/device.h +++ /dev/null @@ -1,463 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_DEVICE_H_ -#define NCCL_DEVICE_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "align.h" -#include - -extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS]; - -extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS]; - -extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS]; - -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - -#include "net_device.h" - -enum ncclDevRedOp_t { - ncclDevSum, ncclDevProd, ncclDevMinMax, - ncclDevPreMulSum, ncclDevSumPostDiv, - ncclNumDevRedOps -}; -struct ncclDevRedOpFull { - ncclDevRedOp_t op; - ncclRedOp_t proxyOp; - bool scalarArgIsPtr; - uint64_t scalarArg; -}; - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -#define WARP_SIZE 32 -#define MAXCHANNELS 32 -#define NCCL_MAX_NTHREADS 640 -#define NCCL_SIMPLE_MAX_NTHREADS 512 -#define NCCL_LL_MAX_NTHREADS 512 -#define NCCL_LL_LINES_PER_THREAD 8 -#ifdef TEST_LL_CLEANUP -#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup -#define NCCL_LL_FLAG_MAX 0x100 -#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX)) -#else -#define NCCL_LL_CLEAN_MASK 0x7ffffff8 -#define NCCL_LL_FLAG(a) ((uint32_t)(a)) -#endif -// Make sure the clean mask will last for at least NCCL_NSTEPS -static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); - -#define NCCL_LL128_LINESIZE 128 -#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t)) -#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1) - -#define NCCL_LL128_MAX_NTHREADS 640 -#define NCCL_LL128_ELEMS_PER_THREAD 120 - -#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 8 -#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) - -#define NCCL_DIRECT_WRITE 0x01 -#define NCCL_DIRECT_READ 0x02 -#define NCCL_DIRECT_NIC 0x04 -#define NCCL_IPC_WRITE 0x08 -#define NCCL_IPC_READ 0x10 -#define NCCL_NVLS_MIN_POLL 0x20 - -struct ncclConnInfo { - // Regular comm mechanism - char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send - void* mhandles[NCCL_NUM_PROTOCOLS]; - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - - int flags; // Direct communication / other flags - int shared; // Buffers are shared - void **ptrExchange; // Pointer exchange for direct communication - uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case - - int *sizesFifo; // Sizes fifo from GPU to proxy - int *offsFifo; // Buffer fifo from proxy to GPU - - uint64_t step; // Keep where we are - uint64_t llLastCleaning; - ncclNetDeviceHandle_t netDeviceHandle; -}; - -struct ncclProxyConnector { - int tpRank; - int tpLocalRank; - int sameProcess; - struct ncclProxyConnection* connection; - ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary -}; - -struct ncclConnector { - int connected; - struct ncclProxyConnector proxyConn; - struct ncclTransportComm* transportComm; - void* transportResources; - struct ncclConnInfo conn; -}; - -struct ncclRing { - // Shortcuts for 
userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - - int index; // This rank's index in the ring -}; - - -// The root of each tree only has one node down (+1 intra-node). -#define NCCL_MAX_TREE_ARITY_TOP 2 -// Nodes inside the binary tree can have to two nodes down (+1 intra-node). -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -#define NCCL_MAX_DIRECT_ARITY 7 -struct ncclDirect { - int depth; - int out; - int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down - int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) - int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads - int up[NCCL_MAX_DIRECT_ARITY]; - int down[NCCL_MAX_DIRECT_ARITY]; -}; - -#define NCCL_MAX_NVLS_ARITY 8 -#define NCCL_MAX_NVLS_TREE_ARITY 3 -struct ncclNvls { - int out; - int nHeads; // Number of parallel N<->1<->net operations we'll do in parallel; size of up/down - int headRank; // Index in 0..nHeads-1 I am the head rank of. -1 if I'm not a head rank (no local NIC) - int up[NCCL_MAX_NVLS_ARITY]; - int down; - int treeUp; - int treeDown[NCCL_MAX_NVLS_TREE_ARITY]; - int node; - int nNodes; -}; - -#define NCCL_MAX_CONNS 2 -struct ncclChannelPeer { - struct ncclConnector send[NCCL_MAX_CONNS]; - struct ncclConnector recv[NCCL_MAX_CONNS]; - int refCount; -}; - -struct ncclDevComm; - -/* ncclWork is to be a power of two, currently 8x64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclWorkElem. */ -#define NCCL_WORK_SIZE 512 - -enum ncclWorkType : uint8_t { - ncclWorkTypeUnused=0, - ncclWorkTypeColl=1, - ncclWorkTypeP2p=2, - ncclWorkTypeRegColl=3 -}; -enum ncclWorkP2PType : uint8_t { - ncclWorkP2pTypeUnused=0, - ncclWorkP2pTypeSend, - ncclWorkP2pTypeRecv -}; - -struct ncclWorkHeader { - union { - int32_t workNext; // when isLast=0: Offset from kernel argument workHead - uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back. - }; - uint16_t funcIndex; - uint8_t isLast:1; // last work for this kernel - uint8_t inFifo:1; // is this work in the fifo - enum ncclWorkType type; -}; - -struct ncclWorkElem { - union { - uint8_t flagBits; - struct { - uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1; - }; - }; - uint8_t nWarps; - uint8_t direct; - - const void * sendbuff; - void * recvbuff; - - size_t count; - size_t lastChunkSize; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint64_t redOpArg; -}; - -#define NCCL_MAX_WORK_ELEMENTS ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem)) -static_assert(NCCL_MAX_WORK_ELEMENTS == 9, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 9"); - -struct ncclWorkElemP2p { - int peer : 30; - int proto : 2; - - enum ncclWorkP2PType p2pType; - uint8_t nWarps; - uint8_t warpStart; - uint8_t ngroups; - // Important not to use any fields with greater than 4-byte alignment since - // we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if - // there were 8-byte fields. 
- //void* buff; - uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32; - //size_t count; - uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32; - int chunkSize; -}; - -static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) >= 16, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 16"); -#define NCCL_MAX_WORK_ELEMENTS_P2P 16 - -struct ncclWorkElemReg { - struct ncclWorkElem elem; - void* dnInputs[NCCL_MAX_DIRECT_ARITY+1]; - void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1]; - void* upOutputs[NCCL_MAX_DIRECT_ARITY+1]; -}; - -#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg)) -static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 2, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 2"); - -// Number of named barriers supported by CUDA -#define NCCL_MAX_GROUPS 16 - -struct ncclWork { - struct ncclWorkHeader header; - union { - char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)]; - struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS]; - struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P]; - struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG]; - }; -}; -static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE"); -static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0"); - -struct ncclDevChannelPeer { - // Stripped version of ncclChannelPeer where we only keep the ncclConnInfo - // instead of the full ncclConnector. - struct ncclConnInfo send[NCCL_MAX_CONNS]; - struct ncclConnInfo recv[NCCL_MAX_CONNS]; -}; - -struct alignas(16) ncclDevChannel { - struct ncclDevChannelPeer** peers; - struct ncclRing ring; - struct ncclTree tree; - struct ncclTree collnetChain; - struct ncclDirect collnetDirect; - struct ncclNvls nvls; - uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed -}; - -struct ncclDevComm { - int rank; - int nRanks; - int buffSizes[NCCL_NUM_PROTOCOLS]; - int p2pChunkSize; - - // Operation list for aggregation - int workFifoDepth; - struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory - - // Flag to ask NCCL kernels to abort - volatile uint32_t* abortFlag; - - // Channels, device side - struct ncclDevChannel* channels/*[MAXCHANNELS]*/; -}; - -struct alignas(16) ncclDevCommAndChannels { - struct ncclDevComm comm; - struct ncclDevChannel channels[MAXCHANNELS]; -}; - -#ifdef __CUDA_ARCH__ - #define NCCL_CUDA_ARCH __CUDA_ARCH__ -#else - #define NCCL_CUDA_ARCH 0 -#endif - -template -__host__ __device__ constexpr T min_constexpr(T a) { return a; } -template -__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) { - return min_constexpr((a < b ? a : b), c...); -} - -template -__host__ __device__ constexpr T max_constexpr(T a) { return a; } -template -__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) { - return max_constexpr((a > b ? 
a : b), c...); -} - -// Calculate the unroll factor given: -// * bytePerPack: number of bytes accessed per instruction -// * insns: max permissible unroll value -// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack) -__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) { - return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack); -} - -// Note that all unroll value logic should depend on a given cudaArch argument -// and not __CUDA_ARCH__ since these need to be host-side executable where the -// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device -// side code can elide passing the arch for brevity. - -__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) { - // Our collective unroll should move to the same bytes&insns model as NVLS. - return cudaArch >= 800 ? 8 : 4; -} - -__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; } -__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; } - -__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) { - return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch)); -} - -// The amount of dynamic shmem per warp -__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) { - return (max_constexpr( - /*LL */0, - /*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t), - /*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16, - // NVLS needs an extra 16B to read unaligned data. - /*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16 - ) + 15) & -16; // pad to 16 bytes -} - -// The amount of dynamic shmem per block -__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { - return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE); -} - -// Host-side table of kernel function pointers. -extern int const ncclDevKernelCount; -extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; - -// Table of most specialized kernel function to run given func index. -extern int const ncclDevFuncRowToId[]; -extern void* const ncclDevKernelForFunc[/*funcIndex*/]; -extern bool const ncclDevKernelForFuncIsSpecialized[/*funcIndex*/]; - -// Launch a one-rank reduction on stream. 
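To make the scratch-size formula above concrete, here is a standalone re-derivation using the same constants the header defines (WARP_SIZE 32, 8 LL128 shmem elems per thread, unroll 8 on sm_80 and newer, 64-byte NVLS unroll on sm_90); the helper names and printed values are illustrative arithmetic, not figures taken from the patch:

#include <cstdio>

constexpr int kWarpSize = 32;
constexpr int kLL128ElemsPerThread = 8;            // NCCL_LL128_SHMEM_ELEMS_PER_THREAD
constexpr int kNvlsUnrollBytes = 4 * 16;
constexpr int collUnroll(int arch) { return arch >= 800 ? 8 : 4; }

// Mirrors ncclShmemScratchWarpSize(): max of the per-protocol needs, padded to 16 bytes.
constexpr int scratchWarpSize(int arch) {
  int ll128  = kLL128ElemsPerThread * kWarpSize * 8;                  // 2048 (uint64_t elems)
  int simple = (collUnroll(arch) * kWarpSize + 1) * 16;               // 4112 on sm_80+, 2064 below
  int nvls   = kWarpSize * (arch >= 900 ? kNvlsUnrollBytes : 0) + 16; // 2064 on sm_90, else 16
  int m = ll128 > simple ? ll128 : simple;
  if (nvls > m) m = nvls;
  return (m + 15) & -16;
}

int main() {
  std::printf("sm_90: %d bytes/warp\n", scratchWarpSize(900)); // 4112
  std::printf("sm_70: %d bytes/warp\n", scratchWarpSize(700)); // 2064
}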
-ncclResult_t ncclLaunchOneRank(void* dst, void const* src, size_t nElts, struct ncclDevRedOpFull redOp, ncclDataType_t type, cudaStream_t stream); - -// `ncclNvlsSupported()` needs to be in sync with "func_valid" in "src/device/generate.py" -inline bool ncclNvlsSupported(int devRedOp, int type) { - switch (type) { - case ncclInt32: - case ncclUint32: - case ncclInt64: - case ncclUint64: - case ncclFloat16: - #if defined(__CUDA_BF16_TYPES_EXIST__) - case ncclBfloat16: - #endif - return devRedOp == ncclDevSum || devRedOp == ncclDevMinMax; - case ncclFloat: - case ncclDouble: - return devRedOp == ncclDevSum; - default: - return false; - } -} - -// `ncclDevFuncIndex()` needs to be in sync with "all_functions()" in "src/device/generate.py" -inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) { - #if defined(__CUDA_BF16_TYPES_EXIST__) - constexpr int NumTypes = ncclNumTypes; - #else - constexpr int NumTypes = ncclNumTypes + 1; - #endif - - int row = 0; // ncclDevFuncIndex_P2p - if (coll == ncclFuncSendRecv) goto have_row; - row += 1; - - if (coll == ncclFuncAllGather) { - int algo1 = algo == NCCL_ALGO_RING ? 0 : - /*algo == NCCL_ALGO_NVLS*/ 1; - row += algo1*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += (/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncBroadcast) { - row += proto; - goto have_row; - } - row += (/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncAllReduce) { - row += ((devRedOp*NumTypes + type)*NCCL_NUM_ALGORITHMS + algo)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncReduce) { - row += (devRedOp*NumTypes + type)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/1)*NCCL_NUM_PROTOCOLS; - - if (coll == ncclFuncReduceScatter) { - int algo1 = algo == NCCL_ALGO_RING ? 0 : - /*algo == NCCL_ALGO_NVLS*/ 1; - row += ((devRedOp*NumTypes + type)*2 + algo1)*NCCL_NUM_PROTOCOLS + proto; - goto have_row; - } - row += ncclNumDevRedOps*NumTypes*(/*NumAlgos=*/2)*NCCL_NUM_PROTOCOLS; - -have_row: - return ncclDevFuncRowToId[row]; -} - -inline int ncclDevFuncId_P2p() { return ncclDevFuncRowToId[0]; } - -#endif diff --git a/nvls/enqueue.h b/nvls/enqueue.h deleted file mode 100644 index 634f037cb..000000000 --- a/nvls/enqueue.h +++ /dev/null @@ -1,26 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_ENQUEUE_H_ -#define NCCL_ENQUEUE_H_ - -#include "comm.h" -#include "group.h" -#include "collectives.h" -#include "utils.h" - -#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) -#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ - -ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize); -ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); -ncclResult_t ncclLaunchPrepare(struct ncclComm* comm); -ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan); -ncclResult_t ncclLaunchFinish(struct ncclComm* comm); - -#endif // End include guard diff --git a/nvls/gdrwrap.h b/nvls/gdrwrap.h deleted file mode 100644 index a64674cc5..000000000 --- a/nvls/gdrwrap.h +++ /dev/null @@ -1,252 +0,0 @@ -/************************************************************************* - * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GDRWRAP_H_ -#define NCCL_GDRWRAP_H_ - -#include "nccl.h" -#include // for standard [u]intX_t types -#include -#include - -// These can be used if the GDR library isn't thread safe -#include -extern pthread_mutex_t gdrLock; -#define GDRLOCK() pthread_mutex_lock(&gdrLock) -#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) -#define GDRLOCKCALL(cmd, ret) do { \ - GDRLOCK(); \ - ret = cmd; \ - GDRUNLOCK(); \ -} while(false) - -#define GDRCHECK(cmd) do { \ - int e; \ - /* GDRLOCKCALL(cmd, e); */ \ - e = cmd; \ - if( e != 0 ) { \ - WARN("GDRCOPY failure %d", e); \ - return ncclSystemError; \ - } \ -} while(false) - -// This is required as the GDR memory is mapped WC -#if !defined(__NVCC__) -#if defined(__PPC__) -static inline void wc_store_fence(void) { asm volatile("sync") ; } -#elif defined(__x86_64__) -#include -static inline void wc_store_fence(void) { _mm_sfence(); } -#elif defined(__aarch64__) -#ifdef __cplusplus -#include -static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } -#else -#include -static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } -#endif -#endif -#endif - -//#define GDR_DIRECT 1 -#ifdef GDR_DIRECT -// Call the GDR API library code directly rather than via -// dlopen() wrappers -#include - -static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } -static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } -static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } -static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { - GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { - GDRCHECK(gdr_unpin_buffer(g, handle)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { - GDRCHECK(gdr_get_info(g, handle, info)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void 
**va, size_t size) { - GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { - GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size)); - return ncclSuccess; -} -static void wrap_gdr_runtime_get_version(int *major, int *minor) { - gdr_runtime_get_version(major, minor); - return ncclSuccess; -} -static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { - gdr_driver_get_version(g, major, minor); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { - GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size)); - return ncclSuccess; -} -static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { - GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size)); - return ncclSuccess; -} - -#else -// Dynamically handle dependency the GDR API library - -/* Extracted from gdrapi.h (v2.1 Nov 2020) */ - -#define GPU_PAGE_SHIFT 16 -#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) -#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) -#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) - -struct gdr; -typedef struct gdr *gdr_t; - -typedef struct gdr_mh_s { - unsigned long h; -} gdr_mh_t; - -struct gdr_info { - uint64_t va; - uint64_t mapped_size; - uint32_t page_size; - uint64_t tm_cycles; - uint32_t cycles_per_ms; - unsigned mapped:1; - unsigned wc_mapping:1; -}; -typedef struct gdr_info gdr_info_t; - -/* End of gdrapi.h */ - -ncclResult_t wrap_gdr_symbols(void); - -gdr_t wrap_gdr_open(void); -ncclResult_t wrap_gdr_close(gdr_t g); -ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); -ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); -ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); -ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size); -ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size); -ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor); -ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor); -ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); -ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); - -#endif // GDR_DIRECT - -// Global GDR driver handle -extern gdr_t ncclGdrCopy; - -#include "alloc.h" - -typedef struct gdr_mem_desc { - void *gdrDevMem; - void *gdrMap; - size_t gdrOffset; - size_t gdrMapSize; - gdr_mh_t gdrMh; -} gdr_mem_desc_t; - -static gdr_t ncclGdrInit() { - int libMajor, libMinor, drvMajor, drvMinor; - gdr_t handle = NULL; - // Dynamically load the GDRAPI library symbols - if (wrap_gdr_symbols() == ncclSuccess) { - handle = wrap_gdr_open(); - - if (handle != NULL) { - ncclResult_t res; - - // Query the version of libgdrapi - NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); - - // Query the version of gdrdrv driver - NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); - - // Only support GDRAPI 2.1 and later - if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { - goto error; - } - else - INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); 
- } - } - return handle; -error: - if (handle != NULL) (void) wrap_gdr_close(handle); - return NULL; -} - -template -static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { - gdr_info_t info; - size_t mapSize; - gdr_mh_t mh; - char *devMem; - void *gdrMap; - - mapSize = sizeof(T)*nelem; - - // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE - ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); - // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too - NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); - uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; - size_t align = alignedAddr - (uint64_t)devMem; - - //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); - NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); - - NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); - //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); - - NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); - - // Will offset ever be non zero ? - ssize_t off = info.va - alignedAddr; - - gdr_mem_desc_t* md; - NCCLCHECK(ncclCalloc(&md, 1)); - md->gdrDevMem = devMem; - md->gdrMap = gdrMap; - md->gdrMapSize = mapSize; - md->gdrOffset = off+align; - md->gdrMh = mh; - *gdrHandle = md; - - *ptr = (T *)((char *)gdrMap+off); - if (devPtr) *devPtr = (T *)(devMem+off+align); - - TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", - md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); - - return ncclSuccess; -} - -template -static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { - gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); - return ncclSuccess; -} - -static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { - gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; - NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); - NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); - NCCLCHECK(ncclCudaFree(md->gdrDevMem)); - free(md); - - return ncclSuccess; -} - -#endif // End include guard diff --git a/nvls/graph.h b/nvls/graph.h deleted file mode 100644 index fdd634894..000000000 --- a/nvls/graph.h +++ /dev/null @@ -1,116 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
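The GDRCopy path above pins a region that must start on a 64 KiB GPU page; below is a small standalone sketch of the rounding arithmetic ncclGdrCudaCalloc performs, using the GPU_PAGE constants from gdrwrap.h (the sample device address is made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define GPU_PAGE_SHIFT  16
#define GPU_PAGE_SIZE   (1UL << GPU_PAGE_SHIFT)      /* 64 KiB */
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
#define GPU_PAGE_MASK   (~GPU_PAGE_OFFSET)

int main(void) {
  uint64_t devMem = 0x7f0000012345UL;   /* pretend allocation start, not page aligned */
  /* Round up to the next 64 KiB boundary, as ncclGdrCudaCalloc does before pinning. */
  uint64_t alignedAddr = (devMem + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
  uint64_t align = alignedAddr - devMem; /* bytes skipped at the front of the buffer */
  printf("aligned=0x%llx skipped=%llu\n",
         (unsigned long long)alignedAddr, (unsigned long long)align);
  /* prints aligned=0x7f0000020000 skipped=56507 */
  return 0;
}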
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GRAPH_H_ -#define NCCL_GRAPH_H_ - -#include "nccl.h" -#include "device.h" -#include -#include -#include -#include -#include - -ncclResult_t ncclTopoCudaPath(int cudaDev, char** path); - -struct ncclTopoSystem; -// Build the topology -ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system); -ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system); -ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system); - -ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm); -void ncclTopoFree(struct ncclTopoSystem* system); -ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm); -ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); -ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks); -int ncclTopoPathAllNVLink(struct ncclTopoSystem* system); - -// Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); -ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); -ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush); -ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net); -int ncclPxnDisable(struct ncclComm* comm); -ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks); - -// Find CPU affinity -ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity); - -#define NCCL_TOPO_CPU_ARCH_X86 1 -#define NCCL_TOPO_CPU_ARCH_POWER 2 -#define NCCL_TOPO_CPU_ARCH_ARM 3 -#define NCCL_TOPO_CPU_VENDOR_INTEL 1 -#define NCCL_TOPO_CPU_VENDOR_AMD 2 -#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 -#define NCCL_TOPO_CPU_TYPE_BDW 1 -#define NCCL_TOPO_CPU_TYPE_SKL 2 -#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 -ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); -ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); -ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id); -ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex); - -#define NCCL_TOPO_MAX_NODES 256 - -// Init search. 
Needs to be done before calling ncclTopoCompute -ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system); - -#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU) -#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU) -#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU -#define NCCL_TOPO_PATTERN_RING 4 // Ring -#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree -struct ncclTopoGraph { - // Input / output - int id; // ring : 0, tree : 1, collnet : 2 - int pattern; - int crossNic; - int collNet; - int minChannels; - int maxChannels; - // Output - int nChannels; - float bwIntra; - float bwInter; - float latencyInter; - int typeIntra; - int typeInter; - int sameChannels; - int nHops; - int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES]; - int inter[MAXCHANNELS*2]; -}; -ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); - -ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph); -ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs); - -struct ncclTopoRanks { - int ringRecv[MAXCHANNELS]; - int ringSend[MAXCHANNELS]; - int ringPrev[MAXCHANNELS]; - int ringNext[MAXCHANNELS]; - int treeToParent[MAXCHANNELS]; - int treeToChild0[MAXCHANNELS]; - int treeToChild1[MAXCHANNELS]; - int nvlsHeads[MAXCHANNELS]; -}; - -ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks); - -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, - struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs); - -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs); -#include "info.h" -ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time, bool* backup = NULL); - -#endif diff --git a/nvls/group.h b/nvls/group.h deleted file mode 100644 index 72251147f..000000000 --- a/nvls/group.h +++ /dev/null @@ -1,137 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_GROUP_H_ -#define NCCL_GROUP_H_ - -#include "nccl.h" -#include "comm.h" - -ncclResult_t ncclGroupErrCheck(ncclResult_t ret); -void ncclGroupCommJoin(struct ncclComm* comm); -void ncclGroupCommPreconnect(struct ncclComm* comm); -ncclResult_t ncclGroupCommLeave(struct ncclComm* comm); -ncclResult_t ncclGroupJobAbort(struct ncclGroupJob* groupJob); -ncclResult_t ncclGroupJobComplete(struct ncclGroupJob *groupJob); - -typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); - -ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev); - -typedef enum ncclGroupJobState { - ncclGroupJobRunning = 0, - ncclGroupJobDone = 1, - ncclGroupJobJoined = 2, -} ncclGroupJobState_t; - -struct ncclAsyncJob { - struct ncclAsyncJob* next; - pthread_t thread; - ncclResult_t result; - ncclResult_t(*func)(struct ncclAsyncJob*); - void(*undo)(struct ncclAsyncJob*); - void(*destructor)(void*); - ncclGroupJobState_t state; - volatile uint32_t *abortFlag; /* point to comm abortFlag */ - volatile uint32_t *childAbortFlag; /* point to child abortFlag */ - ncclComm_t comm; -}; - -ncclResult_t ncclAsyncLaunch( - struct ncclAsyncJob* job, - ncclResult_t(*func)(struct ncclAsyncJob*), - void(*undo)(struct ncclAsyncJob*), - void(*destructor)(void*), ncclComm_t comm -); - -struct ncclGroupJob { - struct ncclAsyncJob base; - struct ncclComm **groupCommHeadPtr; - struct ncclComm **groupCommPreconnectHeadPtr; - ncclResult_t *groupErrorPtr; - volatile bool *abortFlagPtr; - int *groupBlockingPtr; - struct ncclIntruQueue *asyncJobsPtr; - bool initialized; -}; - -ncclResult_t ncclGroupStartInternal(); -ncclResult_t ncclGroupEndInternal(); -ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job); - -//////////////////////////////////////////////////////////////////////////////// - -extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting -extern __thread ncclResult_t ncclGroupError; -extern __thread struct ncclComm* ncclGroupCommHead; -extern __thread struct ncclComm* ncclGroupCommPreconnectHead; -extern __thread int ncclGroupBlocking; -extern __thread struct ncclGroupJob *ncclGroupJobMainPtr; -extern __thread struct ncclGroupJob ncclGroupJobMain; - -static inline void groupResetJobState() { - ncclGroupBlocking = -1; - ncclGroupJobMainPtr = NULL; - memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob)); - return; -} - -static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) { - ncclResult_t ret = ncclSuccess; - if (job) { - ret = ncclAsyncJobComplete(&job->base); - groupResetJobState(); - } - return ret; -} - -inline ncclResult_t ncclGroupStartInternal() { - ncclGroupDepth++; - return ncclSuccess; -} - -inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { - if (ncclGroupDepth > 0) { - if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; - } - return ret; -} - -// Add comm to this thread's group -inline void ncclGroupCommJoin(struct ncclComm* comm) { - if (comm->groupNext == reinterpret_cast(0x1)) { - // Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves - // the users program order yet insures siblings occur consecutively. This - // is required by doLaunches() in "group.cc". 
- struct ncclComm** pp = &ncclGroupCommHead; - while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0) - pp = &(*pp)->groupNext; - comm->groupNext = *pp; - *pp = comm; - // Comms gets a new memory stack scope upon joining. Each task batched for - // this comm is allocated there. - ncclMemoryStackPush(&comm->memScoped); - } - - ncclGroupBlocking = comm->config.blocking; -} - -// Add comm to this thread's group needing preconnect -inline void ncclGroupCommPreconnect(struct ncclComm* comm) { - if (comm->preconnectNext == reinterpret_cast(0x1)) { - comm->preconnectNext = ncclGroupCommPreconnectHead; - ncclGroupCommPreconnectHead = comm; - } -} - -// Comm has left group -inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) { - comm->groupNext = reinterpret_cast(0x1); - ncclMemoryStackPop(&comm->memScoped); - return ncclSuccess; -} - -#endif diff --git a/nvls/ibvcore.h b/nvls/ibvcore.h deleted file mode 100644 index 8d8ecf1ec..000000000 --- a/nvls/ibvcore.h +++ /dev/null @@ -1,1058 +0,0 @@ -#ifndef NCCL_IBV_CORE_H_ -#define NCCL_IBV_CORE_H_ - -/* Basic IB verbs structs. Needed to dynamically load IB verbs functions without - * explicit including of IB verbs header. - */ - -#include -#include -#include -#include - -#if __GNUC__ >= 3 -# define __attribute_const __attribute__((const)) -#else -# define __attribute_const -#endif - -union ibv_gid { - uint8_t raw[16]; - struct { - uint64_t subnet_prefix; - uint64_t interface_id; - } global; -}; - -#ifndef container_of -/** - * container_of - cast a member of a structure out to the containing structure - * @ptr: the pointer to the member. - * @type: the type of the container struct this is embedded in. - * @member: the name of the member within the struct. - * - */ -#define container_of(ptr, type, member) \ - ((type *) ((uint8_t *)(ptr) - offsetof(type, member))) -#endif - -#define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -//static void *__VERBS_ABI_IS_EXTENDED = ((uint8_t *)NULL) - 1; - -enum ibv_node_type { - IBV_NODE_UNKNOWN = -1, - IBV_NODE_CA = 1, - IBV_NODE_SWITCH, - IBV_NODE_ROUTER, - IBV_NODE_RNIC, - - /* Leave a gap for future node types before starting with - * experimental node types. - */ - IBV_EXP_NODE_TYPE_START = 32, - IBV_EXP_NODE_MIC = IBV_EXP_NODE_TYPE_START -}; - -enum ibv_transport_type { - IBV_TRANSPORT_UNKNOWN = -1, - IBV_TRANSPORT_IB = 0, - IBV_TRANSPORT_IWARP, - - /* Leave a gap for future transport types before starting with - * experimental transport types. 
- */ - IBV_EXP_TRANSPORT_TYPE_START = 32, - IBV_EXP_TRANSPORT_SCIF = IBV_EXP_TRANSPORT_TYPE_START -}; - -enum ibv_device_cap_flags { - IBV_DEVICE_RESIZE_MAX_WR = 1, - IBV_DEVICE_BAD_PKEY_CNTR = 1 << 1, - IBV_DEVICE_BAD_QKEY_CNTR = 1 << 2, - IBV_DEVICE_RAW_MULTI = 1 << 3, - IBV_DEVICE_AUTO_PATH_MIG = 1 << 4, - IBV_DEVICE_CHANGE_PHY_PORT = 1 << 5, - IBV_DEVICE_UD_AV_PORT_ENFORCE = 1 << 6, - IBV_DEVICE_CURR_QP_STATE_MOD = 1 << 7, - IBV_DEVICE_SHUTDOWN_PORT = 1 << 8, - IBV_DEVICE_INIT_TYPE = 1 << 9, - IBV_DEVICE_PORT_ACTIVE_EVENT = 1 << 10, - IBV_DEVICE_SYS_IMAGE_GUID = 1 << 11, - IBV_DEVICE_RC_RNR_NAK_GEN = 1 << 12, - IBV_DEVICE_SRQ_RESIZE = 1 << 13, - IBV_DEVICE_N_NOTIFY_CQ = 1 << 14, - IBV_DEVICE_XRC = 1 << 20, - IBV_DEVICE_MANAGED_FLOW_STEERING = 1 << 29 -}; - -enum ibv_atomic_cap { - IBV_ATOMIC_NONE, - IBV_ATOMIC_HCA, - IBV_ATOMIC_GLOB -}; - -struct ibv_device_attr { - char fw_ver[64]; - uint64_t node_guid; - uint64_t sys_image_guid; - uint64_t max_mr_size; - uint64_t page_size_cap; - uint32_t vendor_id; - uint32_t vendor_part_id; - uint32_t hw_ver; - int max_qp; - int max_qp_wr; - int device_cap_flags; - int max_sge; - int max_sge_rd; - int max_cq; - int max_cqe; - int max_mr; - int max_pd; - int max_qp_rd_atom; - int max_ee_rd_atom; - int max_res_rd_atom; - int max_qp_init_rd_atom; - int max_ee_init_rd_atom; - enum ibv_atomic_cap atomic_cap; - int max_ee; - int max_rdd; - int max_mw; - int max_raw_ipv6_qp; - int max_raw_ethy_qp; - int max_mcast_grp; - int max_mcast_qp_attach; - int max_total_mcast_qp_attach; - int max_ah; - int max_fmr; - int max_map_per_fmr; - int max_srq; - int max_srq_wr; - int max_srq_sge; - uint16_t max_pkeys; - uint8_t local_ca_ack_delay; - uint8_t phys_port_cnt; -}; - -enum ibv_mtu { - IBV_MTU_256 = 1, - IBV_MTU_512 = 2, - IBV_MTU_1024 = 3, - IBV_MTU_2048 = 4, - IBV_MTU_4096 = 5 -}; - -enum ibv_port_state { - IBV_PORT_NOP = 0, - IBV_PORT_DOWN = 1, - IBV_PORT_INIT = 2, - IBV_PORT_ARMED = 3, - IBV_PORT_ACTIVE = 4, - IBV_PORT_ACTIVE_DEFER = 5 -}; - -enum { - IBV_LINK_LAYER_UNSPECIFIED, - IBV_LINK_LAYER_INFINIBAND, - IBV_LINK_LAYER_ETHERNET, - - /* Leave a gap for future link layer types before starting with - * experimental link layer. 
- */ - IBV_EXP_LINK_LAYER_START = 32, - IBV_EXP_LINK_LAYER_SCIF = IBV_EXP_LINK_LAYER_START -}; - -enum ibv_port_cap_flags { - IBV_PORT_SM = 1 << 1, - IBV_PORT_NOTICE_SUP = 1 << 2, - IBV_PORT_TRAP_SUP = 1 << 3, - IBV_PORT_OPT_IPD_SUP = 1 << 4, - IBV_PORT_AUTO_MIGR_SUP = 1 << 5, - IBV_PORT_SL_MAP_SUP = 1 << 6, - IBV_PORT_MKEY_NVRAM = 1 << 7, - IBV_PORT_PKEY_NVRAM = 1 << 8, - IBV_PORT_LED_INFO_SUP = 1 << 9, - IBV_PORT_SYS_IMAGE_GUID_SUP = 1 << 11, - IBV_PORT_PKEY_SW_EXT_PORT_TRAP_SUP = 1 << 12, - IBV_PORT_EXTENDED_SPEEDS_SUP = 1 << 14, - IBV_PORT_CM_SUP = 1 << 16, - IBV_PORT_SNMP_TUNNEL_SUP = 1 << 17, - IBV_PORT_REINIT_SUP = 1 << 18, - IBV_PORT_DEVICE_MGMT_SUP = 1 << 19, - IBV_PORT_VENDOR_CLASS = 1 << 24, - IBV_PORT_CLIENT_REG_SUP = 1 << 25, - IBV_PORT_IP_BASED_GIDS = 1 << 26, -}; - -struct ibv_port_attr { - enum ibv_port_state state; - enum ibv_mtu max_mtu; - enum ibv_mtu active_mtu; - int gid_tbl_len; - uint32_t port_cap_flags; - uint32_t max_msg_sz; - uint32_t bad_pkey_cntr; - uint32_t qkey_viol_cntr; - uint16_t pkey_tbl_len; - uint16_t lid; - uint16_t sm_lid; - uint8_t lmc; - uint8_t max_vl_num; - uint8_t sm_sl; - uint8_t subnet_timeout; - uint8_t init_type_reply; - uint8_t active_width; - uint8_t active_speed; - uint8_t phys_state; - uint8_t link_layer; - uint8_t reserved; -}; - -enum ibv_event_type { - IBV_EVENT_CQ_ERR, - IBV_EVENT_QP_FATAL, - IBV_EVENT_QP_REQ_ERR, - IBV_EVENT_QP_ACCESS_ERR, - IBV_EVENT_COMM_EST, - IBV_EVENT_SQ_DRAINED, - IBV_EVENT_PATH_MIG, - IBV_EVENT_PATH_MIG_ERR, - IBV_EVENT_DEVICE_FATAL, - IBV_EVENT_PORT_ACTIVE, - IBV_EVENT_PORT_ERR, - IBV_EVENT_LID_CHANGE, - IBV_EVENT_PKEY_CHANGE, - IBV_EVENT_SM_CHANGE, - IBV_EVENT_SRQ_ERR, - IBV_EVENT_SRQ_LIMIT_REACHED, - IBV_EVENT_QP_LAST_WQE_REACHED, - IBV_EVENT_CLIENT_REREGISTER, - IBV_EVENT_GID_CHANGE, - - /* new experimental events start here leaving enough - * room for 14 events which should be enough - */ - IBV_EXP_EVENT_DCT_KEY_VIOLATION = 32, - IBV_EXP_EVENT_DCT_ACCESS_ERR, - IBV_EXP_EVENT_DCT_REQ_ERR, -}; - -struct ibv_async_event { - union { - struct ibv_cq *cq; - struct ibv_qp *qp; - struct ibv_srq *srq; - struct ibv_exp_dct *dct; - int port_num; - /* For source compatible with Legacy API */ - uint32_t xrc_qp_num; - } element; - enum ibv_event_type event_type; -}; - -enum ibv_wc_status { - IBV_WC_SUCCESS, - IBV_WC_LOC_LEN_ERR, - IBV_WC_LOC_QP_OP_ERR, - IBV_WC_LOC_EEC_OP_ERR, - IBV_WC_LOC_PROT_ERR, - IBV_WC_WR_FLUSH_ERR, - IBV_WC_MW_BIND_ERR, - IBV_WC_BAD_RESP_ERR, - IBV_WC_LOC_ACCESS_ERR, - IBV_WC_REM_INV_REQ_ERR, - IBV_WC_REM_ACCESS_ERR, - IBV_WC_REM_OP_ERR, - IBV_WC_RETRY_EXC_ERR, - IBV_WC_RNR_RETRY_EXC_ERR, - IBV_WC_LOC_RDD_VIOL_ERR, - IBV_WC_REM_INV_RD_REQ_ERR, - IBV_WC_REM_ABORT_ERR, - IBV_WC_INV_EECN_ERR, - IBV_WC_INV_EEC_STATE_ERR, - IBV_WC_FATAL_ERR, - IBV_WC_RESP_TIMEOUT_ERR, - IBV_WC_GENERAL_ERR -}; -const char *ibv_wc_status_str(enum ibv_wc_status status); - -enum ibv_wc_opcode { - IBV_WC_SEND, - IBV_WC_RDMA_WRITE, - IBV_WC_RDMA_READ, - IBV_WC_COMP_SWAP, - IBV_WC_FETCH_ADD, - IBV_WC_BIND_MW, -/* - * Set value of IBV_WC_RECV so consumers can test if a completion is a - * receive by testing (opcode & IBV_WC_RECV). 
- */ - IBV_WC_RECV = 1 << 7, - IBV_WC_RECV_RDMA_WITH_IMM -}; - -enum ibv_wc_flags { - IBV_WC_GRH = 1 << 0, - IBV_WC_WITH_IMM = 1 << 1 -}; - -struct ibv_wc { - uint64_t wr_id; - enum ibv_wc_status status; - enum ibv_wc_opcode opcode; - uint32_t vendor_err; - uint32_t byte_len; - uint32_t imm_data; /* in network byte order */ - uint32_t qp_num; - uint32_t src_qp; - int wc_flags; - uint16_t pkey_index; - uint16_t slid; - uint8_t sl; - uint8_t dlid_path_bits; -}; - -enum ibv_access_flags { - IBV_ACCESS_LOCAL_WRITE = 1, - IBV_ACCESS_REMOTE_WRITE = (1<<1), - IBV_ACCESS_REMOTE_READ = (1<<2), - IBV_ACCESS_REMOTE_ATOMIC = (1<<3), - IBV_ACCESS_MW_BIND = (1<<4), - IBV_ACCESS_RELAXED_ORDERING = (1<<20), -}; - -struct ibv_pd { - struct ibv_context *context; - uint32_t handle; -}; - -enum ibv_xrcd_init_attr_mask { - IBV_XRCD_INIT_ATTR_FD = 1 << 0, - IBV_XRCD_INIT_ATTR_OFLAGS = 1 << 1, - IBV_XRCD_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_xrcd_init_attr { - uint32_t comp_mask; - int fd; - int oflags; -}; - -struct ibv_xrcd { - struct ibv_context *context; -}; - -enum ibv_rereg_mr_flags { - IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), - IBV_REREG_MR_CHANGE_PD = (1 << 1), - IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), - IBV_REREG_MR_KEEP_VALID = (1 << 3) -}; - -struct ibv_mr { - struct ibv_context *context; - struct ibv_pd *pd; - void *addr; - size_t length; - uint32_t handle; - uint32_t lkey; - uint32_t rkey; -}; - -enum ibv_mw_type { - IBV_MW_TYPE_1 = 1, - IBV_MW_TYPE_2 = 2 -}; - -struct ibv_mw { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t rkey; -}; - -struct ibv_global_route { - union ibv_gid dgid; - uint32_t flow_label; - uint8_t sgid_index; - uint8_t hop_limit; - uint8_t traffic_class; -}; - -struct ibv_grh { - uint32_t version_tclass_flow; - uint16_t paylen; - uint8_t next_hdr; - uint8_t hop_limit; - union ibv_gid sgid; - union ibv_gid dgid; -}; - -enum ibv_rate { - IBV_RATE_MAX = 0, - IBV_RATE_2_5_GBPS = 2, - IBV_RATE_5_GBPS = 5, - IBV_RATE_10_GBPS = 3, - IBV_RATE_20_GBPS = 6, - IBV_RATE_30_GBPS = 4, - IBV_RATE_40_GBPS = 7, - IBV_RATE_60_GBPS = 8, - IBV_RATE_80_GBPS = 9, - IBV_RATE_120_GBPS = 10, - IBV_RATE_14_GBPS = 11, - IBV_RATE_56_GBPS = 12, - IBV_RATE_112_GBPS = 13, - IBV_RATE_168_GBPS = 14, - IBV_RATE_25_GBPS = 15, - IBV_RATE_100_GBPS = 16, - IBV_RATE_200_GBPS = 17, - IBV_RATE_300_GBPS = 18 -}; - -/** - * ibv_rate_to_mult - Convert the IB rate enum to a multiple of the - * base rate of 2.5 Gbit/sec. For example, IBV_RATE_5_GBPS will be - * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec. - * @rate: rate to convert. - */ -int ibv_rate_to_mult(enum ibv_rate rate) __attribute_const; - -/** - * mult_to_ibv_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate enum. - * @mult: multiple to convert. - */ -enum ibv_rate mult_to_ibv_rate(int mult) __attribute_const; - -/** - * ibv_rate_to_mbps - Convert the IB rate enum to Mbit/sec. - * For example, IBV_RATE_5_GBPS will return the value 5000. - * @rate: rate to convert. - */ -int ibv_rate_to_mbps(enum ibv_rate rate) __attribute_const; - -/** - * mbps_to_ibv_rate - Convert a Mbit/sec value to an IB rate enum. - * @mbps: value to convert. 
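As the comment above explains, IBV_WC_RECV is placed at 1 << 7 so that every receive-side completion (including IBV_WC_RECV_RDMA_WITH_IMM) carries that bit, letting a consumer classify a polled completion with a single mask test. A minimal sketch of the intended check:

// Assumes 'wc' was filled in by a prior poll of a completion queue.
static inline bool wcIsReceive(const struct ibv_wc* wc) {
  // Covers both IBV_WC_RECV and IBV_WC_RECV_RDMA_WITH_IMM in one test.
  return (wc->opcode & IBV_WC_RECV) != 0;
}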
- */ -enum ibv_rate mbps_to_ibv_rate(int mbps) __attribute_const; - -struct ibv_ah_attr { - struct ibv_global_route grh; - uint16_t dlid; - uint8_t sl; - uint8_t src_path_bits; - uint8_t static_rate; - uint8_t is_global; - uint8_t port_num; -}; - -enum ibv_srq_attr_mask { - IBV_SRQ_MAX_WR = 1 << 0, - IBV_SRQ_LIMIT = 1 << 1 -}; - -struct ibv_srq_attr { - uint32_t max_wr; - uint32_t max_sge; - uint32_t srq_limit; -}; - -struct ibv_srq_init_attr { - void *srq_context; - struct ibv_srq_attr attr; -}; - -enum ibv_srq_type { - IBV_SRQT_BASIC, - IBV_SRQT_XRC -}; - -enum ibv_srq_init_attr_mask { - IBV_SRQ_INIT_ATTR_TYPE = 1 << 0, - IBV_SRQ_INIT_ATTR_PD = 1 << 1, - IBV_SRQ_INIT_ATTR_XRCD = 1 << 2, - IBV_SRQ_INIT_ATTR_CQ = 1 << 3, - IBV_SRQ_INIT_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_srq_init_attr_ex { - void *srq_context; - struct ibv_srq_attr attr; - - uint32_t comp_mask; - enum ibv_srq_type srq_type; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; - struct ibv_cq *cq; -}; - -enum ibv_qp_type { - IBV_QPT_RC = 2, - IBV_QPT_UC, - IBV_QPT_UD, - /* XRC compatible code */ - IBV_QPT_XRC, - IBV_QPT_RAW_PACKET = 8, - IBV_QPT_RAW_ETH = 8, - IBV_QPT_XRC_SEND = 9, - IBV_QPT_XRC_RECV, - - /* Leave a gap for future qp types before starting with - * experimental qp types. - */ - IBV_EXP_QP_TYPE_START = 32, - IBV_EXP_QPT_DC_INI = IBV_EXP_QP_TYPE_START -}; - -struct ibv_qp_cap { - uint32_t max_send_wr; - uint32_t max_recv_wr; - uint32_t max_send_sge; - uint32_t max_recv_sge; - uint32_t max_inline_data; -}; - -struct ibv_qp_init_attr { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - /* Below is needed for backwards compatabile */ - struct ibv_xrc_domain *xrc_domain; -}; - -enum ibv_qp_init_attr_mask { - IBV_QP_INIT_ATTR_PD = 1 << 0, - IBV_QP_INIT_ATTR_XRCD = 1 << 1, - IBV_QP_INIT_ATTR_RESERVED = 1 << 2 -}; - -struct ibv_qp_init_attr_ex { - void *qp_context; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - struct ibv_qp_cap cap; - enum ibv_qp_type qp_type; - int sq_sig_all; - - uint32_t comp_mask; - struct ibv_pd *pd; - struct ibv_xrcd *xrcd; -}; - -enum ibv_qp_open_attr_mask { - IBV_QP_OPEN_ATTR_NUM = 1 << 0, - IBV_QP_OPEN_ATTR_XRCD = 1 << 1, - IBV_QP_OPEN_ATTR_CONTEXT = 1 << 2, - IBV_QP_OPEN_ATTR_TYPE = 1 << 3, - IBV_QP_OPEN_ATTR_RESERVED = 1 << 4 -}; - -struct ibv_qp_open_attr { - uint32_t comp_mask; - uint32_t qp_num; - struct ibv_xrcd *xrcd; - void *qp_context; - enum ibv_qp_type qp_type; -}; - -enum ibv_qp_attr_mask { - IBV_QP_STATE = 1 << 0, - IBV_QP_CUR_STATE = 1 << 1, - IBV_QP_EN_SQD_ASYNC_NOTIFY = 1 << 2, - IBV_QP_ACCESS_FLAGS = 1 << 3, - IBV_QP_PKEY_INDEX = 1 << 4, - IBV_QP_PORT = 1 << 5, - IBV_QP_QKEY = 1 << 6, - IBV_QP_AV = 1 << 7, - IBV_QP_PATH_MTU = 1 << 8, - IBV_QP_TIMEOUT = 1 << 9, - IBV_QP_RETRY_CNT = 1 << 10, - IBV_QP_RNR_RETRY = 1 << 11, - IBV_QP_RQ_PSN = 1 << 12, - IBV_QP_MAX_QP_RD_ATOMIC = 1 << 13, - IBV_QP_ALT_PATH = 1 << 14, - IBV_QP_MIN_RNR_TIMER = 1 << 15, - IBV_QP_SQ_PSN = 1 << 16, - IBV_QP_MAX_DEST_RD_ATOMIC = 1 << 17, - IBV_QP_PATH_MIG_STATE = 1 << 18, - IBV_QP_CAP = 1 << 19, - IBV_QP_DEST_QPN = 1 << 20 -}; - -enum ibv_qp_state { - IBV_QPS_RESET, - IBV_QPS_INIT, - IBV_QPS_RTR, - IBV_QPS_RTS, - IBV_QPS_SQD, - IBV_QPS_SQE, - IBV_QPS_ERR, - IBV_QPS_UNKNOWN -}; - -enum ibv_mig_state { - IBV_MIG_MIGRATED, - IBV_MIG_REARM, - IBV_MIG_ARMED -}; - -struct ibv_qp_attr { - enum ibv_qp_state qp_state; - enum ibv_qp_state cur_qp_state; - enum ibv_mtu 
path_mtu; - enum ibv_mig_state path_mig_state; - uint32_t qkey; - uint32_t rq_psn; - uint32_t sq_psn; - uint32_t dest_qp_num; - int qp_access_flags; - struct ibv_qp_cap cap; - struct ibv_ah_attr ah_attr; - struct ibv_ah_attr alt_ah_attr; - uint16_t pkey_index; - uint16_t alt_pkey_index; - uint8_t en_sqd_async_notify; - uint8_t sq_draining; - uint8_t max_rd_atomic; - uint8_t max_dest_rd_atomic; - uint8_t min_rnr_timer; - uint8_t port_num; - uint8_t timeout; - uint8_t retry_cnt; - uint8_t rnr_retry; - uint8_t alt_port_num; - uint8_t alt_timeout; -}; - -enum ibv_wr_opcode { - IBV_WR_RDMA_WRITE, - IBV_WR_RDMA_WRITE_WITH_IMM, - IBV_WR_SEND, - IBV_WR_SEND_WITH_IMM, - IBV_WR_RDMA_READ, - IBV_WR_ATOMIC_CMP_AND_SWP, - IBV_WR_ATOMIC_FETCH_AND_ADD -}; - -enum ibv_send_flags { - IBV_SEND_FENCE = 1 << 0, - IBV_SEND_SIGNALED = 1 << 1, - IBV_SEND_SOLICITED = 1 << 2, - IBV_SEND_INLINE = 1 << 3 -}; - -struct ibv_sge { - uint64_t addr; - uint32_t length; - uint32_t lkey; -}; - -struct ibv_send_wr { - uint64_t wr_id; - struct ibv_send_wr *next; - struct ibv_sge *sg_list; - int num_sge; - enum ibv_wr_opcode opcode; - int send_flags; - uint32_t imm_data; /* in network byte order */ - union { - struct { - uint64_t remote_addr; - uint32_t rkey; - } rdma; - struct { - uint64_t remote_addr; - uint64_t compare_add; - uint64_t swap; - uint32_t rkey; - } atomic; - struct { - struct ibv_ah *ah; - uint32_t remote_qpn; - uint32_t remote_qkey; - } ud; - } wr; - union { - union { - struct { - uint32_t remote_srqn; - } xrc; - } qp_type; - - uint32_t xrc_remote_srq_num; - }; -}; - -struct ibv_recv_wr { - uint64_t wr_id; - struct ibv_recv_wr *next; - struct ibv_sge *sg_list; - int num_sge; -}; - -struct ibv_mw_bind { - uint64_t wr_id; - struct ibv_mr *mr; - void *addr; - size_t length; - int send_flags; - int mw_access_flags; -}; - -struct ibv_srq { - struct ibv_context *context; - void *srq_context; - struct ibv_pd *pd; - uint32_t handle; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; - - /* below are for source compatabilty with legacy XRC, - * padding based on ibv_srq_legacy. 
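The ibv_qp_attr, ibv_qp_attr_mask and ibv_qp_state definitions above are what drive the usual RESET -> INIT -> RTR -> RTS bring-up of an RC queue pair. The condensed sketch below is written against the stock libibverbs API (which these declarations mirror); ibv_modify_qp() is not declared in this header, and the port, PSN, MTU and timeout values are placeholders rather than values NCCL necessarily uses:

#include <infiniband/verbs.h>
#include <string.h>

// Hedged sketch of the conventional RC QP state transitions.
static int qpBringUp(struct ibv_qp* qp, uint32_t destQpn, uint16_t destLid) {
  struct ibv_qp_attr attr;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_INIT;
  attr.pkey_index = 0;
  attr.port_num = 1;
  attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ;
  if (ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS)) return -1;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTR;
  attr.path_mtu = IBV_MTU_4096;
  attr.dest_qp_num = destQpn;
  attr.rq_psn = 0;
  attr.max_dest_rd_atomic = 1;
  attr.min_rnr_timer = 12;
  attr.ah_attr.dlid = destLid;
  attr.ah_attr.port_num = 1;
  if (ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
      IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER)) return -1;

  memset(&attr, 0, sizeof(attr));
  attr.qp_state = IBV_QPS_RTS;
  attr.timeout = 14;
  attr.retry_cnt = 7;
  attr.rnr_retry = 7;
  attr.sq_psn = 0;
  attr.max_rd_atomic = 1;
  return ibv_modify_qp(qp, &attr,
      IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT |
      IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
}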
- */ - uint32_t xrc_srq_num_bin_compat_padding; - struct ibv_xrc_domain *xrc_domain_bin_compat_padding; - struct ibv_cq *xrc_cq_bin_compat_padding; - void *ibv_srq_padding; - - /* legacy fields */ - uint32_t xrc_srq_num; - struct ibv_xrc_domain *xrc_domain; - struct ibv_cq *xrc_cq; -}; - -/* Not in use in new API, needed for compilation as part of source compat layer */ -enum ibv_event_flags { - IBV_XRC_QP_EVENT_FLAG = 0x80000000, -}; - - - -struct ibv_qp { - struct ibv_context *context; - void *qp_context; - struct ibv_pd *pd; - struct ibv_cq *send_cq; - struct ibv_cq *recv_cq; - struct ibv_srq *srq; - uint32_t handle; - uint32_t qp_num; - enum ibv_qp_state state; - enum ibv_qp_type qp_type; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t events_completed; -}; - -struct ibv_comp_channel { - struct ibv_context *context; - int fd; - int refcnt; -}; - -struct ibv_cq { - struct ibv_context *context; - struct ibv_comp_channel *channel; - void *cq_context; - uint32_t handle; - int cqe; - - pthread_mutex_t mutex; - pthread_cond_t cond; - uint32_t comp_events_completed; - uint32_t async_events_completed; -}; - -struct ibv_ah { - struct ibv_context *context; - struct ibv_pd *pd; - uint32_t handle; -}; - -enum ibv_flow_flags { - IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK = 1, - IBV_FLOW_ATTR_FLAGS_DONT_TRAP = 1 << 1, -}; - -enum ibv_flow_attr_type { - /* steering according to rule specifications */ - IBV_FLOW_ATTR_NORMAL = 0x0, - /* default unicast and multicast rule - - * receive all Eth traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_ALL_DEFAULT = 0x1, - /* default multicast rule - - * receive all Eth multicast traffic which isn't steered to any QP - */ - IBV_FLOW_ATTR_MC_DEFAULT = 0x2, -}; - -enum ibv_flow_spec_type { - IBV_FLOW_SPEC_ETH = 0x20, - IBV_FLOW_SPEC_IPV4 = 0x30, - IBV_FLOW_SPEC_TCP = 0x40, - IBV_FLOW_SPEC_UDP = 0x41, -}; - -struct ibv_flow_eth_filter { - uint8_t dst_mac[6]; - uint8_t src_mac[6]; - uint16_t ether_type; - /* - * same layout as 802.1q: prio 3, cfi 1, vlan id 12 - */ - uint16_t vlan_tag; -}; - -struct ibv_flow_spec_eth { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_eth_filter val; - struct ibv_flow_eth_filter mask; -}; - -struct ibv_flow_ipv4_filter { - uint32_t src_ip; - uint32_t dst_ip; -}; - -struct ibv_flow_spec_ipv4 { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_ipv4_filter val; - struct ibv_flow_ipv4_filter mask; -}; - -struct ibv_flow_tcp_udp_filter { - uint16_t dst_port; - uint16_t src_port; -}; - -struct ibv_flow_spec_tcp_udp { - enum ibv_flow_spec_type type; - uint16_t size; - struct ibv_flow_tcp_udp_filter val; - struct ibv_flow_tcp_udp_filter mask; -}; - -struct ibv_flow_spec { - union { - struct { - enum ibv_flow_spec_type type; - uint16_t size; - } hdr; - struct ibv_flow_spec_eth eth; - struct ibv_flow_spec_ipv4 ipv4; - struct ibv_flow_spec_tcp_udp tcp_udp; - }; -}; - -struct ibv_flow_attr { - uint32_t comp_mask; - enum ibv_flow_attr_type type; - uint16_t size; - uint16_t priority; - uint8_t num_of_specs; - uint8_t port; - uint32_t flags; - /* Following are the optional layers according to user request - * struct ibv_flow_spec_xxx [L2] - * struct ibv_flow_spec_yyy [L3/L4] - */ -}; - -struct ibv_flow { - uint32_t comp_mask; - struct ibv_context *context; - uint32_t handle; -}; - -struct ibv_device; -struct ibv_context; - -struct ibv_device_ops { - struct ibv_context * (*alloc_context)(struct ibv_device *device, int cmd_fd); - void (*free_context)(struct ibv_context *context); -}; - -enum { - 
IBV_SYSFS_NAME_MAX = 64, - IBV_SYSFS_PATH_MAX = 256 -}; - -struct ibv_device { - struct ibv_device_ops ops; - enum ibv_node_type node_type; - enum ibv_transport_type transport_type; - /* Name of underlying kernel IB device, eg "mthca0" */ - char name[IBV_SYSFS_NAME_MAX]; - /* Name of uverbs device, eg "uverbs0" */ - char dev_name[IBV_SYSFS_NAME_MAX]; - /* Path to infiniband_verbs class device in sysfs */ - char dev_path[IBV_SYSFS_PATH_MAX]; - /* Path to infiniband class device in sysfs */ - char ibdev_path[IBV_SYSFS_PATH_MAX]; -}; - -struct verbs_device { - struct ibv_device device; /* Must be first */ - size_t sz; - size_t size_of_context; - int (*init_context)(struct verbs_device *device, - struct ibv_context *ctx, int cmd_fd); - void (*uninit_context)(struct verbs_device *device, - struct ibv_context *ctx); - /* future fields added here */ -}; - -struct ibv_context_ops { - int (*query_device)(struct ibv_context *context, - struct ibv_device_attr *device_attr); - int (*query_port)(struct ibv_context *context, uint8_t port_num, - struct ibv_port_attr *port_attr); - struct ibv_pd * (*alloc_pd)(struct ibv_context *context); - int (*dealloc_pd)(struct ibv_pd *pd); - struct ibv_mr * (*reg_mr)(struct ibv_pd *pd, void *addr, size_t length, - int access); - struct ibv_mr * (*rereg_mr)(struct ibv_mr *mr, - int flags, - struct ibv_pd *pd, void *addr, - size_t length, - int access); - int (*dereg_mr)(struct ibv_mr *mr); - struct ibv_mw * (*alloc_mw)(struct ibv_pd *pd, enum ibv_mw_type type); - int (*bind_mw)(struct ibv_qp *qp, struct ibv_mw *mw, - struct ibv_mw_bind *mw_bind); - int (*dealloc_mw)(struct ibv_mw *mw); - struct ibv_cq * (*create_cq)(struct ibv_context *context, int cqe, - struct ibv_comp_channel *channel, - int comp_vector); - int (*poll_cq)(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc); - int (*req_notify_cq)(struct ibv_cq *cq, int solicited_only); - void (*cq_event)(struct ibv_cq *cq); - int (*resize_cq)(struct ibv_cq *cq, int cqe); - int (*destroy_cq)(struct ibv_cq *cq); - struct ibv_srq * (*create_srq)(struct ibv_pd *pd, - struct ibv_srq_init_attr *srq_init_attr); - int (*modify_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr, - int srq_attr_mask); - int (*query_srq)(struct ibv_srq *srq, - struct ibv_srq_attr *srq_attr); - int (*destroy_srq)(struct ibv_srq *srq); - int (*post_srq_recv)(struct ibv_srq *srq, - struct ibv_recv_wr *recv_wr, - struct ibv_recv_wr **bad_recv_wr); - struct ibv_qp * (*create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *attr); - int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask, - struct ibv_qp_init_attr *init_attr); - int (*modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, - int attr_mask); - int (*destroy_qp)(struct ibv_qp *qp); - int (*post_send)(struct ibv_qp *qp, struct ibv_send_wr *wr, - struct ibv_send_wr **bad_wr); - int (*post_recv)(struct ibv_qp *qp, struct ibv_recv_wr *wr, - struct ibv_recv_wr **bad_wr); - struct ibv_ah * (*create_ah)(struct ibv_pd *pd, struct ibv_ah_attr *attr); - int (*destroy_ah)(struct ibv_ah *ah); - int (*attach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - int (*detach_mcast)(struct ibv_qp *qp, const union ibv_gid *gid, - uint16_t lid); - void (*async_event)(struct ibv_async_event *event); -}; - -struct ibv_context { - struct ibv_device *device; - struct ibv_context_ops ops; - int cmd_fd; - int async_fd; - int num_comp_vectors; - pthread_mutex_t mutex; - void *abi_compat; -}; - -enum verbs_context_mask { - VERBS_CONTEXT_XRCD = (uint64_t)1 << 0, 
- VERBS_CONTEXT_SRQ = (uint64_t)1 << 1, - VERBS_CONTEXT_QP = (uint64_t)1 << 2, - VERBS_CONTEXT_RESERVED = (uint64_t)1 << 3, - VERBS_CONTEXT_EXP = (uint64_t)1 << 62 -}; - -struct verbs_context { - /* "grows up" - new fields go here */ - int (*_reserved_2) (void); - int (*destroy_flow) (struct ibv_flow *flow); - int (*_reserved_1) (void); - struct ibv_flow * (*create_flow) (struct ibv_qp *qp, - struct ibv_flow_attr *flow_attr); - struct ibv_qp * (*open_qp)(struct ibv_context *context, - struct ibv_qp_open_attr *attr); - struct ibv_qp * (*create_qp_ex)(struct ibv_context *context, - struct ibv_qp_init_attr_ex *qp_init_attr_ex); - int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); - struct ibv_srq * (*create_srq_ex)(struct ibv_context *context, - struct ibv_srq_init_attr_ex *srq_init_attr_ex); - struct ibv_xrcd * (*open_xrcd)(struct ibv_context *context, - struct ibv_xrcd_init_attr *xrcd_init_attr); - int (*close_xrcd)(struct ibv_xrcd *xrcd); - uint64_t has_comp_mask; - size_t sz; /* Must be immediately before struct ibv_context */ - struct ibv_context context;/* Must be last field in the struct */ -}; - -/*XXX:__VERBS_ABI_IS_EXTENDED produces warning "integer operation result is out of range" with g++ 4.8.2*/ -/*static inline struct verbs_context *verbs_get_ctx(struct ibv_context *ctx) -{ - return (!ctx || (ctx->abi_compat != __VERBS_ABI_IS_EXTENDED)) ? - NULL : container_of(ctx, struct verbs_context, context); -} - -#define verbs_get_ctx_op(ctx, op) ({ \ - struct verbs_context *_vctx = verbs_get_ctx(ctx); \ - (!_vctx || (_vctx->sz < sizeof(*_vctx) - offsetof(struct verbs_context, op)) || \ - !_vctx->op) ? NULL : _vctx; })*/ - -#define verbs_set_ctx_op(_vctx, op, ptr) ({ \ - struct verbs_context *vctx = _vctx; \ - if (vctx && (vctx->sz >= sizeof(*vctx) - offsetof(struct verbs_context, op))) \ - vctx->op = ptr; }) - -static inline struct verbs_device *verbs_get_device(struct ibv_device *dev) -{ - return (dev->ops.alloc_context) ? - NULL : container_of(dev, struct verbs_device, device); -} - -static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - return qp->context->ops.post_send(qp, wr, bad_wr); -} - -struct ibv_ece { - /* - * Unique identifier of the provider vendor on the network. - * The providers will set IEEE OUI here to distinguish - * itself in non-homogenius network. - */ - uint32_t vendor_id; - /* - * Provider specific attributes which are supported or - * needed to be enabled by ECE users. 
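The inline ibv_post_send() defined above simply forwards to the context's ops.post_send entry. A hedged sketch of building a one-element RDMA write for it, using the ibv_sge/ibv_send_wr layouts from this header; the local buffer, keys and remote address are placeholders that would normally come from memory registration and an out-of-band exchange:

#include <infiniband/verbs.h>
#include <stdint.h>
#include <string.h>

// Illustrative only: posts one signaled RDMA_WRITE work request.
static int postRdmaWrite(struct ibv_qp* qp, void* localBuf, uint32_t len,
                         uint32_t lkey, uint64_t remoteAddr, uint32_t rkey) {
  struct ibv_sge sge;
  struct ibv_send_wr wr, *badWr = NULL;

  memset(&sge, 0, sizeof(sge));
  sge.addr = (uint64_t)(uintptr_t)localBuf;
  sge.length = len;
  sge.lkey = lkey;

  memset(&wr, 0, sizeof(wr));
  wr.wr_id = 1;                      // echoed back in the completion's wr_id
  wr.sg_list = &sge;
  wr.num_sge = 1;
  wr.opcode = IBV_WR_RDMA_WRITE;
  wr.send_flags = IBV_SEND_SIGNALED; // request a completion on the send CQ
  wr.wr.rdma.remote_addr = remoteAddr;
  wr.wr.rdma.rkey = rkey;

  // Dispatches through qp->context->ops.post_send, as in the inline above.
  return ibv_post_send(qp, &wr, &badWr);
}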
- */ - uint32_t options; - uint32_t comp_mask; -}; - -#endif // NCCL_IBV_CORE_H_ diff --git a/nvls/ibvsymbols.h b/nvls/ibvsymbols.h deleted file mode 100644 index 906b0df74..000000000 --- a/nvls/ibvsymbols.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef NCCL_IBV_SYMBOLS_H_ -#define NCCL_IBV_SYMBOLS_H_ - -#ifdef NCCL_BUILD_RDMA_CORE -#include -#else -#include "ibvcore.h" -#endif - -#include "nccl.h" - -/* IB Verbs Function Pointers*/ -struct ncclIbvSymbols { - int (*ibv_internal_fork_init)(void); - struct ibv_device** (*ibv_internal_get_device_list)(int *num_devices); - void (*ibv_internal_free_device_list)(struct ibv_device **list); - const char * (*ibv_internal_get_device_name)(struct ibv_device *device); - struct ibv_context* (*ibv_internal_open_device)(struct ibv_device* device); - int (*ibv_internal_close_device)(struct ibv_context *context); - int (*ibv_internal_get_async_event)(struct ibv_context *context, struct ibv_async_event *event); - void (*ibv_internal_ack_async_event)(struct ibv_async_event *event); - int (*ibv_internal_query_device)(struct ibv_context *context, struct ibv_device_attr *device_attr); - int (*ibv_internal_query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); - int (*ibv_internal_query_gid)(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); - int (*ibv_internal_query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); - struct ibv_pd * (*ibv_internal_alloc_pd)(struct ibv_context *context); - int (*ibv_internal_dealloc_pd)(struct ibv_pd *pd); - struct ibv_mr * (*ibv_internal_reg_mr)(struct ibv_pd *pd, void *addr, size_t length, int access); - struct ibv_mr * (*ibv_internal_reg_mr_iova2)(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, unsigned int access); - /* DMA-BUF support */ - struct ibv_mr * (*ibv_internal_reg_dmabuf_mr)(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); - int (*ibv_internal_dereg_mr)(struct ibv_mr *mr); - struct ibv_cq * (*ibv_internal_create_cq)(struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); - int (*ibv_internal_destroy_cq)(struct ibv_cq *cq); - struct ibv_qp * (*ibv_internal_create_qp)(struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); - int (*ibv_internal_modify_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); - int (*ibv_internal_destroy_qp)(struct ibv_qp *qp); - const char * (*ibv_internal_event_type_str)(enum ibv_event_type event); - int (*ibv_internal_query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); - int (*ibv_internal_set_ece)(struct ibv_qp *qp, struct ibv_ece *ece); -}; - -/* Constructs IB verbs symbols per rdma-core linking or dynamic loading mode */ -ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols); - -#endif // NCCL_IBV_SYMBOLS_H_ diff --git a/nvls/ibvwrap.h b/nvls/ibvwrap.h deleted file mode 100644 index c3709584c..000000000 --- a/nvls/ibvwrap.h +++ /dev/null @@ -1,92 +0,0 @@ -/************************************************************************* - * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2004, 2011-2012 Intel Corporation. All rights reserved. - * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2005 PathScale, Inc. All rights reserved. - * - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_IBVWRAP_H_ -#define NCCL_IBVWRAP_H_ - -#ifdef NCCL_BUILD_RDMA_CORE -#include -#else -#include "ibvcore.h" -#endif - -#include "core.h" -#include -#include - -typedef enum ibv_return_enum -{ - IBV_SUCCESS = 0, //!< The operation was successful -} ibv_return_t; - -ncclResult_t wrap_ibv_symbols(void); -/* NCCL wrappers of IB verbs functions */ -ncclResult_t wrap_ibv_fork_init(void); -ncclResult_t wrap_ibv_get_device_list(struct ibv_device ***ret, int *num_devices); -ncclResult_t wrap_ibv_free_device_list(struct ibv_device **list); -const char *wrap_ibv_get_device_name(struct ibv_device *device); -ncclResult_t wrap_ibv_open_device(struct ibv_context **ret, struct ibv_device *device); -ncclResult_t wrap_ibv_close_device(struct ibv_context *context); -ncclResult_t wrap_ibv_get_async_event(struct ibv_context *context, struct ibv_async_event *event); -ncclResult_t wrap_ibv_ack_async_event(struct ibv_async_event *event); -ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr); -ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); -ncclResult_t wrap_ibv_query_gid(struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid); -ncclResult_t wrap_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr); -ncclResult_t wrap_ibv_alloc_pd(struct ibv_pd **ret, struct ibv_context *context); -ncclResult_t wrap_ibv_dealloc_pd(struct ibv_pd *pd); -ncclResult_t wrap_ibv_reg_mr(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, int access); -struct ibv_mr * wrap_direct_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access); -ncclResult_t wrap_ibv_reg_mr_iova2(struct ibv_mr **ret, struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access); -/* DMA-BUF support */ -ncclResult_t wrap_ibv_reg_dmabuf_mr(struct ibv_mr **ret, struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -struct ibv_mr * wrap_direct_ibv_reg_dmabuf_mr(struct ibv_pd *pd, uint64_t offset, size_t length, uint64_t iova, int fd, int access); -ncclResult_t wrap_ibv_dereg_mr(struct ibv_mr *mr); -ncclResult_t wrap_ibv_create_comp_channel(struct ibv_comp_channel **ret, struct ibv_context *context); -ncclResult_t wrap_ibv_destroy_comp_channel(struct ibv_comp_channel *channel); -ncclResult_t wrap_ibv_create_cq(struct ibv_cq **ret, struct ibv_context *context, int cqe, void *cq_context, struct ibv_comp_channel *channel, int comp_vector); -ncclResult_t wrap_ibv_destroy_cq(struct ibv_cq *cq); -static inline ncclResult_t wrap_ibv_poll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *wc, int* num_done) { - int done = cq->context->ops.poll_cq(cq, num_entries, wc); /*returns the number of wcs or 0 on success, a negative number otherwise*/ - if (done < 0) { - WARN("Call to ibv_poll_cq() returned %d", done); - return ncclSystemError; - } - *num_done = done; - return ncclSuccess; -} -ncclResult_t wrap_ibv_create_qp(struct ibv_qp **ret, struct ibv_pd *pd, struct ibv_qp_init_attr *qp_init_attr); -ncclResult_t wrap_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask); -ncclResult_t wrap_ibv_destroy_qp(struct ibv_qp *qp); -ncclResult_t wrap_ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece, int* supported); -ncclResult_t wrap_ibv_set_ece(struct ibv_qp *qp, 
struct ibv_ece *ece, int* supported); - -static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { - int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - if (ret != IBV_SUCCESS) { - WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); - return ncclSystemError; - } - return ncclSuccess; -} - -static inline ncclResult_t wrap_ibv_post_recv(struct ibv_qp *qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { - int ret = qp->context->ops.post_recv(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ - if (ret != IBV_SUCCESS) { - WARN("ibv_post_recv() failed with error %s", strerror(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrap_ibv_event_type_str(char **ret, enum ibv_event_type event); - -#endif //End include guard diff --git a/nvls/info.h b/nvls/info.h deleted file mode 100644 index f65ed2e69..000000000 --- a/nvls/info.h +++ /dev/null @@ -1,134 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INFO_H_ -#define NCCL_INFO_H_ - -#include "nccl.h" -#include "device.h" -#include "collectives.h" -#include "core.h" -#include "utils.h" -#include "strongstream.h" - -typedef enum : uint8_t { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown, - ncclPatternCollnetChain, - ncclPatternCollnetDirect, - ncclPatternNvls, - ncclPatternNvlsTree, - ncclPatternSend, - ncclPatternRecv -} ncclPattern_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclFunc_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; // peer for p2p operations - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclDevRedOpFull opFull; - int algorithm; - int protocol; - ncclPattern_t pattern; - int nChannels; - int nThreads; - size_t nBytes; - size_t sendbuffSize; - size_t recvbuffSize; - int nstepsPerLoop; - int nchunksPerLoop; - int chunkSize; - int channelId; -}; - -inline ncclResult_t ncclInfoSetDerived(struct ncclInfo* info, int nRanks) { - info->nBytes = info->count * ncclTypeSize(info->datatype); - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) { - info->count = info->nBytes; - info->datatype = ncclInt8; - } - if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= nRanks; // count is per rank - - /* compute buffer size for NVLS buffer registration */ - if (info->coll == ncclFuncAllGather) { - info->sendbuffSize = info->count * ncclTypeSize(info->datatype); - info->recvbuffSize = info->sendbuffSize * nRanks; - } else if (info->coll == ncclFuncReduceScatter) { - info->recvbuffSize = info->count * ncclTypeSize(info->datatype); - info->sendbuffSize = info->recvbuffSize * nRanks; - } else { - info->sendbuffSize = info->recvbuffSize = info->count * ncclTypeSize(info->datatype); - } - return ncclSuccess; -} - 
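As a worked example of ncclInfoSetDerived() above: an AllGather of 1<<20 float elements per rank (4 MiB) over nRanks = 8 is first rewritten as 1<<22 ncclInt8 elements, so nBytes becomes 4 MiB * 8 = 32 MiB, with sendbuffSize = 4 MiB and recvbuffSize = 32 MiB; a ReduceScatter with the same parameters mirrors this, giving sendbuffSize = 32 MiB and recvbuffSize = 4 MiB.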
-struct ncclTaskColl { - struct ncclTaskColl* next; - ncclFunc_t func; - void const* sendbuff; - void* recvbuff; - size_t count; - int root; - ncclDataType_t datatype; - ncclDevRedOpFull op; - int chunkSteps, sliceSteps; -}; -struct ncclTaskP2p { - ncclTaskP2p *next; - void *buff; - size_t bytes; - // Stateful chunk index. If a p2p gets "cut" over two plans this keeps track - // of where it left off. - int chunk; -}; - -struct ncclCudaStreamList { - struct ncclCudaStreamList *next; - cudaStream_t stream; -}; -struct ncclTasks { - struct Peer { - bool sendSeen, recvSeen; - struct ncclIntruQueue sendQueue; - struct ncclIntruQueue recvQueue; - }; - struct ncclIntruQueue collQueue; - size_t collBytesTotal; - struct Peer* peers/*[nRanks]*/; - int *p2pSendOrder, *p2pRecvOrder; - int p2pOrderSteps; - int nTasksColl, nTasksP2p; - - // The list of user streams aggregated over all tasks present. - struct ncclCudaStreamList* streams; - // The most recent user stream. Ignored if streams==nullptr - cudaStream_t streamRecent; - // The graph capturing all user streams or invalid if none. Thus we restrict the - // user that all streams must be captured in the same graph or not captured - // at all. Technically we could probably relax this, but that would mean - // collecting a different `ncclTasks` per graph and one for non-graph. - struct ncclCudaGraph capturingGraph; -}; - -#endif diff --git a/nvls/ipcsocket.cc b/nvls/ipcsocket.cc deleted file mode 100644 index 9d66ac719..000000000 --- a/nvls/ipcsocket.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. - * - * See COPYRIGHT for license information - */ - -#include "ipcsocket.h" -#include "utils.h" -#include -#include -#include - -// Enable Linux abstract socket naming -#define USE_ABSTRACT_SOCKET - -#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx" - -/* - * Create a Unix Domain Socket - */ -ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag) { - int fd = -1; - struct sockaddr_un cliaddr; - char temp[NCCL_IPC_SOCKNAME_LEN] = ""; - - if (handle == NULL) { - return ncclInternalError; - } - - handle->fd = -1; - handle->socketName[0] = '\0'; - if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) { - WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno); - return ncclSystemError; - } - - bzero(&cliaddr, sizeof(cliaddr)); - cliaddr.sun_family = AF_UNIX; - - // Create unique name for the socket. - int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); - if (len > (sizeof(cliaddr.sun_path) - 1)) { - WARN("UDS: Cannot bind provided name to socket. 
Name too large"); - return ncclInternalError; - } -#ifndef USE_ABSTRACT_SOCKET - unlink(temp); -#endif - - TRACE(NCCL_INIT, "UDS: Creating socket %s", temp); - - strncpy(cliaddr.sun_path, temp, len); -#ifdef USE_ABSTRACT_SOCKET - cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick -#endif - if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) { - WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno), errno); - close(fd); - return ncclSystemError; - } - - handle->fd = fd; - strcpy(handle->socketName, temp); - - handle->abortFlag = abortFlag; - // Mark socket as non-blocking - if (handle->abortFlag) { - int flags; - EQCHECK(flags = fcntl(fd, F_GETFL), -1); - SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl"); - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd) { - if (handle == NULL) { - WARN("ncclSocketGetFd: pass NULL socket"); - return ncclInvalidArgument; - } - if (fd) *fd = handle->fd; - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) { - if (handle == NULL) { - return ncclInternalError; - } - if (handle->fd <= 0) { - return ncclSuccess; - } -#ifndef USE_ABSTRACT_SOCKET - if (handle->socketName[0] != '\0') { - unlink(handle->socketName); - } -#endif - close(handle->fd); - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, int *recvFd) { - struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; - struct iovec iov[1]; - - // Union to guarantee alignment requirements for control array - union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; - } control_un; - - struct cmsghdr *cmptr; - char dummy_buffer[1]; - int ret; - - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - if (hdr == NULL) { - iov[0].iov_base = (void *)dummy_buffer; - iov[0].iov_len = sizeof(dummy_buffer); - } else { - iov[0].iov_base = hdr; - iov[0].iov_len = hdrLen; - } - - msg.msg_iov = iov; - msg.msg_iovlen = 1; - - while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - WARN("UDS: Receiving data over socket failed : %d", errno); - return ncclSystemError; - } - if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; - } - - if (recvFd != NULL) { - if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) && (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) { - if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) { - WARN("UDS: Receiving data over socket failed"); - return ncclSystemError; - } - - memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd)); - } else { - WARN("UDS: Receiving data over socket %s failed", handle->socketName); - return ncclSystemError; - } - TRACE(NCCL_INIT|NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd, handle->socketName); - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) { - return ncclIpcSocketRecvMsg(handle, NULL, 0, recvFd); -} - -ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen, const int sendFd, int rank, uint64_t hash) { - struct msghdr msg = {0, 0, 0, 0, 0, 0, 0}; - struct iovec iov[1]; - char temp[NCCL_IPC_SOCKNAME_LEN]; - - union { - struct cmsghdr cm; - char control[CMSG_SPACE(sizeof(int))]; - } control_un; - - struct cmsghdr *cmptr; - char dummy_buffer[1]; - struct sockaddr_un cliaddr; - - // Construct client address to send this shareable handle to - bzero(&cliaddr, sizeof(cliaddr)); - 
cliaddr.sun_family = AF_UNIX; - - int len = snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash); - if (len > (sizeof(cliaddr.sun_path) - 1)) { - WARN("UDS: Cannot connect to provided name for socket. Name too large"); - return ncclInternalError; - } - (void) strncpy(cliaddr.sun_path, temp, len); - -#ifdef USE_ABSTRACT_SOCKET - cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick -#endif - - TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen, temp); - - if (sendFd != -1) { - TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp); - - msg.msg_control = control_un.control; - msg.msg_controllen = sizeof(control_un.control); - - cmptr = CMSG_FIRSTHDR(&msg); - cmptr->cmsg_len = CMSG_LEN(sizeof(int)); - cmptr->cmsg_level = SOL_SOCKET; - cmptr->cmsg_type = SCM_RIGHTS; - memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd)); - } - - msg.msg_name = (void *)&cliaddr; - msg.msg_namelen = sizeof(struct sockaddr_un); - - if (hdr == NULL) { - iov[0].iov_base = (void *)dummy_buffer; - iov[0].iov_len = sizeof(dummy_buffer); - } else { - iov[0].iov_base = hdr; - iov[0].iov_len = hdrLen; - } - msg.msg_iov = iov; - msg.msg_iovlen = 1; - msg.msg_flags = 0; - - ssize_t sendResult; - while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - WARN("UDS: Sending data over socket %s failed : %s (%d)", temp, strerror(errno), errno); - return ncclSystemError; - } - if (handle->abortFlag && *handle->abortFlag) return ncclInternalError; - } - - return ncclSuccess; -} - -ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd, int rank, uint64_t hash) { - return ncclIpcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); -} diff --git a/nvls/ipcsocket.h b/nvls/ipcsocket.h deleted file mode 100644 index ccecde84c..000000000 --- a/nvls/ipcsocket.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2016-2023, NVIDIA CORPORATION. All rights reserved. - * - * See COPYRIGHT for license information - */ - -#ifndef NCCL_IPCSOCKET_H -#define NCCL_IPCSOCKET_H - -#include "nccl.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define NCCL_IPC_SOCKNAME_LEN 64 - -struct ncclIpcSocket { - int fd; - char socketName[NCCL_IPC_SOCKNAME_LEN]; - volatile uint32_t* abortFlag; -}; - -ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank, uint64_t hash, volatile uint32_t* abortFlag); -ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle); -ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket* handle, int* fd); - -ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd); -ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd, int rank, uint64_t hash); - -#endif /* NCCL_IPCSOCKET_H */ diff --git a/nvls/nccl_common.h b/nvls/nccl_common.h deleted file mode 100644 index a37ac203e..000000000 --- a/nvls/nccl_common.h +++ /dev/null @@ -1,33 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
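The ipcsocket helpers above wrap SCM_RIGHTS descriptor passing over an (optionally abstract) Unix domain socket. A hedged usage sketch, assuming the NCCLCHECK error-checking macro from checks.h and that both processes agree on the same hash so the generated socket names match:

// Hypothetical two-process helper: rank 0 donates 'fdToShare', the peer receives a duplicate.
ncclResult_t shareFd(int myRank, int peerRank, uint64_t hash, int fdToShare, int* fdReceived) {
  struct ncclIpcSocket sock;
  NCCLCHECK(ncclIpcSocketInit(&sock, myRank, hash, /*abortFlag=*/NULL)); // NULL keeps blocking behaviour
  if (myRank == 0) {
    NCCLCHECK(ncclIpcSocketSendFd(&sock, fdToShare, peerRank, hash));    // fd travels via SCM_RIGHTS
  } else {
    NCCLCHECK(ncclIpcSocketRecvFd(&sock, fdReceived));                   // kernel installs a new fd here
  }
  return ncclIpcSocketClose(&sock);
}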
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_DEBUG_H_ -#define NCCL_DEBUG_H_ - -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - -#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now -typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t; - -#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet* -#define NCCL_ALGO_UNDEF -1 -#define NCCL_ALGO_TREE 0 -#define NCCL_ALGO_RING 1 -#define NCCL_ALGO_COLLNET_DIRECT 2 -#define NCCL_ALGO_COLLNET_CHAIN 3 -#define NCCL_ALGO_NVLS 4 -#define NCCL_ALGO_NVLS_TREE 5 - -#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128 -#define NCCL_PROTO_UNDEF -1 -#define NCCL_PROTO_LL 0 -#define NCCL_PROTO_LL128 1 -#define NCCL_PROTO_SIMPLE 2 - -#endif diff --git a/nvls/nccl_net.h b/nvls/nccl_net.h deleted file mode 100644 index 9b3e6719f..000000000 --- a/nvls/nccl_net.h +++ /dev/null @@ -1,333 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include "nccl_common.h" -#include "net_device.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 32 - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload -} ncclNetProperties_v7_t; - -typedef ncclNetProperties_v7_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. 
- // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection - ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); - - // Copy the given mhandle to a dptr in a format usable by this plugin's device code - ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle); - - // Notify the plugin that a recv has completed by the device - ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request); -} ncclNet_v7_t; - -typedef ncclNet_v7_t ncclNet_t; - -#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v7 - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v7 - -#define NCCL_NET_MAX_REQUESTS_V6 8 - -// v6 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. 
-} ncclNetProperties_v6_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. 
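Note the connect()/accept() contract spelled out above: a successful return with a NULL comm only means "not ready yet", so the caller is expected to retry. A hedged sketch of that polling pattern against the v7 interface, again assuming the NCCLCHECK macro and that 'handle' was produced by the peer's listen() and exchanged out of band:

// Illustrative caller-side loop; 'net' is a loaded ncclNet_v7_t plugin.
static ncclResult_t connectBlocking(ncclNet_v7_t* net, int dev, void* handle, void** sendComm) {
  ncclNetDeviceHandle_v7_t* devHandle = NULL;  // no device offload requested
  *sendComm = NULL;
  while (*sendComm == NULL) {
    // A success with *sendComm == NULL means the connection is not established yet.
    NCCLCHECK(net->connect(dev, handle, sendComm, &devHandle));
  }
  return ncclSuccess;
}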
- ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v7_t; - -typedef ncclCollNet_v7_t ncclCollNet_t; - -// v6 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. 
- // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -#endif // end include guard diff --git a/nvls/nccl_tuner.h b/nvls/nccl_tuner.h deleted file mode 100644 index b4a696e38..000000000 --- a/nvls/nccl_tuner.h +++ /dev/null @@ -1,55 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TUNER_H_ -#define NCCL_TUNER_H_ - -#include "nccl.h" -#include "nccl_common.h" - -// API to be implemented by external tuner -typedef struct { - // Name of the tuner - const char* name; - - // Initializes tuner states. - // nRanks: number of ranks in current communicator. Each communicator initialize its own tuner. - // nNodes: number of nodes in current communicator. 
- // logFunction: a logFunction can be useful to integrate logging together with NCCL core. - ncclResult_t (*init)(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction); - - // Gets info (algo, protocol, number of ctas and threads) for a given collective. - // Inputs: - // - collType: collective type, e.g., allreduce, allgather… - // - nBytes: collective size in bytes - // - collNetTypeSupport: whether collnet supports this type - // - nvlsTypeSupport: whether nvlink sharp supports this type - // - numPipeOps: number of operations in the group - // - // Outputs: - // - algorithm: selected algorithm to be used for the given collective - // - protocol: selected protocol to be used for the given collective - // - nChannels: number of channels (hence SMs) to be used. - // - // If getCollInfo() does not return ncclSuccess, NCCL will fall back to the - // default tuning for the given collective. - // Also, the plugin is allowed to not set any output, or set only the - // algorithm and protocol, but not only the algorithm or only the protocol. - // Unset fields will be set automatically by NCCL. - ncclResult_t (*getCollInfo)(ncclFunc_t collType, size_t nBytes, - int collNetSupport, int nvlsSupport, int numPipeOps, - int *algorithm, int *protocol, int* nChannels); - - // Terminates the plugin and cleans up any resources that the plugin allocated. - ncclResult_t (*destroy)(); -} ncclTuner_v1_t; - -typedef ncclTuner_v1_t ncclTuner_t; - -#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v1" - -#endif diff --git a/nvls/net.h b/nvls/net.h deleted file mode 100644 index b5df58968..000000000 --- a/nvls/net.h +++ /dev/null @@ -1,27 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_NET_H_ -#define NCCL_INT_NET_H_ - -#include "nccl.h" -#include "nccl_net.h" -#include "comm.h" -#include "checks.h" - -typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; - -ncclResult_t ncclNetPluginInit(); -ncclResult_t ncclNetInit(struct ncclComm* comm); -int ncclNetVersion(struct ncclComm* comm); - -// Test whether the current GPU supports GPU Direct RDMA. -ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport); - -extern ncclNet_t ncclNetIb; -extern ncclNet_t ncclNetSocket; - -#endif diff --git a/nvls/net_device.h b/nvls/net_device.h deleted file mode 100644 index 8f7c0d6e1..000000000 --- a/nvls/net_device.h +++ /dev/null @@ -1,29 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_DEVICE_H_ -#define NCCL_NET_DEVICE_H_ - -#define NCCL_NET_DEVICE_INVALID_VERSION 0x0 -#define NCCL_NET_MTU_SIZE 4096 - -// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin -// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
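For illustration only, a minimal external tuner implementing the ncclTuner_v1_t interface above could look like the sketch below. It caps nChannels for small allreduces and deliberately leaves algorithm/protocol unset so NCCL falls back to its defaults, as the comments permit; the exported symbol must match NCCL_TUNER_PLUGIN_SYMBOL. The ncclFuncAllReduce enumerator is assumed to come from nccl_common.h, and the tuner name and size threshold are made up.

static ncclResult_t exampleTunerInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
  (void)nRanks; (void)nNodes; (void)logFunction;
  return ncclSuccess;
}
static ncclResult_t exampleTunerGetCollInfo(ncclFunc_t collType, size_t nBytes,
    int collNetSupport, int nvlsSupport, int numPipeOps,
    int* algorithm, int* protocol, int* nChannels) {
  (void)collNetSupport; (void)nvlsSupport; (void)numPipeOps; (void)algorithm; (void)protocol;
  // Use fewer channels (SMs) for small allreduces; everything else keeps NCCL's defaults.
  if (collType == ncclFuncAllReduce && nBytes < (1<<20)) *nChannels = 4;
  return ncclSuccess;
}
static ncclResult_t exampleTunerDestroy() { return ncclSuccess; }

extern "C" ncclTuner_v1_t ncclTunerPlugin_v1 = {
  "example-tuner", exampleTunerInit, exampleTunerGetCollInfo, exampleTunerDestroy
};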
-#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7 - -typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType; - -typedef struct { - ncclNetDeviceType netDeviceType; // Network offload type - int netDeviceVersion; // Version number for network offload - void* handle; - size_t size; - int needsProxyProgress; -} ncclNetDeviceHandle_v7_t; - -typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_t; - -#endif diff --git a/nvls/nvmlwrap.h b/nvls/nvmlwrap.h deleted file mode 100644 index 2ab8e3a2b..000000000 --- a/nvls/nvmlwrap.h +++ /dev/null @@ -1,214 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVMLWRAP_H_ -#define NCCL_NVMLWRAP_H_ - -#include "nccl.h" - -//#define NCCL_NVML_DIRECT 1 -#ifndef NCCL_NVML_DIRECT -#define NCCL_NVML_DIRECT 0 -#endif - -#if NCCL_NVML_DIRECT -#include "nvml.h" -#else -// Dynamically handle dependencies on NVML - -/* Extracted from nvml.h */ -typedef struct nvmlDevice_st* nvmlDevice_t; -#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16 - -typedef enum nvmlEnableState_enum -{ - NVML_FEATURE_DISABLED = 0, //!< Feature disabled - NVML_FEATURE_ENABLED = 1 //!< Feature enabled -} nvmlEnableState_t; - -typedef enum nvmlNvLinkCapability_enum -{ - NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported - NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported - NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported - NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported - NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link - NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device - // should be last - NVML_NVLINK_CAP_COUNT -} nvmlNvLinkCapability_t; - -typedef enum nvmlReturn_enum -{ - NVML_SUCCESS = 0, //!< The operation was successful - NVML_ERROR_UNINITIALIZED = 1, //!< NVML was not first initialized with nvmlInit() - NVML_ERROR_INVALID_ARGUMENT = 2, //!< A supplied argument is invalid - NVML_ERROR_NOT_SUPPORTED = 3, //!< The requested operation is not available on target device - NVML_ERROR_NO_PERMISSION = 4, //!< The current user does not have permission for operation - NVML_ERROR_ALREADY_INITIALIZED = 5, //!< Deprecated: Multiple initializations are now allowed through ref counting - NVML_ERROR_NOT_FOUND = 6, //!< A query to find an object was unsuccessful - NVML_ERROR_INSUFFICIENT_SIZE = 7, //!< An input argument is not large enough - NVML_ERROR_INSUFFICIENT_POWER = 8, //!< A device's external power cables are not properly attached - NVML_ERROR_DRIVER_NOT_LOADED = 9, //!< NVIDIA driver is not loaded - NVML_ERROR_TIMEOUT = 10, //!< User provided timeout passed - NVML_ERROR_IRQ_ISSUE = 11, //!< NVIDIA Kernel detected an interrupt issue with a GPU - NVML_ERROR_LIBRARY_NOT_FOUND = 12, //!< NVML Shared Library couldn't be found or loaded - NVML_ERROR_FUNCTION_NOT_FOUND = 13, //!< Local version of NVML doesn't implement this function - NVML_ERROR_CORRUPTED_INFOROM = 14, //!< infoROM is corrupted - NVML_ERROR_GPU_IS_LOST = 15, //!< The GPU has fallen off the bus or has otherwise become inaccessible - NVML_ERROR_RESET_REQUIRED = 16, //!< The GPU requires a reset before it can be used again - NVML_ERROR_OPERATING_SYSTEM = 17, //!< The GPU control device has been blocked by the operating system/cgroups - NVML_ERROR_LIB_RM_VERSION_MISMATCH = 
18, //!< RM detects a driver/library version mismatch - NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use - NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred -} nvmlReturn_t; - -typedef struct nvmlPciInfo_st -{ - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) - unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffff - unsigned int bus; //!< The bus on which the device resides, 0 to 0xff - unsigned int device; //!< The device's id on the bus, 0 to 31 - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - - // Added in NVML 2.285 API - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - - // NVIDIA reserved for internal use only - unsigned int reserved0; - unsigned int reserved1; - unsigned int reserved2; - unsigned int reserved3; -} nvmlPciInfo_t; - -/* P2P Capability Index Status*/ -typedef enum nvmlGpuP2PStatus_enum -{ - NVML_P2P_STATUS_OK = 0, - NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, - NVML_P2P_STATUS_GPU_NOT_SUPPORTED, - NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, - NVML_P2P_STATUS_DISABLED_BY_REGKEY, - NVML_P2P_STATUS_NOT_SUPPORTED, - NVML_P2P_STATUS_UNKNOWN -} nvmlGpuP2PStatus_t; - -/* P2P Capability Index*/ -typedef enum nvmlGpuP2PCapsIndex_enum -{ - NVML_P2P_CAPS_INDEX_READ = 0, - NVML_P2P_CAPS_INDEX_WRITE, - NVML_P2P_CAPS_INDEX_NVLINK, - NVML_P2P_CAPS_INDEX_ATOMICS, - NVML_P2P_CAPS_INDEX_PROP, - NVML_P2P_CAPS_INDEX_UNKNOWN -} nvmlGpuP2PCapsIndex_t; - -/** - * Represents the type for sample value returned - */ -typedef enum nvmlValueType_enum -{ - NVML_VALUE_TYPE_DOUBLE = 0, - NVML_VALUE_TYPE_UNSIGNED_INT = 1, - NVML_VALUE_TYPE_UNSIGNED_LONG = 2, - NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, - NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, - - // Keep this last - NVML_VALUE_TYPE_COUNT -}nvmlValueType_t; - - -/** - * Union to represent different types of Value - */ -typedef union nvmlValue_st -{ - double dVal; //!< If the value is double - unsigned int uiVal; //!< If the value is unsigned int - unsigned long ulVal; //!< If the value is unsigned long - unsigned long long ullVal; //!< If the value is unsigned long long - signed long long sllVal; //!< If the value is signed long long -}nvmlValue_t; - -/** - * Field Identifiers. - * - * All Identifiers pertain to a device. Each ID is only used once and is guaranteed never to change. - */ - -/* NVLink Speed */ -#define NVML_FI_DEV_NVLINK_SPEED_MBPS_COMMON 90 //!< Common NVLink Speed in MBps for active links -#define NVML_FI_DEV_NVLINK_LINK_COUNT 91 //!< Number of NVLinks present on the device - -/** - * Remote device NVLink ID - * - * Link ID needs to be specified in the scopeId field in nvmlFieldValue_t. 
- */ -#define NVML_FI_DEV_NVLINK_REMOTE_NVLINK_ID 146 //!< Remote device NVLink ID - -/** - * NVSwitch: connected NVLink count - */ -#define NVML_FI_DEV_NVSWITCH_CONNECTED_LINK_COUNT 147 //!< Number of NVLinks connected to NVSwitch - -#define NVML_FI_DEV_NVLINK_GET_SPEED 164 -#define NVML_FI_DEV_NVLINK_GET_STATE 165 -#define NVML_FI_DEV_NVLINK_GET_VERSION 166 - -#define NVML_FI_DEV_C2C_LINK_COUNT 170 //!< Number of C2C Links present on the device -#define NVML_FI_DEV_C2C_LINK_GET_STATUS 171 //!< C2C Link Status 0=INACTIVE 1=ACTIVE -#define NVML_FI_DEV_C2C_LINK_GET_MAX_BW 172 //!< C2C Link Speed in MBps for active links - -#define NVML_FI_MAX 173 //!< One greater than the largest field ID defined above - -/** - * Information for a Field Value Sample - */ -typedef struct nvmlFieldValue_st -{ - unsigned int fieldId; //!< ID of the NVML field to retrieve. This must be set before any call that uses this struct. See the constants starting with NVML_FI_ above. - unsigned int scopeId; //!< Scope ID can represent data used by NVML depending on fieldId's context. For example, for NVLink throughput counter data, scopeId can represent linkId. - long long timestamp; //!< CPU Timestamp of this value in microseconds since 1970 - long long latencyUsec; //!< How long this field value took to update (in usec) within NVML. This may be averaged across several fields that are serviced by the same driver call. - nvmlValueType_t valueType; //!< Type of the value stored in value - nvmlReturn_t nvmlReturn; //!< Return code for retrieving this value. This must be checked before looking at value, as value is undefined if nvmlReturn != NVML_SUCCESS - nvmlValue_t value; //!< Value for this field. This is only valid if nvmlReturn == NVML_SUCCESS -} nvmlFieldValue_t; - -/* End of nvml.h */ -#endif // NCCL_NVML_DIRECT - -constexpr int ncclNvmlMaxDevices = 32; -struct ncclNvmlDeviceInfo { - nvmlDevice_t handle; - int computeCapabilityMajor, computeCapabilityMinor; -}; -struct ncclNvmlDevicePairInfo { - nvmlGpuP2PStatus_t p2pStatusRead, p2pStatusWrite; -}; -extern int ncclNvmlDeviceCount; -extern ncclNvmlDeviceInfo ncclNvmlDevices[ncclNvmlMaxDevices]; -extern ncclNvmlDevicePairInfo ncclNvmlDevicePairs[ncclNvmlMaxDevices][ncclNvmlMaxDevices]; - -// All ncclNvmlFoo() functions call ncclNvmlEnsureInitialized() implicitly. -// Outsiders need only call it if they want to inspect the ncclNvml global -// tables above. 
-ncclResult_t ncclNvmlEnsureInitialized(); - -ncclResult_t ncclNvmlDeviceGetHandleByPciBusId(const char* pciBusId, nvmlDevice_t* device); -ncclResult_t ncclNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index); -ncclResult_t ncclNvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); -ncclResult_t ncclNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); -ncclResult_t ncclNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); -ncclResult_t ncclNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); -ncclResult_t ncclNvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int* major, int* minor); -ncclResult_t ncclNvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t* p2pStatus); -ncclResult_t ncclNvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values); - -#endif // End include guard diff --git a/nvls/nvtx.h b/nvls/nvtx.h deleted file mode 100644 index ab32ef27f..000000000 --- a/nvls/nvtx.h +++ /dev/null @@ -1,85 +0,0 @@ -/************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NVTX_H_ -#define NCCL_NVTX_H_ - -#include "nvtx3/nvtx3.hpp" - -#if __cpp_constexpr >= 201304L && !defined(NVTX3_CONSTEXPR_IF_CPP14) -#define NVTX3_CONSTEXPR_IF_CPP14 constexpr -#else -#define NVTX3_CONSTEXPR_IF_CPP14 -#endif - -// Define all NCCL-provided static schema IDs here (avoid duplicates). -#define NVTX_SID_CommInitRank 0 -#define NVTX_SID_CommInitAll 1 -#define NVTX_SID_CommDestroy 2 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_CommAbort 3 // same schema as NVTX_SID_CommInitRank -#define NVTX_SID_AllGather 4 -#define NVTX_SID_AllReduce 5 -#define NVTX_SID_Broadcast 6 -#define NVTX_SID_ReduceScatter 7 -#define NVTX_SID_Reduce 8 -#define NVTX_SID_Send 9 -#define NVTX_SID_Recv 10 - -// Define static schema ID for the reduction operation. 
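As a small usage sketch of the NVML wrappers declared above (not part of the original header), the snippet below counts a GPU's active NVLink links by first querying NVML_FI_DEV_NVLINK_LINK_COUNT and then probing each link's state. NCCLCHECK is assumed from checks.h, and the value-type handling is simplified.

static ncclResult_t exampleCountActiveNvlinks(unsigned gpuIndex, unsigned* nActive) {
  nvmlDevice_t dev;
  NCCLCHECK(ncclNvmlDeviceGetHandleByIndex(gpuIndex, &dev));
  // Ask NVML how many NVLink links this device exposes.
  nvmlFieldValue_t fv;
  fv.fieldId = NVML_FI_DEV_NVLINK_LINK_COUNT;
  fv.scopeId = 0;
  NCCLCHECK(ncclNvmlDeviceGetFieldValues(dev, 1, &fv));
  unsigned nLinks = (fv.nvmlReturn == NVML_SUCCESS) ? fv.value.uiVal : 0;
  *nActive = 0;
  for (unsigned link = 0; link < nLinks; link++) {
    nvmlEnableState_t isActive;
    NCCLCHECK(ncclNvmlDeviceGetNvLinkState(dev, link, &isActive));
    if (isActive == NVML_FEATURE_ENABLED) (*nActive)++;
  }
  return ncclSuccess;
}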
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 11 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START - -extern const nvtxDomainHandle_t ncclNvtxDomainHandle; - -struct nccl_domain{static constexpr char const* name{"NCCL"};}; - -class payload_schema { - public: - explicit payload_schema(const nvtxPayloadSchemaEntry_t entries[], size_t numEntries, const uint64_t schemaId, const char* schemaName = nullptr) noexcept - { - schema_attr.name = schemaName; - schema_attr.entries = entries; - schema_attr.numEntries = numEntries; - schema_attr.schemaId = schemaId; - nvtxPayloadSchemaRegister(nvtx3::domain::get(), &schema_attr); - } - - payload_schema() = delete; - ~payload_schema() = default; - payload_schema(payload_schema const&) = default; - payload_schema& operator=(payload_schema const&) = default; - payload_schema(payload_schema&&) = default; - payload_schema& operator=(payload_schema&&) = default; - - private: - nvtxPayloadSchemaAttr_t schema_attr{ - NVTX_PAYLOAD_SCHEMA_ATTR_TYPE | - NVTX_PAYLOAD_SCHEMA_ATTR_ENTRIES | - NVTX_PAYLOAD_SCHEMA_ATTR_NUM_ENTRIES | - NVTX_PAYLOAD_SCHEMA_ATTR_STATIC_SIZE | - NVTX_PAYLOAD_SCHEMA_ATTR_SCHEMA_ID, - nullptr, - NVTX_PAYLOAD_SCHEMA_TYPE_STATIC, - NVTX_PAYLOAD_SCHEMA_FLAG_NONE, - nullptr, 0, 0, 0}; -}; - -// Create NVTX push/pop range with parameters -// @param name of the operation (see `NVTX_SID_*`) -// @param N schema name -// @param S schema (entries) -// @param P payload (struct) -#define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ - static const payload_schema schema{S, std::extent::value, \ - NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, #ID}; \ - static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ - nvtxPayloadData_t nvtx3_bpl__[] = { \ - {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ - ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__, nvtx3_bpl__}; \ - ::nvtx3::v1::scoped_range_in const nvtx3_range__{nvtx3_func_attr__}; - -extern void initNvtxRegisteredEnums(); - -#endif diff --git a/nvls/p2p.h b/nvls/p2p.h deleted file mode 100644 index 6ffba4b0e..000000000 --- a/nvls/p2p.h +++ /dev/null @@ -1,29 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#include - -#ifndef NCCL_P2P_H_ -#define NCCL_P2P_H_ - -#define NCCL_P2P_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR - -typedef struct { - uint64_t data; // Needs to hold a CUmemGenericAllocationHandle for UDS fd support -} ncclCuDesc; - -typedef union { - // Legacy CUDA IPC - cudaIpcMemHandle_t devIpc; - // cuMem API support - ncclCuDesc cuDesc; -} ncclIpcDesc; - -ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, void **ptr); -ncclResult_t ncclP2pFreeShareableBuffer(ncclIpcDesc *ipcDesc); -ncclResult_t ncclP2pImportShareableBuffer(struct ncclComm *comm, int tpPeer, size_t size, ncclIpcDesc *ipcDesc, void **devMemPtr); - -#endif diff --git a/nvls/param.h b/nvls/param.h deleted file mode 100644 index 963da9d17..000000000 --- a/nvls/param.h +++ /dev/null @@ -1,30 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. 
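The p2p.h declarations a little further up split buffer sharing into an export half and an import half; the hedged sketch below only shows where each call sits. How the ncclIpcDesc travels between ranks (bootstrap, proxy socket, ...) is out of scope here, and the function names are purely illustrative.

static ncclResult_t exampleExportBuffer(size_t size, ncclIpcDesc* desc, void** localPtr) {
  // Exporting rank: allocate device memory whose handle peers can map later.
  return ncclP2pAllocateShareableBuffer(size, desc, localPtr);
}

static ncclResult_t exampleImportBuffer(struct ncclComm* comm, int tpPeer, size_t size,
                                        ncclIpcDesc* desc, void** mappedPtr) {
  // Importing rank: `desc` is assumed to have been shipped over from the exporter;
  // map the remote allocation into this process. The exporter eventually calls
  // ncclP2pFreeShareableBuffer() on its descriptor.
  return ncclP2pImportShareableBuffer(comm, tpPeer, size, desc, mappedPtr);
}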
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PARAM_H_ -#define NCCL_PARAM_H_ - -#include - -const char* userHomeDir(); -void setEnvFile(const char* fileName); -void initEnv(); -const char *ncclGetEnv(const char *name); - -void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache); - -#define NCCL_PARAM(name, env, deftVal) \ - int64_t ncclParam##name() { \ - constexpr int64_t uninitialized = INT64_MIN; \ - static_assert(deftVal != uninitialized, "default value cannot be the uninitialized value."); \ - static int64_t cache = uninitialized; \ - if (__builtin_expect(__atomic_load_n(&cache, __ATOMIC_RELAXED) == uninitialized, false)) { \ - ncclLoadParam("NCCL_" env, deftVal, uninitialized, &cache); \ - } \ - return cache; \ - } - -#endif diff --git a/nvls/profiler.h b/nvls/profiler.h deleted file mode 100644 index 103af99ad..000000000 --- a/nvls/profiler.h +++ /dev/null @@ -1,37 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROFILER_H_ -#define NCCL_PROFILER_H_ - -#include "proxy.h" - -enum ncclProxyProfileState { - ncclProxyProfileBegin = 0, - - ncclProxyProfileSendGPUWait = 1, - ncclProxyProfileSendWait = 2, - - ncclProxyProfileRecvWait = 1, - ncclProxyProfileRecvFlushWait = 2, - ncclProxyProfileRecvGPUWait = 3, - - ncclProxyProfileEnd = 4, - - ncclProxyProfileSleep = 8, - ncclProxyProfileWakeup = 9, - - ncclProxyProfileIdle = 16, - ncclProxyProfileActive = 17, - - ncclProxyProfileAppend = 24, - ncclProxyProfileAppendEnd = 25 -}; - -ncclResult_t ncclProfilingRecord(struct ncclProxyArgs* args, int sub, int step, int state); -void ncclProfilingDump(); - -#endif diff --git a/nvls/proxy.h b/nvls/proxy.h deleted file mode 100644 index 8093c0ce6..000000000 --- a/nvls/proxy.h +++ /dev/null @@ -1,296 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. 
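To make the NCCL_PARAM macro above concrete: each expansion defines a cached accessor that reads one environment variable on first use. A hypothetical parameter (the name is made up purely for illustration) would be declared and consumed like this:

NCCL_PARAM(ExampleChunkSize, "EXAMPLE_CHUNK_SIZE", 1 << 20);  // reads NCCL_EXAMPLE_CHUNK_SIZE, default 1 MiB

static void exampleUseParam() {
  int64_t chunkSize = ncclParamExampleChunkSize();  // first call loads and caches the value
  (void)chunkSize;
}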
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_PROXY_H_ -#define NCCL_PROXY_H_ - -#include "device.h" -#include "info.h" -#include "socket.h" -#include "ipcsocket.h" -#include "nccl_net.h" -#include -#include "shm.h" -#include "p2p.h" - -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; - -struct ncclProxyArgs; -typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyState*, struct ncclProxyArgs*); - -#define NCCL_PROXY_MAX_SUBS MAXCHANNELS -static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); - -struct ncclProxyOp { - struct ncclProxyConnection* connection; - int channelId; - int nsteps; - ssize_t nbytes; - int root; - int next; - - uint64_t opCount; - int sliceSteps; - int chunkSteps; - int chunkSize; - uint8_t /*ncclDataType_t*/ dtype; - uint8_t /*ncclDevRedOp_t*/ redOp; - uint8_t /*ncclPattern_t*/ pattern; - uint8_t protocol; - - union { - uint64_t unused; - // For use by enqueue.cc - struct ncclProxyOp *enqNext; - }; -}; -static_assert(sizeof(struct ncclProxyOp) == 64, "Keep ProxyOp aligned with cache lines for effective prefetch"); - -struct ncclProxySubArgs { - struct ncclProxyConnection* connection; - int channelId; - int nsteps; - ssize_t nbytes; - int peer; - - int groupSize; // Number of consecutive sub operations sharing the same recvComm - uint64_t base; - uint64_t posted; - uint64_t received; - uint64_t flushed; - uint64_t transmitted; - uint64_t done; - uint64_t end; - void* requests[NCCL_STEPS]; - void* profilingEvents[NCCL_STEPS]; - void* recvRequestsCache[NCCL_STEPS]; - int recvRequestsSubCount; -}; - -struct ncclProxyArgs { - struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; - proxyProgressFunc_t progress; - int nsubs; - int done; - uint64_t opCount; - int sliceSteps; - int chunkSteps; - int chunkSize; - uint8_t /*ncclDataType_t*/ dtype; - uint8_t /*ncclDevRedOp_t*/ redOp; - uint8_t /*ncclPattern_t*/ pattern; - uint8_t protocol; - int state; - char* sharedBuff[NCCL_STEPS]; - int sharedSize[NCCL_STEPS]; - - int idle; - - // Element linking - struct ncclProxyArgs* next; - struct ncclProxyArgs* nextPeer; - struct ncclProxyArgs** proxyAppendPtr; -}; -#define NCCL_MAX_NETDEVS 128 - -// ProxyOps are used to communicate between main thread and service thread -// Make sure we have enough to store two full rounds of operations on all channels. -// Otherwise we'd be unable to post half of them to free new elements. 
-#define MAX_OPS_PER_PEER (2*MAXCHANNELS*NCCL_MAX_WORK_ELEMENTS_P2P) -#define NCCL_MAX_LOCAL_RANKS 64 -struct ncclProxyOpsPool { - struct ncclProxyOp ops[MAX_OPS_PER_PEER*NCCL_MAX_LOCAL_RANKS]; - volatile int nextOps; - volatile int nextOpsEnd; - volatile int freeOps[NCCL_MAX_LOCAL_RANKS]; - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -struct ncclProxyOps { - ncclProxyOpsPool* pool; - ncclShmHandle_t handle; - int count; - int freeOp; - int nextOps; - int nextOpsEnd; -}; - -struct ncclProxySharedP2p { - int refcount; - int size; - char* cudaBuff; - char* hostBuff; - // CUDA IPC - ncclIpcDesc ipcDesc; - struct ncclProxyArgs* proxyAppend[MAXCHANNELS]; // Separate send and recv -}; - -struct ncclProxyPeer { - struct ncclProxySharedP2p send; - struct ncclProxySharedP2p recv; -}; - -struct ncclSharedNetComms { - void* sendComm[MAXCHANNELS]; - void* recvComm[MAXCHANNELS]; - int sendRefCount[MAXCHANNELS]; - int recvRefCount[MAXCHANNELS]; -}; - -struct ncclProxyPool; -struct ncclProxyProgressState { - // Used by main threads to send work to progress thread - struct ncclProxyOpsPool* opsPool; - ncclShmHandle_t handle; - char opsPoolShmSuffix[6]; - - pthread_t thread; - volatile int stop; - struct ncclProxyPeer** localPeers; - struct ncclSharedNetComms* netComms[NCCL_MAX_NETDEVS]; - struct ncclProxyArgs* active; - struct ncclProxyArgs* pool; - struct ncclProxyPool* pools; - int nextOps; -}; - -// Expected proxy response fifo -struct ncclExpectedProxyResponse { - void* opId; - int respSize; - bool done; - void* respBuff; - ncclResult_t res; - struct ncclExpectedProxyResponse* next; -}; - -struct ncclProxyAsyncOp { - int type; - struct ncclProxyConnection* connection; - int reqSize, respSize; - char *reqBuff, *respBuff; - void* opId; - ncclProxyAsyncOp* next; -}; - -struct ncclProxyLocalPeer { - struct ncclSocket sock; - int tpRank; - int tpLocalRank; - ncclProxyAsyncOp* asyncOps; - int asyncOpCounter; -}; - -// Common response header for all proxyOps -// We pack this into a struct to reduce the number of blocking send and recv calls -struct ncclProxyRpcResponseHeader { - void* opId; - ncclResult_t res; - int respSize; -}; - -struct ncclProxyState { - int refCount; - int tpRank; - int tpnRanks; - int tpLocalnRanks; - int cudaDev; - int p2pnChannels; - int p2pChunkSize; - int nChannels; - int buffSizes[NCCL_NUM_PROTOCOLS]; - bool allocP2pNetLLBuffers; - bool dmaBufSupport; - ncclNet_t* ncclNet; - ncclCollNet_t* ncclCollNet; - volatile uint32_t* abortFlag; - // Service thread - pthread_t thread; - struct ncclSocket* listenSock; - int stop; - CUcontext cudaCtx; - ncclResult_t asyncResult; - - // Used by main thread - union ncclSocketAddress* peerAddresses; - struct ncclSocket* peerSocks; - struct ncclProxyOps* proxyOps; - void** sharedDevMems; - struct ncclIpcSocket peerIpcSock; // cuMEM API support (UDS) - - // Progress thread - struct ncclProxyProgressState progressState; - - // Queue of expected responses from the proxy - struct ncclExpectedProxyResponse* expectedResponses; -}; - -enum proxyConnectState { - connUninitialized = 0, - connInitialized = 1, - connSharedInitialized = 2, - connSetupDone = 3, - connConnected = 4, - numConnStates = 5 -}; - -struct ncclProxyConnection { - int send, transport, shared; - int tpLocalRank, sameProcess; - struct ncclSocket* sock; - struct ncclTransportComm* tcomm; - struct ncclProxyArgs *proxyAppend; - struct ncclProxyArgs **proxyAppendPtr; - void* transportResources; - ncclNetDeviceHandle_t* netDeviceHandle; - void* mhandles[NCCL_NUM_PROTOCOLS]; - 
proxyConnectState state; - struct ncclCollNetSharedRes* collNet; - int needsProxyProgress; -}; - -typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); - -enum proxyMode { - proxyRing = 0, - proxyFrom = 1, - proxyTo = 2 -}; - -ncclResult_t ncclProxySaveOp(struct ncclComm* comm, struct ncclProxyOp* proxyOp, bool *justInquire); -ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyOp* proxyOp); -ncclResult_t ncclProxyStart(struct ncclComm* comm); -ncclResult_t ncclProxyInit(struct ncclComm* comm, struct ncclSocket* sock, union ncclSocketAddress* peerAddresses); -ncclResult_t ncclProxyCreate(struct ncclComm* comm); -ncclResult_t ncclProxyConnect(struct ncclComm* comm, int transport, int send, int proxyRank, struct ncclProxyConnector* proxyConn); -enum ncclProxyMsgType { - ncclProxyMsgInit = 1, - ncclProxyMsgSharedInit = 2, - ncclProxyMsgSetup = 3, - ncclProxyMsgConnect = 4, - ncclProxyMsgStart = 5, - ncclProxyMsgClose = 6, - ncclProxyMsgAbort = 7, - ncclProxyMsgStop = 8, - ncclProxyMsgGetFd = 9, // cuMem API support (UDS) -}; - -// This function is called by a client of the proxy that needs to invoke any of the non-progress proxyOp types -// Call this function on the client, supplying a locally unique opId. Then, poll on the return value of -// ncclPollProxyResponse(), supplying the same opId to confirm the operation has completed -ncclResult_t ncclProxyCallAsync(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, int respSize, void* opId); - -// This function will internally call ncclProxyCallAsync() and spin until ncclPollProxyResponse() confirms the result is received -ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, int type, void* reqBuff, int reqSize, void* respBuff, int respSize); -ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void* respBuff, void* opId); - -ncclResult_t ncclProxyClientGetFdBlocking(struct ncclComm* comm, struct ncclProxyConnector* proxyConn, void *handle, int* convertedFd); - -ncclResult_t ncclProxyStop(struct ncclComm* comm); -ncclResult_t ncclProxyShmUnlink(struct ncclComm* comm); -ncclResult_t ncclProxyDestroy(struct ncclComm* comm); -#endif diff --git a/nvls/shm.h b/nvls/shm.h deleted file mode 100644 index e75caa6a6..000000000 --- a/nvls/shm.h +++ /dev/null @@ -1,25 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_SHM_H_ -#define NCCL_SHM_H_ - -#include "nccl.h" - -typedef void* ncclShmHandle_t; -ncclResult_t ncclShmOpen(char* shmPath, size_t shmSize, void** shmPtr, void** devShmPtr, int refcount, ncclShmHandle_t* handle); -ncclResult_t ncclShmClose(ncclShmHandle_t handle); -ncclResult_t ncclShmUnlink(ncclShmHandle_t handle); - -struct ncclShmemCollBuff { - volatile size_t *cnt[2]; - volatile void *ptr[2]; - int round; -}; - -ncclResult_t ncclShmemAllgather(struct ncclComm *comm, struct ncclShmemCollBuff *shmem, void *sendbuff, void *recvbuff, size_t typeSize); - -#endif diff --git a/nvls/socket.h b/nvls/socket.h deleted file mode 100644 index 9e5137289..000000000 --- a/nvls/socket.h +++ /dev/null @@ -1,97 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. 
All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_SOCKET_H_ -#define NCCL_SOCKET_H_ - -#include "nccl.h" -#include -#include -#include -#include -#include -#include - -#define MAX_IFS 16 -#define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // connection retry sleep interval in usec -#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) -#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) -#define SOCKET_NAME_MAXLEN (NI_MAXHOST+NI_MAXSERV) -#define NCCL_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL - -/* Common socket address storage structure for IPv4/IPv6 */ -union ncclSocketAddress { - struct sockaddr sa; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; -}; - -enum ncclSocketState { - ncclSocketStateNone = 0, - ncclSocketStateInitialized = 1, - ncclSocketStateAccepting = 2, - ncclSocketStateAccepted = 3, - ncclSocketStateConnecting = 4, - ncclSocketStateConnectPolling = 5, - ncclSocketStateConnected = 6, - ncclSocketStateReady = 7, - ncclSocketStateClosed = 8, - ncclSocketStateError = 9, - ncclSocketStateNum = 10 -}; - -enum ncclSocketType { - ncclSocketTypeUnknown = 0, - ncclSocketTypeBootstrap = 1, - ncclSocketTypeProxy = 2, - ncclSocketTypeNetSocket = 3, - ncclSocketTypeNetIb = 4 -}; - -struct ncclSocket { - int fd; - int acceptFd; - int timedOutRetries; - int refusedRetries; - union ncclSocketAddress addr; - volatile uint32_t* abortFlag; - int asyncFlag; - enum ncclSocketState state; - int salen; - uint64_t magic; - enum ncclSocketType type; -}; - -const char *ncclSocketToString(union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1); -ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair); -int ncclFindInterfaceMatchSubnet(char* ifNames, union ncclSocketAddress* localAddrs, union ncclSocketAddress* remoteAddr, int ifNameMaxSize, int maxIfs); -int ncclFindInterfaces(char* ifNames, union ncclSocketAddress *ifAddrs, int ifNameMaxSize, int maxIfs); - -// Initialize a socket -ncclResult_t ncclSocketInit(struct ncclSocket* sock, union ncclSocketAddress* addr = NULL, uint64_t magic = NCCL_SOCKET_MAGIC, enum ncclSocketType type = ncclSocketTypeUnknown, volatile uint32_t* abortFlag = NULL, int asyncFlag = 0); -// Create a listening socket. sock->addr can be pre-filled with IP & port info. sock->fd is set after a successful call -ncclResult_t ncclSocketListen(struct ncclSocket* sock); -ncclResult_t ncclSocketGetAddr(struct ncclSocket* sock, union ncclSocketAddress* addr); -// Connect to sock->addr. sock->fd is set after a successful call. -ncclResult_t ncclSocketConnect(struct ncclSocket* sock); -// Return socket connection state. -ncclResult_t ncclSocketReady(struct ncclSocket* sock, int *running); -// Accept an incoming connection from listenSock->fd and keep the file descriptor in sock->fd, with the remote side IP/port in sock->addr. 
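Putting the socket helpers above together, a client-side connection in asynchronous mode might be driven roughly as below; the address and abort flag are assumed to come from the caller, NCCLCHECK is assumed from checks.h, and data exchange plus teardown (ncclSocketSend()/ncclSocketRecv()/ncclSocketClose(), declared just after this point) are left as comments.

static ncclResult_t exampleSocketConnect(union ncclSocketAddress* remoteAddr, volatile uint32_t* abortFlag) {
  struct ncclSocket sock;
  NCCLCHECK(ncclSocketInit(&sock, remoteAddr, NCCL_SOCKET_MAGIC, ncclSocketTypeBootstrap, abortFlag, /*asyncFlag=*/1));
  NCCLCHECK(ncclSocketConnect(&sock));                        // non-blocking in async mode
  int ready = 0;
  while (!ready) NCCLCHECK(ncclSocketReady(&sock, &ready));   // poll until the connection is usable
  // ... ncclSocketSend()/ncclSocketRecv() would move data here, then ncclSocketClose(&sock) ...
  return ncclSuccess;
}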
-ncclResult_t ncclSocketAccept(struct ncclSocket* sock, struct ncclSocket* ulistenSock); -ncclResult_t ncclSocketGetFd(struct ncclSocket* sock, int* fd); -ncclResult_t ncclSocketSetFd(int fd, struct ncclSocket* sock); - -#define NCCL_SOCKET_SEND 0 -#define NCCL_SOCKET_RECV 1 - -ncclResult_t ncclSocketProgress(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); -ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size, int* offset); -ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size); -ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size); -ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking); -ncclResult_t ncclSocketClose(struct ncclSocket* sock); -#endif diff --git a/nvls/strongstream.h b/nvls/strongstream.h deleted file mode 100644 index 0984dfe57..000000000 --- a/nvls/strongstream.h +++ /dev/null @@ -1,140 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_STRONGSTREAM_H_ -#define NCCL_STRONGSTREAM_H_ - -#include "nccl.h" -#include "checks.h" - -#include - -/* ncclCudaGraph: Wraps a cudaGraph_t so that we can support pre-graph CUDA runtimes - * easily. - */ -struct ncclCudaGraph { -#if CUDART_VERSION >= 11030 - cudaGraph_t graph; - unsigned long long graphId; -#endif -}; - -inline struct ncclCudaGraph ncclCudaGraphNone() { - struct ncclCudaGraph tmp; - #if CUDART_VERSION >= 11030 - tmp.graph = nullptr; - tmp.graphId = ULLONG_MAX; - #endif - return tmp; -} - -inline bool ncclCudaGraphValid(struct ncclCudaGraph graph) { - #if CUDART_VERSION >= 11030 - return graph.graph != nullptr; - #else - return false; - #endif -} - -inline bool ncclCudaGraphSame(struct ncclCudaGraph a, struct ncclCudaGraph b) { - #if CUDART_VERSION >= 11030 - return a.graphId == b.graphId; - #else - return true; - #endif -} - -ncclResult_t ncclCudaGetCapturingGraph(struct ncclCudaGraph* graph, cudaStream_t stream); -ncclResult_t ncclCudaGraphAddDestructor(struct ncclCudaGraph graph, cudaHostFn_t fn, void* arg); - -/* ncclStrongStream: An abstraction over CUDA streams that do not lose their - * identity while being captured. Regular streams have the deficiency that the - * captured form of a stream in one graph launch has no relation to the - * uncaptured stream or to the captured form in other graph launches. This makes - * streams unfit for the use of serializing access to a persistent resource. - * Strong streams have been introduced to address this need. - * - * - All updates to a strong stream must be enclosed by a Acquire/Release pair. - * - * - The Acquire, Release, and all updates take a ncclCudaGraph parameter - * indicating the currently capturing graph (or none). This parameter must be - * the same for the entire sequence of {Acquire; ...; Release}. - * - * - An {Acquire; ...; Release} sequence must not be concurrent with any - * other operations against the strong stream including graph launches which - * reference this stream. - */ -struct ncclStrongStream; - -ncclResult_t ncclStrongStreamConstruct(struct ncclStrongStream* ss); -ncclResult_t ncclStrongStreamDestruct(struct ncclStrongStream* ss); - -// Acquire-fence the strong stream. 
-ncclResult_t ncclStrongStreamAcquire( - struct ncclCudaGraph graph, struct ncclStrongStream* ss -); - -// Acquire-fence the strong stream assuming no graph is capturing. This permits -// the caller to enqueue directly to the `ss->cudaStream` member using native CUDA -// calls. Strong stream still must be released via: -// ncclStrongStreamRelease(ncclCudaGraphNone(), ss); -ncclResult_t ncclStrongStreamAcquireUncaptured(struct ncclStrongStream* ss); - -// Release-fence of the strong stream. -ncclResult_t ncclStrongStreamRelease(struct ncclCudaGraph graph, struct ncclStrongStream* ss); - -// Add a host launch to the stream. -ncclResult_t ncclStrongStreamLaunchHost( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - cudaHostFn_t fn, void* arg -); -// Add a kernel launch to the stream. -ncclResult_t ncclStrongStreamLaunchKernel( - struct ncclCudaGraph graph, struct ncclStrongStream* ss, - void* fn, dim3 grid, dim3 block, void** args, size_t sharedMemBytes -); - -// Cause `a` to wait for the current state `b`. Both `a` and `b` must be acquired. -// `b_subsumes_a` indicates that all work in `a` is already present in `b`, thus -// we want to fast-forward `a` to be a clone of `b`. Knowing this permits the -// implementation to induce few graph dependencies. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, struct ncclStrongStream* b, bool b_subsumes_a=false -); -// `b` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, struct ncclStrongStream* a, cudaStream_t b, bool b_subsumes_a=false -); -// `a` must be capturing within `graph`. -ncclResult_t ncclStrongStreamWaitStream( - struct ncclCudaGraph graph, cudaStream_t a, struct ncclStrongStream* b, bool b_subsumes_a=false -); - -// Synchrnoization does not need the strong stream to be acquired. -ncclResult_t ncclStrongStreamSynchronize(struct ncclStrongStream* ss); - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclStrongStreamGraph; // internal to ncclStrongStream - -struct ncclStrongStream { - // Used when not graph capturing. - cudaStream_t cudaStream; -#if CUDART_VERSION >= 11030 - // The event used to establish order between graphs and streams. During acquire - // this event is waited on, during release it is recorded to. - cudaEvent_t serialEvent; - // This stream ever appeared in a graph capture. - bool everCaptured; - // Tracks whether serialEvent needs to be recorded to upon Release(). - bool serialEventNeedsRecord; - struct ncclStrongStreamGraph* graphHead; -#else - cudaEvent_t scratchEvent; -#endif -}; - -#endif diff --git a/nvls/timer.h b/nvls/timer.h deleted file mode 100644 index 284fec6e0..000000000 --- a/nvls/timer.h +++ /dev/null @@ -1,60 +0,0 @@ -/************************************************************************* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
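A condensed sketch of the Acquire/Release discipline described above, loosely modeled on how a collective launch might serialize against a user stream; it is illustrative only, assumes NCCLCHECK from checks.h, and omits the actual kernel/host launches.

static ncclResult_t exampleStrongStreamUse(struct ncclStrongStream* ss, cudaStream_t userStream) {
  struct ncclCudaGraph graph;
  NCCLCHECK(ncclCudaGetCapturingGraph(&graph, userStream));     // is the user stream capturing a graph?
  NCCLCHECK(ncclStrongStreamAcquire(graph, ss));                // begin the {Acquire; ...; Release} sequence
  NCCLCHECK(ncclStrongStreamWaitStream(graph, ss, userStream)); // order our work after the user's stream
  // ... enqueue work with ncclStrongStreamLaunchKernel()/ncclStrongStreamLaunchHost() ...
  NCCLCHECK(ncclStrongStreamWaitStream(graph, userStream, ss)); // hand results back to the user's stream
  NCCLCHECK(ncclStrongStreamRelease(graph, ss));
  return ncclSuccess;
}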
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TIMER_H_ -#define NCCL_TIMER_H_ -#if ENABLE_TIMER -#include -#include -#include -static double freq = -1; -static void calibrate() { - struct timeval tv; - gettimeofday(&tv, NULL); - uint64_t timeCycles = __rdtsc(); - double time = - tv.tv_sec*1E6 - tv.tv_usec; - uint64_t total = 0ULL; - for (int i=0; i<10000; i++) total += __rdtsc(); - gettimeofday(&tv, NULL); - timeCycles = __rdtsc() - timeCycles; - time += tv.tv_sec*1E6 + tv.tv_usec; - freq = timeCycles/time; -} -static inline double gettime() { - if (freq == -1) calibrate(); - return __rdtsc()/freq; -} -static uint64_t counts[8]; -static double times[8]; -static double startTimes[8]; -#define TIME_START(index) do { \ - counts[index]++; \ - startTimes[index] = gettime(); \ -} while (0); - -#define TIME_STOP(index) do { \ - times[index] += gettime() - startTimes[index]; \ -} while (0); - -#define TIME_CANCEL(index) do { \ - counts[index]--; \ -} while (0); - -#define TIME_PRINT(name) do { \ - printf("%s stats", name); \ - for (int i=0; i<8; i++) { \ - if (counts[i]) printf(" [%d] %g/%ld = %g", i, times[i], counts[i], times[i]/counts[i]); \ - counts[i] = 0; \ - } \ - printf("\n"); \ -} while (0); -#else -#define TIME_START(index) while(0); -#define TIME_STOP(index) while(0); -#define TIME_CANCEL(index) while(0); -#define TIME_PRINT(name) -#endif -#endif diff --git a/nvls/transport.h b/nvls/transport.h deleted file mode 100644 index 27529df5e..000000000 --- a/nvls/transport.h +++ /dev/null @@ -1,128 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TRANSPORT_H_ -#define NCCL_TRANSPORT_H_ - -#include "device.h" -#include "graph.h" -#include "nvmlwrap.h" -#include "core.h" - -#define NTRANSPORTS 4 -#define TRANSPORT_P2P 0 -#define TRANSPORT_SHM 1 -#define TRANSPORT_NET 2 -#define TRANSPORT_COLLNET 3 - -#include "proxy.h" - -extern struct ncclTransport p2pTransport; -extern struct ncclTransport shmTransport; -extern struct ncclTransport netTransport; -extern struct ncclTransport collNetTransport; - -extern struct ncclTransport* ncclTransports[]; - -// Forward declarations -struct ncclRing; -struct ncclConnector; -struct ncclComm; - -struct ncclPeerInfo { - int rank; - int cudaDev; - int nvmlDev; - int gdrSupport; - uint64_t hostHash; - uint64_t pidHash; - dev_t shmDev; - int64_t busId; - struct ncclComm* comm; - int cudaCompCap; -}; - -#define CONNECT_SIZE 128 -struct ncclConnect { - char data[CONNECT_SIZE]; -}; - -#if CUDART_VERSION >= 12010 - -#define NVLS_HANDLE_SIZE 64 -struct ncclNvlsSharedRes { - int refCount; - CUmulticastObjectProp properties; - CUmemAccessDesc accessDesc; - int dev; - size_t size; - size_t granularity; - CUmemGenericAllocationHandle mcHandle; // Multicast handle for NVLS buffer - char* mcBuff; // Multicast NVLS buffer address - CUmemGenericAllocationHandle ucHandle; // Unicast Handle for NVLS buffer - char* ucBuff; // Unicast NVLS buffer address - char shareableHandle[NVLS_HANDLE_SIZE]; - size_t ucGran; - int nChannels; - struct ncclShmemCollBuff nvlsShmem; - void *nvlsShmemHandle; -}; - -#endif /* CUDART_VERSION >= 12010 */ - -struct ncclCollNetSharedRes { - int refCount; - int size; - char* cudaBuff; - char* hostBuff; - struct 
ncclProxyArgs* proxyAppend[2*NCCL_MAX_NETDEVS]; - void* resources; - int nChannels; - size_t buffSize; -}; - -struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); - ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); - ncclResult_t (*free)(struct ncclConnector*); - ncclResult_t (*proxySharedInit)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, int nChannels); - ncclResult_t (*proxySetup)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyConnect)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState, void* reqBuff, int reqSize, void* respBuff, int respSize, int* done); - ncclResult_t (*proxyFree)(struct ncclProxyConnection* connection, struct ncclProxyState* proxyState); - ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); -}; - -struct ncclTransport { - const char name[8]; - ncclResult_t (*canConnect)(int*, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*); - struct ncclTransportComm send; - struct ncclTransportComm recv; -}; - -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, int* highestTransportType=NULL); - -// Currently we only support POSIX_FILE_DESCRIPTOR handle exchange -#define USE_POSIX_FD 1 - -#if USE_POSIX_FD -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR -#else -#define NVLS_CU_MEM_HANDLE_TYPE CU_MEM_HANDLE_TYPE_NONE -#endif - -ncclResult_t ncclNvlsInit(struct ncclComm* comm); -ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent); -ncclResult_t ncclNvlsGraphRegisterBuffer(struct ncclComm *comm, struct ncclKernelPlan *plan, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsLocalRegisterBuffer(struct ncclComm *comm, const void *sendbuff, void *recvbuff, size_t sendbuffSize, size_t recvbuffSize, bool *outRegBufUsed, void **outRegBufSend, void **outRegBufRecv); -ncclResult_t ncclNvlsDeregBuffer(CUmemGenericAllocationHandle *mcHandler, CUdeviceptr ptr, int dev, size_t size); -ncclResult_t ncclNvlsFree(struct ncclComm* comm); - -enum { collNetRecv=0, collNetSend=1 }; -int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); -ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); -ncclResult_t ncclTransportCollNetFree(struct ncclComm* comm); -#endif diff --git a/nvls/trees.h b/nvls/trees.h deleted file mode 100644 index ded84a667..000000000 --- a/nvls/trees.h +++ /dev/null @@ -1,13 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_TREES_H_ -#define NCCL_TREES_H_ - -ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0, int* parentChildType); -ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* parentChildType0, int* u1, int* d1_0, int* d1_1, int* parentChildType1); - -#endif diff --git a/nvls/tuner.h b/nvls/tuner.h deleted file mode 100644 index d8b275017..000000000 --- a/nvls/tuner.h +++ /dev/null @@ -1,22 +0,0 @@ -/************************************************************************* - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * Copyright (c) 2023, Meta Platforms, Inc. and affiliates. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_INT_TUNER_H_ -#define NCCL_INT_TUNER_H_ - -#include "nccl_tuner.h" - -// Tuning plugin to override NCCL's default algorithm/protocol tuning. - -// Attempts to load NCCL tuner from environmental variable. -// Returns ncclSuccess if the correct tuner symbol has been found and -// successully loaded. Otherwise returns an error and also logs the error. -ncclResult_t ncclLoadTunerPlugin(ncclTuner_t** tuner); - -// Cleans up NCCL tuner plugin. -ncclResult_t ncclCloseTunerPlugin(ncclTuner_t** tuner); -#endif diff --git a/nvls/utils.h b/nvls/utils.h deleted file mode 100644 index 60f6efb5f..000000000 --- a/nvls/utils.h +++ /dev/null @@ -1,524 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_UTILS_H_ -#define NCCL_UTILS_H_ - -#include "nccl.h" -#include "alloc.h" -#include "checks.h" -#include -#include -#include -#include -#include - -int ncclCudaCompCap(); - -// PCI Bus ID <-> int64 conversion functions -ncclResult_t int64ToBusId(int64_t id, char* busId); -ncclResult_t busIdToInt64(const char* busId, int64_t* id); - -ncclResult_t getBusId(int cudaDev, int64_t *busId); - -ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -uint64_t getHash(const char* string, int n); -uint64_t getHostHash(); -uint64_t getPidHash(); -ncclResult_t getRandomData(void* buffer, size_t bytes); - -struct netIf { - char prefix[64]; - int port; -}; - -int parseStringList(const char* string, struct netIf* ifList, int maxList); -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); - -static long log2i(long n) { - long l = 0; - while (n>>=1) l++; - return l; -} - -inline uint64_t clockNano() { - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return uint64_t(ts.tv_sec)*1000*1000*1000 + ts.tv_nsec; -} - -/* get any bytes of random data from /dev/urandom, return 0 if it succeeds; else - * return -1 */ -inline ncclResult_t getRandomData(void* buffer, size_t bytes) { - ncclResult_t ret = ncclSuccess; - if (bytes > 0) { - const size_t one = 1UL; - FILE* fp = fopen("/dev/urandom", "r"); - if (buffer == NULL || fp == NULL || fread(buffer, bytes, one, fp) != one) ret = ncclSystemError; - if (fp) fclose(fp); - } - return ret; -} - -//////////////////////////////////////////////////////////////////////////////// - -template -inline void ncclAtomicRefCountIncrement(Int* refs) { - __atomic_fetch_add(refs, 1, 
__ATOMIC_RELAXED); -} - -template<typename Int> -inline Int ncclAtomicRefCountDecrement(Int* refs) { - return __atomic_sub_fetch(refs, 1, __ATOMIC_ACQ_REL); -} - -//////////////////////////////////////////////////////////////////////////////// -/* ncclMemoryStack: Pools memory for fast LIFO ordered allocation. Note that - * granularity of LIFO is not per object, instead frames containing many objects - * are pushed and popped. Therefore deallocation is extremely cheap since it's - * done at the frame granularity. - * - * The initial state of the stack is with one frame, the "nil" frame, which - * cannot be popped. Therefore objects allocated in the nil frame cannot be - * deallocated sooner than stack destruction. - */ -struct ncclMemoryStack; - -void ncclMemoryStackConstruct(struct ncclMemoryStack* me); -void ncclMemoryStackDestruct(struct ncclMemoryStack* me); -void ncclMemoryStackPush(struct ncclMemoryStack* me); -void ncclMemoryStackPop(struct ncclMemoryStack* me); -template<typename T> -T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n=1); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclMemoryPool: A free-list of same-sized allocations. It is invalid for - * a pool instance to ever hold objects whose types have differing - * (sizeof(T), alignof(T)) pairs. The underlying memory is supplied by - * a backing `ncclMemoryStack` passed during Alloc(). If memory - * backing any currently held object is deallocated then it is an error to do - * anything other than reconstruct it, after which it is a valid empty pool. - */ -struct ncclMemoryPool; - -// Equivalent to zero-initialization -void ncclMemoryPoolConstruct(struct ncclMemoryPool* me); -template<typename T> -T* ncclMemoryPoolAlloc(struct ncclMemoryPool* me, struct ncclMemoryStack* backing); -template<typename T> -void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj); -void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclIntruQueue: A singly-linked list queue where the per-object next pointer - * field is given via the `next` template argument. - * - * Example: - * struct Foo { - * struct Foo *next1, *next2; // can be a member of two lists at once - * }; - * ncclIntruQueue<Foo, &Foo::next1> list1; - * ncclIntruQueue<Foo, &Foo::next2> list2; - */ -template<typename T, T *T::*next> -struct ncclIntruQueue; - -template<typename T, T *T::*next> -void ncclIntruQueueConstruct(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -bool ncclIntruQueueEmpty(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -T* ncclIntruQueueHead(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -void ncclIntruQueueEnqueue(ncclIntruQueue<T,next> *me, T *x); -template<typename T, T *T::*next> -T* ncclIntruQueueDequeue(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -T* ncclIntruQueueTryDequeue(ncclIntruQueue<T,next> *me); -template<typename T, T *T::*next> -void ncclIntruQueueFreeAll(ncclIntruQueue<T,next> *me, ncclMemoryPool *memPool); - -//////////////////////////////////////////////////////////////////////////////// -/* ncclThreadSignal: Couples a pthread mutex and cond together. The "mutex" - * and "cond" fields are part of the public interface. - */ -struct ncclThreadSignal { - pthread_mutex_t mutex; - pthread_cond_t cond; -}; - -// returns {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER} -constexpr ncclThreadSignal ncclThreadSignalStaticInitializer(); - -void ncclThreadSignalConstruct(struct ncclThreadSignal* me); -void ncclThreadSignalDestruct(struct ncclThreadSignal* me); - -// A convenience instance per-thread.
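Spelling out the ncclIntruQueue usage pattern sketched in the comment above: the element type embeds its own link pointer and the queue is parameterized on that member. The type and values below are made up purely for illustration.

struct ExampleTask {
  int id;
  struct ExampleTask* next;   // intrusive link consumed by the queue below
};

static void exampleQueueUse(struct ExampleTask* a, struct ExampleTask* b) {
  ncclIntruQueue<ExampleTask, &ExampleTask::next> q;
  ncclIntruQueueConstruct(&q);
  ncclIntruQueueEnqueue(&q, a);
  ncclIntruQueueEnqueue(&q, b);
  while (!ncclIntruQueueEmpty(&q)) {
    struct ExampleTask* t = ncclIntruQueueDequeue(&q);  // FIFO: a first, then b
    (void)t;
  }
}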
-extern __thread struct ncclThreadSignal ncclThreadSignalLocalInstance; - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueueMpsc; - -template -void ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me); -template -bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me); -// Enqueue element. Returns true if queue is not abandoned. Even if queue is -// abandoned the element enqueued, so the caller needs to make arrangements for -// the queue to be tended. -template -bool ncclIntruQueueMpscEnqueue(struct ncclIntruQueueMpsc* me, T* x); -// Dequeue all elements at a glance. If there aren't any and `waitSome` is -// true then this call will wait until it can return a non empty list. -template -T* ncclIntruQueueMpscDequeueAll(struct ncclIntruQueueMpsc* me, bool waitSome); -// Dequeue all elements and set queue to abandoned state. -template -T* ncclIntruQueueMpscAbandon(struct ncclIntruQueueMpsc* me); - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclMemoryStack { - struct Hunk { - struct Hunk* above; // reverse stack pointer - size_t size; // size of this allocation (including this header struct) - }; - struct Unhunk { // proxy header for objects allocated out-of-hunk - struct Unhunk* next; - void* obj; - }; - struct Frame { - struct Hunk* hunk; // top of non-empty hunks - uintptr_t bumper, end; // points into top hunk - struct Unhunk* unhunks; - struct Frame* below; - }; - - static void* allocateSpilled(struct ncclMemoryStack* me, size_t size, size_t align); - static void* allocate(struct ncclMemoryStack* me, size_t size, size_t align); - - struct Hunk stub; - struct Frame topFrame; -}; - -inline void ncclMemoryStackConstruct(struct ncclMemoryStack* me) { - me->stub.above = nullptr; - me->stub.size = 0; - me->topFrame.hunk = &me->stub; - me->topFrame.bumper = 0; - me->topFrame.end = 0; - me->topFrame.unhunks = nullptr; - me->topFrame.below = nullptr; -} - -inline void* ncclMemoryStack::allocate(struct ncclMemoryStack* me, size_t size, size_t align) { - uintptr_t o = (me->topFrame.bumper + align-1) & -uintptr_t(align); - void* obj; - if (__builtin_expect(o + size <= me->topFrame.end, true)) { - me->topFrame.bumper = o + size; - obj = reinterpret_cast(o); - } else { - obj = allocateSpilled(me, size, align); - } - return obj; -} - -template -inline T* ncclMemoryStackAlloc(struct ncclMemoryStack* me, size_t n) { - void *obj = ncclMemoryStack::allocate(me, n*sizeof(T), alignof(T)); - memset(obj, 0, n*sizeof(T)); - return (T*)obj; -} - -inline void ncclMemoryStackPush(struct ncclMemoryStack* me) { - using Frame = ncclMemoryStack::Frame; - Frame tmp = me->topFrame; - Frame* snapshot = (Frame*)ncclMemoryStack::allocate(me, sizeof(Frame), alignof(Frame)); - *snapshot = tmp; // C++ struct assignment - me->topFrame.unhunks = nullptr; - me->topFrame.below = snapshot; -} - -inline void ncclMemoryStackPop(struct ncclMemoryStack* me) { - ncclMemoryStack::Unhunk* un = me->topFrame.unhunks; - while (un != nullptr) { - free(un->obj); - un = un->next; - } - me->topFrame = *me->topFrame.below; // C++ struct assignment -} - - -//////////////////////////////////////////////////////////////////////////////// - -struct ncclMemoryPool { - struct Cell { - Cell *next; - }; - struct Cell* head; - struct Cell* tail; // meaningful only when head != nullptr -}; - -inline void ncclMemoryPoolConstruct(struct ncclMemoryPool* me) { - me->head = nullptr; -} - -template -inline T* ncclMemoryPoolAlloc(struct 
ncclMemoryPool* me, struct ncclMemoryStack* backing) { - using Cell = ncclMemoryPool::Cell; - Cell* cell; - if (__builtin_expect(me->head != nullptr, true)) { - cell = me->head; - me->head = cell->next; - } else { - // Use the internal allocate() since it doesn't memset to 0 yet. - size_t cellSize = std::max(sizeof(Cell), sizeof(T)); - size_t cellAlign = std::max(alignof(Cell), alignof(T)); - cell = (Cell*)ncclMemoryStack::allocate(backing, cellSize, cellAlign); - } - memset(cell, 0, sizeof(T)); - return reinterpret_cast(cell); -} - -template -inline void ncclMemoryPoolFree(struct ncclMemoryPool* me, T* obj) { - using Cell = ncclMemoryPool::Cell; - Cell* cell = reinterpret_cast(obj); - cell->next = me->head; - if (me->head == nullptr) me->tail = cell; - me->head = cell; -} - -inline void ncclMemoryPoolTakeAll(struct ncclMemoryPool* me, struct ncclMemoryPool* from) { - if (from->head != nullptr) { - from->tail->next = me->head; - if (me->head == nullptr) me->tail = from->tail; - me->head = from->head; - from->head = nullptr; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueue { - T *head, *tail; -}; - -template -inline void ncclIntruQueueConstruct(ncclIntruQueue *me) { - me->head = nullptr; - me->tail = nullptr; -} - -template -inline bool ncclIntruQueueEmpty(ncclIntruQueue *me) { - return me->head == nullptr; -} - -template -inline T* ncclIntruQueueHead(ncclIntruQueue *me) { - return me->head; -} - -template -inline T* ncclIntruQueueTail(ncclIntruQueue *me) { - return me->tail; -} - -template -inline void ncclIntruQueueEnqueue(ncclIntruQueue *me, T *x) { - x->*next = nullptr; - (me->head ? me->tail->*next : me->head) = x; - me->tail = x; -} - -template -inline T* ncclIntruQueueDequeue(ncclIntruQueue *me) { - T *ans = me->head; - me->head = ans->*next; - if (me->head == nullptr) me->tail = nullptr; - return ans; -} - -template -inline bool ncclIntruQueueDelete(ncclIntruQueue *me, T *x) { - T *prev = nullptr; - T *cur = me->head; - bool found = false; - - while (cur) { - if (cur == x) { - found = true; - break; - } - prev = cur; - cur = cur->*next; - } - - if (found) { - if (prev == nullptr) - me->head = cur->*next; - else - prev->*next = cur->*next; - if (cur == me->tail) - me->tail = prev; - } - return found; -} - -template -inline T* ncclIntruQueueTryDequeue(ncclIntruQueue *me) { - T *ans = me->head; - if (ans != nullptr) { - me->head = ans->*next; - if (me->head == nullptr) me->tail = nullptr; - } - return ans; -} - -template -void ncclIntruQueueFreeAll(ncclIntruQueue *me, ncclMemoryPool *pool) { - T *head = me->head; - me->head = nullptr; - me->tail = nullptr; - while (head != nullptr) { - T *tmp = head->*next; - ncclMemoryPoolFree(pool, tmp); - head = tmp; - } -} - -//////////////////////////////////////////////////////////////////////////////// - -constexpr ncclThreadSignal ncclThreadSignalStaticInitializer() { - return {PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER}; -} - -inline void ncclThreadSignalConstruct(struct ncclThreadSignal* me) { - pthread_mutex_init(&me->mutex, nullptr); - pthread_cond_init(&me->cond, nullptr); -} - -inline void ncclThreadSignalDestruct(struct ncclThreadSignal* me) { - pthread_mutex_destroy(&me->mutex); - pthread_cond_destroy(&me->cond); -} - -//////////////////////////////////////////////////////////////////////////////// - -template -struct ncclIntruQueueMpsc { - T* head; - uintptr_t tail; - struct ncclThreadSignal* waiting; -}; - -template -void 
ncclIntruQueueMpscConstruct(struct ncclIntruQueueMpsc* me) { - me->head = nullptr; - me->tail = 0x0; - me->waiting = nullptr; -} - -template -bool ncclIntruQueueMpscEmpty(struct ncclIntruQueueMpsc* me) { - return __atomic_load_n(&me->tail, __ATOMIC_RELAXED) <= 0x2; -} - -template -bool ncclIntruQueueMpscEnqueue(ncclIntruQueueMpsc* me, T* x) { - __atomic_store_n(&(x->*next), nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, reinterpret_cast(x), __ATOMIC_ACQ_REL); - T* prev = reinterpret_cast(utail); - T** prevNext = utail <= 0x2 ? &me->head : &(prev->*next); - __atomic_store_n(prevNext, x, __ATOMIC_RELAXED); - if (utail == 0x1) { // waiting - __atomic_thread_fence(__ATOMIC_ACQUIRE); // to see me->waiting - // This lock/unlock is essential to ensure we don't race ahead of the consumer - // and signal the cond before they begin waiting on it. - struct ncclThreadSignal* waiting = me->waiting; - pthread_mutex_lock(&waiting->mutex); - pthread_mutex_unlock(&waiting->mutex); - pthread_cond_broadcast(&waiting->cond); - } - return utail != 0x2; // not abandoned -} - -template -T* ncclIntruQueueMpscDequeueAll(ncclIntruQueueMpsc* me, bool waitSome) { - T* head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - if (head == nullptr) { - if (!waitSome) return nullptr; - uint64_t t0 = clockNano(); - bool sleeping = false; - do { - if (clockNano()-t0 >= 10*1000) { // spin for first 10us - struct ncclThreadSignal* waitSignal = &ncclThreadSignalLocalInstance; - pthread_mutex_lock(&waitSignal->mutex); - uintptr_t expected = sleeping ? 0x1 : 0x0; - uintptr_t desired = 0x1; - me->waiting = waitSignal; // release done by successful compare exchange - if (__atomic_compare_exchange_n(&me->tail, &expected, desired, /*weak=*/true, __ATOMIC_RELEASE, __ATOMIC_RELAXED)) { - sleeping = true; - pthread_cond_wait(&waitSignal->cond, &waitSignal->mutex); - } - pthread_mutex_unlock(&waitSignal->mutex); - } - head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - } while (head == nullptr); - } - - __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, 0x0, __ATOMIC_ACQ_REL); - T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); - T *x = head; - while (x != tail) { - T *x1; - int spins = 0; - while (true) { - x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); - if (x1 != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - x = x1; - } - return head; -} - -template -T* ncclIntruQueueMpscAbandon(ncclIntruQueueMpsc* me) { - uintptr_t expected = 0x0; - if (__atomic_compare_exchange_n(&me->tail, &expected, /*desired=*/0x2, /*weak=*/true, __ATOMIC_RELAXED, __ATOMIC_RELAXED)) { - return nullptr; - } else { - int spins = 0; - T* head; - while (true) { - head = __atomic_load_n(&me->head, __ATOMIC_RELAXED); - if (head != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - __atomic_store_n(&me->head, nullptr, __ATOMIC_RELAXED); - uintptr_t utail = __atomic_exchange_n(&me->tail, 0x2, __ATOMIC_ACQ_REL); - T* tail = utail <= 0x2 ? nullptr : reinterpret_cast(utail); - T *x = head; - while (x != tail) { - T *x1; - spins = 0; - while (true) { - x1 = __atomic_load_n(&(x->*next), __ATOMIC_RELAXED); - if (x1 != nullptr) break; - if (++spins == 1024) { spins = 1024-1; sched_yield(); } - } - x = x1; - } - return head; - } -} -#endif From c062afecfa25d63e3491c047c9de704a13963d5b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 14 Jan 2024 22:34:50 -0500 Subject: [PATCH 12/67] possibly compilable? 
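In rough terms, this patch replaces the multi-endpoint NvlsConnection constructor with a pairwise root/non-root model: the root's Endpoint::Impl creates the multicast handle, exports it as a POSIX file descriptor, and serializes the (rootPid, fd) pair so non-root ranks can import it. The descriptor hand-off it relies on reduces to the sketch below (illustrative only; it assumes Linux 5.6+ for pidfd_getfd plus sufficient ptrace permission, and the variable names rootPid/rootFd are made up).

    #include <sys/syscall.h>
    #include <unistd.h>
    #include <cuda.h>

    // On a non-root rank, given the root's pid and the fd number it exported:
    int rootPidFd = syscall(SYS_pidfd_open, rootPid, 0);             // handle to root process
    int localFd = syscall(SYS_pidfd_getfd, rootPidFd, rootFd, 0);    // dup of root's exported fd
    CUmemGenericAllocationHandle mcHandle;
    cuMemImportFromShareableHandle(&mcHandle, (void*)(uintptr_t)localFd,
                                   CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
    close(rootPidFd);

This mirrors the deserializing Endpoint::Impl constructor later in the patch.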
--- include/mscclpp/core.hpp | 3 ++- src/connection.cc | 52 +++++++++++--------------------------- src/context.cc | 2 ++ src/endpoint.cc | 22 +++++++++++----- src/include/connection.hpp | 3 +-- src/include/context.hpp | 1 + src/include/endpoint.hpp | 3 ++- 7 files changed, 38 insertions(+), 48 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 1d12a4083..2b8807221 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,6 +462,7 @@ struct EndpointConfig { int ibMaxWrPerSend = DefaultMaxWrPerSend; size_t nvlsBufferSize; + int nvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -474,7 +475,7 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) { + EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { if (!AllNvlsTransports.has(transport)) { throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); } diff --git a/src/connection.cc b/src/connection.cc index a2a2f12f9..bcab9a829 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,49 +94,27 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, - bool isRoot) - : isRoot_(isRoot) { - if (localEndpoint.transport() != Transport::Nvls) { - throw mscclpp::Error("NVLS connection can only be made from a NVLS endpoint", ErrorCode::InvalidUsage); +NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) { + if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { + throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } - for (auto remoteEndpoint : remoteEndpoints) { - if (remoteEndpoint.transport() != Transport::Nvls) { - throw mscclpp::Error("NVLS connection can only be made to a NVLS endpoint", ErrorCode::InvalidUsage); - } - // sanity check: make sure the IPC connection is being made within a node - if (getImpl(remoteEndpoint)->hostHash_ != getImpl(localEndpoint)->hostHash_) { - std::stringstream ss; - ss << "NVLS connection can only be made within a node: " << std::hex << getImpl(remoteEndpoint)->hostHash_ - << " != " << std::hex << getImpl(localEndpoint)->hostHash_; - throw mscclpp::Error(ss.str(), ErrorCode::InvalidUsage); - } - } - int nDevices = 1 + remoteEndpoints.size(); - MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId_)); - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nDevices; - mcProp.size = bufferSize; - mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - size_t minGran = 0; - size_t gran = 0; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - // only root needs to create the multicast handle - if (isRoot_) { - size_t mcSize = ((bufferSize + gran - 1) / gran) * gran; - mcProp.size = mcSize; - - MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp)); + if (localEndpoint.transport() == Transport::NvlsRoot && 
remoteEndpoint.transport() == Transport::NvlsRoot) { + throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); } + mcHandle_ = localEndpoint.pimpl_.mcHandle_; + size_t bufferSize = localEndpoint.pimpl_.mcProp_; + + int cudaDeviceId; + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + + // Allocate physical memory CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = cudaDeviceId_; + prop.location.id = cudaDeviceId; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // allocate physical memory (data buffer) @@ -148,7 +126,7 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, std::vector rem accessDesc.location.id = cudaDeviceId_; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, minGran, 0U, 0)); + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); diff --git a/src/context.cc b/src/context.cc index d04a8e32c..afd7eba6c 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,6 +49,8 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); + } else if (AllNvlsTransports.has(localEndpoint) && AllNvlsTransports.has(remoteEndpoint)) { + conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); } diff --git a/src/endpoint.cc b/src/endpoint.cc index 350cba07e..4a4bfbc02 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -21,6 +21,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) minMcGran_ = 0; mcGran_ = 0; mcProp_.size = config.nvlsBufferSize; + mcProp_.numDevices = config.nvlsNumDevices; mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); @@ -29,8 +30,9 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) if (transport_ == Transport::NvlsRoot){ MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); - fileDesc_ = 0; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&fileDesc_, handle, handleType, 0 /*flags*/)); + mcFileDesc_ = 0; + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&mcFileDesc_, handle, handleType, 0 /*flags*/)); + rootPid_ = getpid(); } } } @@ -46,7 +48,8 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { } if (transport_ == Transport::NvlsRoot) { - std::copy_n(reinterpret_cast(&pimpl_->fileDesc_), sizeof(pimpl_->fileDesc_), std::back_inserter(data)); + std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(data)); + std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(data)); } return data; } @@ -67,10 +70,15 @@ Endpoint::Impl::Impl(const 
std::vector& serialization) { it += sizeof(ibQpInfo_); } if (transport_ == Transport::NvlsNonRoot) { - fileDesc_ = 0; - std::copy_n(it, sizeof(fileDesc_), reinterpret_cast(&fileDesc_)); - it += sizeof(fileDesc_); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)fileDesc_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + mcFileDesc_ = 0; + std::copy_n(it, sizeof(mcFileDesc_), reinterpret_cast(&mcFileDesc_)); + it += sizeof(mcFileDesc_); + std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&mcFileDesc_)); + it += sizeof(rootPid_); + int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(rootPidFd); } } diff --git a/src/include/connection.hpp b/src/include/connection.hpp index f15283b28..448fdfe7b 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,11 +32,10 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { - int cudaDeviceId_; - bool isRoot_; CUmemGenericAllocationHandle mcHandle_; CUmemGenericAllocationHandle memHandle_; void* deviceBuffer_; + size_t bufferSize_; public: NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); diff --git a/src/include/context.hpp b/src/include/context.hpp index 39a699560..abb95b27d 100644 --- a/src/include/context.hpp +++ b/src/include/context.hpp @@ -17,6 +17,7 @@ struct Context::Impl { std::vector> connections_; std::unordered_map> ibContexts_; CudaStreamWithFlags ipcStream_; + CUmemGenericAllocationHandle mcHandle_; Impl(); diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 00322674e..0d8f86bcd 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -28,7 +28,8 @@ struct Endpoint::Impl { CUmemGenericAllocationHandle mcHandle_; size_t minMcGran_; size_t mcGran_; - int fileDesc_; + pid_t rootPid_; + int mcFileDesc_; }; } // namespace mscclpp From a6e9af54f16649186bc897995e589060681b6a2b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:11:32 +0000 Subject: [PATCH 13/67] lint --- include/mscclpp/core.hpp | 5 +++-- nvls/test.cu | 2 +- src/connection.cc | 7 ++++--- src/endpoint.cc | 5 +++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 2b8807221..62e772d8e 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -139,7 +139,7 @@ enum class Transport { }; const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", "IB0", "IB1", "IB2", - "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { const size_t TransportFlagsSize = 13; @@ -475,7 +475,8 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { + EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) + : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { if (!AllNvlsTransports.has(transport)) { throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); } diff --git 
a/nvls/test.cu b/nvls/test.cu index bbbc3e391..bfb29c15c 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -175,7 +175,7 @@ int main() { cudaDeviceSynchronize(); MPI_Barrier(MPI_COMM_WORLD); - for (int input_size = 1024*3; input_size <= size; input_size *= 2){ + for (int input_size = 1024; input_size <= size; input_size *= 2){ // warmup for (int i = 0; i < rept; i++) { testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); diff --git a/src/connection.cc b/src/connection.cc index bcab9a829..76260051e 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -99,7 +99,8 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } if (localEndpoint.transport() == Transport::NvlsRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { - throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); + throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", + ErrorCode::InvalidUsage); } mcHandle_ = localEndpoint.pimpl_.mcHandle_; @@ -109,7 +110,6 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); - // Allocate physical memory CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; @@ -126,7 +126,8 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) accessDesc.location.id = cudaDeviceId_; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW( + cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); diff --git a/src/endpoint.cc b/src/endpoint.cc index 4a4bfbc02..fbf512778 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -27,7 +27,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; // create the mc handle now only on the root - if (transport_ == Transport::NvlsRoot){ + if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; @@ -77,7 +77,8 @@ Endpoint::Impl::Impl(const std::vector& serialization) { it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW( + cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); } } From d958a31517b80c9f833e002fb8f268a7b2a67ffd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:42:26 +0000 Subject: [PATCH 14/67] compiles --- include/mscclpp/core.hpp | 9 +++------ src/connection.cc | 17 +++++++++-------- 
src/context.cc | 2 +- src/endpoint.cc | 16 ++++++++++------ src/include/connection.hpp | 4 +++- src/include/endpoint.hpp | 2 ++ 6 files changed, 28 insertions(+), 22 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 62e772d8e..36fa9d298 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -142,7 +142,7 @@ const std::string TransportNames[] = {"UNK", "IPC", "NVLSROOT", "NVLSNONROOT", " "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { -const size_t TransportFlagsSize = 13; +const size_t TransportFlagsSize = 12; static_assert(TransportFlagsSize == static_cast(Transport::NumTransports), "TransportFlagsSize must match the number of transports"); /// Bitset for storing transport flags. @@ -399,6 +399,7 @@ class Endpoint { friend class Context; friend class Connection; + friend class NvlsConnection; }; /// Represents a connection between two processes. @@ -476,11 +477,7 @@ struct EndpointConfig { /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device EndpointConfig(Transport transport, size_t nvlsBufferSize, int nvlsNumDevices) - : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) { - if (!AllNvlsTransports.has(transport)) { - throw Error("This EndpointConfig is only NVLS!", ErrorCode::InvalidUsage); - } - } + : transport(transport), nvlsBufferSize(nvlsBufferSize), nvlsNumDevices(nvlsNumDevices) {} }; /// Represents a context for communication. This provides a low-level interface for forming connections in use-cases diff --git a/src/connection.cc b/src/connection.cc index 76260051e..1fb13ab49 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -94,7 +94,8 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) { +NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint) + : transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()) { if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } @@ -103,12 +104,12 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) ErrorCode::InvalidUsage); } - mcHandle_ = localEndpoint.pimpl_.mcHandle_; - size_t bufferSize = localEndpoint.pimpl_.mcProp_; + mcHandle_ = localEndpoint.pimpl_->mcHandle_; + size_t bufferSize = localEndpoint.pimpl_->mcProp_.size; int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - MSCCLPP_CUDATHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); // Allocate physical memory CUmemAllocationProp prop = {}; @@ -123,11 +124,11 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) // usual VA business: map both MC and PA to two different VA addresses CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = cudaDeviceId_; + accessDesc.location.id = cudaDeviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // Map a VA to UC space MSCCLPP_CUTHROW( - cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_.mcProp_.minMcGran_, 0U, 0)); + cuMemAddressReserve((CUdeviceptr*)&deviceBuffer_, bufferSize, localEndpoint.pimpl_->minMcGran_, 0U, 0)); 
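   // (annotation, not part of the patch: the calls around this point follow the usual CUDA
   //  VMM recipe, i.e. reserve a VA range aligned to the multicast minimum granularity,
   //  map the physical allocation (memHandle_) into it, then grant the local device
   //  read/write access. This maps only the unicast view of the buffer; binding the buffer
   //  into the multicast group, e.g. via cuMulticastBindMem, does not appear in this
   //  revision yet.)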
MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)deviceBuffer_, bufferSize, 0, memHandle_, 0)); // set access on UC address MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)deviceBuffer_, bufferSize, &accessDesc, 1)); @@ -135,9 +136,9 @@ NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoints) INFO(MSCCLPP_P2P, "NVLS connection created"); } -Transport NvlsConnection::transport() { return Transport::Nvls; } +Transport NvlsConnection::transport() { return transport_; } -Transport NvlsConnection::remoteTransport() { return Transport::Nvls; } +Transport NvlsConnection::remoteTransport() { return remoteTransport_; } void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); diff --git a/src/context.cc b/src/context.cc index afd7eba6c..f75473487 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,7 +49,7 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); - } else if (AllNvlsTransports.has(localEndpoint) && AllNvlsTransports.has(remoteEndpoint)) { + } else if (AllNvlsTransports.has(localEndpoint.transport()) && AllNvlsTransports.has(remoteEndpoint.transport())) { conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); diff --git a/src/endpoint.cc b/src/endpoint.cc index fbf512778..3b740dd6a 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -1,5 +1,8 @@ #include "endpoint.hpp" +#include +#include + #include #include "api.h" @@ -22,16 +25,17 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) mcGran_ = 0; mcProp_.size = config.nvlsBufferSize; mcProp_.numDevices = config.nvlsNumDevices; - mcProp_.handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &config.mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; // create the mc handle now only on the root if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; - MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&mcFileDesc_, handle, handleType, 0 /*flags*/)); + MSCCLPP_CUTHROW( + cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); rootPid_ = getpid(); } } @@ -47,7 +51,7 @@ MSCCLPP_API_CPP std::vector Endpoint::serialize() { std::copy_n(reinterpret_cast(&pimpl_->ibQpInfo_), sizeof(pimpl_->ibQpInfo_), std::back_inserter(data)); } - if (transport_ == Transport::NvlsRoot) { + if (pimpl_->transport_ == Transport::NvlsRoot) { std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(data)); std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(data)); } @@ -76,7 +80,7 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(rootPid_), 
reinterpret_cast(&mcFileDesc_)); it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + size_t mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); MSCCLPP_CUTHROW( cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); diff --git a/src/include/connection.hpp b/src/include/connection.hpp index 448fdfe7b..6c45d2c14 100644 --- a/src/include/connection.hpp +++ b/src/include/connection.hpp @@ -32,13 +32,15 @@ class CudaIpcConnection : public Connection { }; class NvlsConnection : public Connection { + Transport transport_; + Transport remoteTransport_; CUmemGenericAllocationHandle mcHandle_; CUmemGenericAllocationHandle memHandle_; void* deviceBuffer_; size_t bufferSize_; public: - NvlsConnection(Endpoint localEndpoint, std::vector remoteEndpoints, size_t bufferSize, bool isRoot); + NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint); Transport transport() override; diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index 0d8f86bcd..fc5b00bfa 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -4,6 +4,8 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ +#include + #include #include From 87a293fa39fbe601cbd8d683842461fbd2d7a52f Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 18:55:55 +0000 Subject: [PATCH 15/67] wip --- python/mscclpp/core_py.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 4e92f8841..ab865bd1d 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -72,6 +72,8 @@ void register_core(nb::module_& m) { nb::enum_(m, "Transport") .value("Unknown", Transport::Unknown) .value("CudaIpc", Transport::CudaIpc) + .value("NvlsRoot", Transport::NvlsRoot) + .value("NvlsNonRoot", Transport::NvlsNonRoot) .value("IB0", Transport::IB0) .value("IB1", Transport::IB1) .value("IB2", Transport::IB2) From f1dfc0dd3edf8775f15a65cca8e9a3796f172260 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Wed, 17 Jan 2024 20:01:10 +0000 Subject: [PATCH 16/67] wip --- python/test/test_mscclpp.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index f3a7f9dd6..9aef18802 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -118,14 +118,19 @@ def init_target(): def create_and_connect(mpi_group: MpiGroup, transport: str): - if transport == "NVLink" and all_ranks_on_the_same_node(mpi_group) is False: - pytest.skip("cannot use nvlink for cross node") + if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvlink/nvls for cross node") group = mscclpp_comm.CommGroup(mpi_group.comm) remote_nghrs = list(range(mpi_group.comm.size)) remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc + elif tranport == "NVLS": + if group.rank == 0: + tran = Transport.NvlsRoot + else: + tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) else: @@ -522,3 +527,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u proxy_service.stop_proxy() group.barrier() assert cp.array_equal(memory, memory_expected) + +@parametrize_mpi_groups(2, 4, 8, 16) +def test_simple_proxy_channel(mpi_group: 
MpiGroup): + group, connections = create_and_connect(mpi_group, "NVLS") From 76a6dd52573d32c8304c1b579a6b24c3f6579940 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 00:02:16 +0000 Subject: [PATCH 17/67] wip --- include/mscclpp/core.hpp | 6 ++++-- python/mscclpp_benchmark/allreduce_bench.py | 2 +- python/test/test_mscclpp.py | 5 +++-- src/connection.cc | 7 +++++-- src/endpoint.cc | 12 ++++++++---- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 36fa9d298..0245b2b95 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -455,6 +455,8 @@ struct EndpointConfig { static const int DefaultMaxCqPollNum = 1; static const int DefaultMaxSendWr = 8192; static const int DefaultMaxWrPerSend = 64; + static const int DefaultNvlsNumDevices = 8; + static const int DefaultNvlsBufferSize = (1 << 29); Transport transport; int ibMaxCqSize = DefaultMaxCqSize; @@ -462,8 +464,8 @@ struct EndpointConfig { int ibMaxSendWr = DefaultMaxSendWr; int ibMaxWrPerSend = DefaultMaxWrPerSend; - size_t nvlsBufferSize; - int nvlsNumDevices; + size_t nvlsBufferSize = DefaultNvlsBufferSize; + int nvlsNumDevices = DefaultNvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 80aa5e93a..9c5e7ca84 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -247,7 +247,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 29): + for i in range(10, 25): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 9aef18802..a99676603 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -126,10 +126,11 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc - elif tranport == "NVLS": - if group.rank == 0: + elif transport == "NVLS": + if mpi_group.comm.rank == 0: tran = Transport.NvlsRoot else: + remote_nghrs = [0] tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) diff --git a/src/connection.cc b/src/connection.cc index 1fb13ab49..f3d2fe4dc 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -96,20 +96,23 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { NvlsConnection::NvlsConnection(Endpoint localEndpoint, Endpoint remoteEndpoint) : transport_(localEndpoint.transport()), remoteTransport_(remoteEndpoint.transport()) { - if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { + if (localEndpoint.transport() == Transport::NvlsNonRoot && remoteEndpoint.transport() == Transport::NvlsNonRoot) { throw mscclpp::Error("NVLS connection must be made with a NVLS root", ErrorCode::InvalidUsage); } if (localEndpoint.transport() == Transport::NvlsRoot && remoteEndpoint.transport() == Transport::NvlsRoot) { throw mscclpp::Error("NVLS connection on root must have both local and remote root NVLS transport", ErrorCode::InvalidUsage); } - + printf("here0\n"); mcHandle_ = localEndpoint.pimpl_->mcHandle_; size_t bufferSize = localEndpoint.pimpl_->mcProp_.size; int cudaDeviceId; + printf("here1\n"); 
MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + printf("here1.5 %d %d\n", (int)mcHandle_, cudaDeviceId); MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + printf("here2\n"); // Allocate physical memory CUmemAllocationProp prop = {}; diff --git a/src/endpoint.cc b/src/endpoint.cc index 3b740dd6a..2d20f9a72 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -23,12 +23,14 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) if (AllNvlsTransports.has(transport_)) { minMcGran_ = 0; mcGran_ = 0; + mcProp_ = {}; mcProp_.size = config.nvlsBufferSize; mcProp_.numDevices = config.nvlsNumDevices; mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + printf("---> %ld %ld | %lld %lld\n", mcProp_.size, mcProp_.numDevices, mcGran_, minMcGran_); // create the mc handle now only on the root if (transport_ == Transport::NvlsRoot) { MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); @@ -37,6 +39,7 @@ Endpoint::Impl::Impl(EndpointConfig config, Context::Impl& contextImpl) MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); rootPid_ = getpid(); + printf("LLLLLLL %lld %lld\n", mcFileDesc_, rootPid_); } } } @@ -73,17 +76,18 @@ Endpoint::Impl::Impl(const std::vector& serialization) { std::copy_n(it, sizeof(ibQpInfo_), reinterpret_cast(&ibQpInfo_)); it += sizeof(ibQpInfo_); } - if (transport_ == Transport::NvlsNonRoot) { + if (transport_ == Transport::NvlsRoot) { mcFileDesc_ = 0; std::copy_n(it, sizeof(mcFileDesc_), reinterpret_cast(&mcFileDesc_)); it += sizeof(mcFileDesc_); - std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&mcFileDesc_)); + std::copy_n(it, sizeof(rootPid_), reinterpret_cast(&rootPid_)); it += sizeof(rootPid_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - size_t mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + printf("==========> %lld %lld %lld\n", rootPidFd, mcRootFileDescFd, mcFileDesc_); MSCCLPP_CUTHROW( cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); - close(rootPidFd); + // close(rootPidFd); } } From 1dc4e8350e6fdcddc2bbe4d69da83a460912978f Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 16:37:16 -0800 Subject: [PATCH 18/67] wip --- include/mscclpp/core.hpp | 3 ++- src/communicator.cc | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 2b8807221..f7e896771 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,7 +462,6 @@ struct EndpointConfig { int ibMaxWrPerSend = DefaultMaxWrPerSend; size_t nvlsBufferSize; - int nvlsNumDevices; /// Default constructor. Sets transport to Transport::Unknown. EndpointConfig() : transport(Transport::Unknown) {} @@ -663,6 +662,8 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); + std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); + /// Get the remote rank a connection is connected to. /// /// @param connection The connection to get the remote rank for. 
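The connctNvlsCollective declaration above is the collective entry point this series is building toward: every rank in allRanks calls it, the smallest rank acts as the NVLS root, and the root's serialized connection is distributed over the bootstrap network. A later patch in this series fills in the body; its rough shape is sketched here (illustrative; myRank, rootRank, and bootstrap stand in for the communicator's own state, and error handling plus the pre-addDevice synchronization are omitted).

    std::shared_ptr<NvlsConnection> conn;
    if (myRank == rootRank) {
      conn = std::make_shared<NvlsConnection>(config.nvlsBufferSize, allRanks.size());
      auto serialized = conn->serialize();
      for (int r : allRanks)
        if (r != myRank) bootstrap->send(serialized, r, 0 /*tag*/);
    } else {
      std::vector<char> data;
      bootstrap->recv(data, rootRank, 0 /*tag*/);
      conn = std::make_shared<NvlsConnection>(data);
    }
    conn->addDevice();  // each rank attaches its own GPU to the multicast group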
diff --git a/src/communicator.cc b/src/communicator.cc index d2f0e6172..5725a691d 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -105,6 +105,11 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con return NonblockingFuture>(connector->connectionPromise_.get_future()); } +MSCCLPP_API_CPP std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config) { + +} + + MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { return pimpl_->connectionInfos_.at(&connection).remoteRank; } From 847f1d8bf98e459f22933ac5aa01b96154a0374d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 02:46:41 +0000 Subject: [PATCH 19/67] lint --- include/mscclpp/core.hpp | 11 ++++------- python/mscclpp_benchmark/allreduce_bench.py | 12 ++++++------ src/communicator.cc | 21 +++++++++------------ src/connection.cc | 7 ++++--- 4 files changed, 23 insertions(+), 28 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 7c5a19869..cb824f926 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -138,7 +138,7 @@ enum class Transport { }; const std::string TransportNames[] = {"UNK", "IPC", "NVLS", "IB0", "IB1", "IB2", - "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; + "IB3", "IB4", "IB5", "IB6", "IB7", "NUM"}; namespace detail { const size_t TransportFlagsSize = 11; @@ -460,17 +460,15 @@ class NvlsConnection { // Everyone needs to synchronize after creating a NVLS connection before adding devices void addDevice(); void addDevice(int cudaDeviceId); - + void* getMultiCastPointer(); -private: + private: struct Impl; std::unique_ptr pimpl_; }; - - /// Used to configure an endpoint. struct EndpointConfig { static const int DefaultMaxCqSize = 1024; @@ -498,8 +496,7 @@ struct EndpointConfig { /// Constructor for NVLS explicitly /// @param transport must be either NvlsRoot or NvlsNonRoot /// @param nvlsBufferSize is the buffer to be alloced on each device - EndpointConfig(Transport transport, size_t nvlsBufferSize) - : transport(transport), nvlsBufferSize(nvlsBufferSize) {} + EndpointConfig(Transport transport, size_t nvlsBufferSize) : transport(transport), nvlsBufferSize(nvlsBufferSize) {} }; /// Represents a context for communication. 
This provides a low-level interface for forming connections in use-cases diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 9c5e7ca84..5a3987cd3 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -147,12 +147,12 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**29: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - proxy_service.start_proxy() + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + # else: + # proxy_service = ProxyService() + # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + # proxy_service.start_proxy() else: if memory.nbytes < 2**22: proxy_service = ProxyService() @@ -247,7 +247,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 25): + for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/src/communicator.cc b/src/communicator.cc index 8b2a4b75a..9e6e9186e 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -105,31 +105,29 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con return NonblockingFuture>(connector->connectionPromise_.get_future()); } -MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, EndpointConfig config) { +MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, + EndpointConfig config) { auto bootstrap = this->bootstrap(); int myRank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; int rootRank = allRanks[0]; - for (auto nvlsRank : allRanks){ - if (nvlsRank == myRank) - amongAllRanks = true; + for (auto nvlsRank : allRanks) { + if (nvlsRank == myRank) amongAllRanks = true; rootRank = std::min(rootRank, nvlsRank); } - if (amongAllRanks == false){ + if (amongAllRanks == false) { throw Error("my rank is not among allRanks", ErrorCode::InvalidUsage); } - if (rootRank == myRank) - isRoot = true; - + if (rootRank == myRank) isRoot = true; + std::shared_ptr conn; - if (isRoot){ + if (isRoot) { conn = std::make_shared(config, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) - bootstrap->send(serialized, nvlsRank, 0); + if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); } } else { std::vector data; @@ -156,7 +154,6 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti return conn; } - MSCCLPP_API_CPP int Communicator::remoteRankOf(const Connection& connection) { return pimpl_->connectionInfos_.at(&connection).remoteRank; } diff --git a/src/connection.cc b/src/connection.cc index 60436b8a3..c165fb5d5 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -130,15 +130,16 @@ struct NvlsConnection::Impl { int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW( + cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); INFO(MSCCLPP_COLL, 
"NVLS handle was imported from root"); } }; -NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_unique(bufferSize, numDevices)) { -} +NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) + : pimpl_(std::make_unique(bufferSize, numDevices)) {} NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); From 32c15b14a24e4f7ce0fcb7ca96fb0b8d1acc8b93 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 02:53:13 +0000 Subject: [PATCH 20/67] wip --- include/mscclpp/core.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index cb824f926..65346bd02 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include From 01d752b068b62db41b36f1b169fe5c0e99a5ed94 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 18:54:13 -0800 Subject: [PATCH 21/67] wip --- include/mscclpp/core.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index cb824f926..a0aac1ac5 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -680,7 +680,7 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); - std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); + std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. /// From 2631f990e061d6ea02a9ee96bc3ea0decdab002d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 19:01:52 -0800 Subject: [PATCH 22/67] wip --- src/communicator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/communicator.cc b/src/communicator.cc index 9e6e9186e..6c1849aae 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -124,7 +124,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti std::shared_ptr conn; if (isRoot) { - conn = std::make_shared(config, allRanks.size()); + conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); From 97db00cd9c156e91d181b0aa4dbddbabb615c080 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 03:02:10 +0000 Subject: [PATCH 23/67] wip --- src/context.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/context.cc b/src/context.cc index f75473487..d04a8e32c 100644 --- a/src/context.cc +++ b/src/context.cc @@ -49,8 +49,6 @@ MSCCLPP_API_CPP std::shared_ptr Context::connect(Endpoint localEndpo throw mscclpp::Error("Local transport is IB but remote is not", ErrorCode::InvalidUsage); } conn = std::make_shared(localEndpoint, remoteEndpoint, *this); - } else if (AllNvlsTransports.has(localEndpoint.transport()) && AllNvlsTransports.has(remoteEndpoint.transport())) { - conn = std::make_shared(localEndpoint, remoteEndpoint); } else { throw mscclpp::Error("Unsupported transport", ErrorCode::InternalError); } From caf997a9e785fdc5557b3494f086ed8839879481 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 06:31:34 +0000 Subject: [PATCH 24/67] wip --- include/mscclpp/core.hpp | 2 +- src/connection.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 
67fe9020c..936a580bc 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -467,7 +467,7 @@ class NvlsConnection { private: struct Impl; - std::unique_ptr pimpl_; + std::shared_ptr pimpl_; }; /// Used to configure an endpoint. diff --git a/src/connection.cc b/src/connection.cc index c165fb5d5..15b88fa6c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -139,7 +139,7 @@ struct NvlsConnection::Impl { }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) - : pimpl_(std::make_unique(bufferSize, numDevices)) {} + : pimpl_(std::make_shared(bufferSize, numDevices)) {} NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); @@ -154,7 +154,7 @@ NvlsConnection::addDevice(int cudaDeviceId) { INFO(MSCCLPP_COLL, "NVLS connection created"); } -NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(data) {} +NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} std::vector NvlsConnection::serialize() { std::vector result; From 9d5a2628a32eabdb38e0887a9ec0535333b1375d Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 06:44:46 +0000 Subject: [PATCH 25/67] compiles --- include/mscclpp/core.hpp | 2 -- src/connection.cc | 33 +++++++++++---------------------- 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 936a580bc..64520485c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -462,8 +462,6 @@ class NvlsConnection { void addDevice(); void addDevice(int cudaDeviceId); - void* getMultiCastPointer(); - private: struct Impl; diff --git a/src/connection.cc b/src/connection.cc index 15b88fa6c..399430581 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -3,6 +3,10 @@ #include "connection.hpp" +#include +#include + +#include #include #include @@ -126,12 +130,12 @@ struct NvlsConnection::Impl { Impl(const std::vector& data) { auto it = data.begin(); - std::copy_n(it, sizeof(*this), reinterpret_cast(*this)); + std::copy_n(it, sizeof(*this), reinterpret_cast(this)); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - MSCCLPP_CUTHROW( - cuMemImportFromShareableHandle(&mcHandle_, (void*)mcRootFileDescFd, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); @@ -140,7 +144,8 @@ struct NvlsConnection::Impl { NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_shared(bufferSize, numDevices)) {} -NvlsConnection::addDevice() { + +void NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); @@ -148,7 +153,7 @@ NvlsConnection::addDevice() { INFO(MSCCLPP_COLL, "NVLS connection created"); } -NvlsConnection::addDevice(int cudaDeviceId) { +void NvlsConnection::addDevice(int cudaDeviceId) { MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); INFO(MSCCLPP_COLL, "NVLS connection created"); @@ -158,26 +163,10 @@ NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make std::vector NvlsConnection::serialize() { std::vector result; - std::copy_n(reinterpret_cast(pimpl_), sizeof(*pimpl_), std::back_inserter(result)); + 
std::copy_n(reinterpret_cast(pimpl_.get()), sizeof(*pimpl_), std::back_inserter(result)); return result; } -Transport NvlsConnection::transport() { return transport_; } - -Transport NvlsConnection::remoteTransport() { return remoteTransport_; } - -void NvlsConnection::write(RegisteredMemory, uint64_t, RegisteredMemory, uint64_t, uint64_t) { - throw Error("NVLS does not have a CPU write API", ErrorCode::InvalidUsage); -} - -void NvlsConnection::updateAndSync(RegisteredMemory, uint64_t, uint64_t*, uint64_t) { - throw Error("NVLS does not have a CPU updateAndSync API", ErrorCode::InvalidUsage); -} - -void NvlsConnection::flush(int64_t) { throw Error("NVLS does not have a CPU flush API", ErrorCode::InvalidUsage); } - -void* NvlsConnection::getDevicePointer() { return deviceBuffer_; } - // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 87051274333adf91497b8e1adaf3dfe84d7ff1d9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Thu, 18 Jan 2024 23:01:52 -0800 Subject: [PATCH 26/67] wip --- include/mscclpp/core.hpp | 5 +---- python/mscclpp/comm.py | 6 ++++-- python/mscclpp/core_py.cpp | 1 + python/test/test_mscclpp.py | 13 ++++++------- 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 64520485c..183d8c815 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -399,7 +399,6 @@ class Endpoint { friend class Context; friend class Connection; - friend class NvlsConnection; }; /// Represents a connection between two processes. @@ -679,6 +678,7 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); + /// TBD std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. @@ -719,9 +719,6 @@ extern const TransportFlags NoTransports; /// A constant TransportFlags object representing all InfiniBand transports. extern const TransportFlags AllIBTransports; -/// A constant TransportFlags object representing all NVLS transports. -extern const TransportFlags AllNvlsTransports; - /// A constant TransportFlags object representing all transports. 
extern const TransportFlags AllTransports; diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index c01c04a2d..3085cc3df 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -79,10 +79,12 @@ def my_ib_device(self, local_rank: int) -> Transport: assert False # only 8 IBs are supported def make_connection( - self, remote_ranks: list[int], transports: Transport | dict[int, Transport] + self, all_ranks: list[int], transports: Transport | dict[int, Transport] ) -> dict[int, Connection]: + if transports == Transport.Nvls: + return self.communicator.connct_nvls_collective(all_ranks, transports) connections = {} - for rank in remote_ranks: + for rank in all_ranks: if type(transports) is dict: transport = transports[rank] else: diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 12a21fd44..729e8213a 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -169,6 +169,7 @@ void register_core(nb::module_& m) { .def("recv_memory_on_setup", &Communicator::recvMemoryOnSetup, nb::arg("remoteRank"), nb::arg("tag")) .def("connect_on_setup", &Communicator::connectOnSetup, nb::arg("remoteRank"), nb::arg("tag"), nb::arg("localConfig")) + .def("connct_nvls_collective", &Communicator::connctNvlsCollective, nb::arg("allRanks"), nb::arg("config")) .def("remote_rank_of", &Communicator::remoteRankOf) .def("tag_of", &Communicator::tagOf) .def("setup", &Communicator::setup); diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index a99676603..e473bae23 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -121,17 +121,16 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") group = mscclpp_comm.CommGroup(mpi_group.comm) - + if transport == "NVLS": + all_ranks = list(range(mpi_group.comm.size)) + tran = Transport.Nvls + connection = group.make_connection(all_ranks, tran) + return group, connection + remote_nghrs = list(range(mpi_group.comm.size)) remote_nghrs.remove(mpi_group.comm.rank) if transport == "NVLink": tran = Transport.CudaIpc - elif transport == "NVLS": - if mpi_group.comm.rank == 0: - tran = Transport.NvlsRoot - else: - remote_nghrs = [0] - tran = Transport.NvlsNonRoot elif transport == "IB": tran = group.my_ib_device(group.my_rank % 8) else: From 8e1a2dcef6f03a34e23b8b0c600a755ed8ac39f8 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 08:18:49 +0000 Subject: [PATCH 27/67] wip --- python/test/test_mscclpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index e473bae23..9d4456695 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -529,5 +529,5 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u assert cp.array_equal(memory, memory_expected) @parametrize_mpi_groups(2, 4, 8, 16) -def test_simple_proxy_channel(mpi_group: MpiGroup): +def test_nvls(mpi_group: MpiGroup): group, connections = create_and_connect(mpi_group, "NVLS") From 4d8b214c3ef778e13df1e17f2282807be493629f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Fri, 19 Jan 2024 08:54:51 +0000 Subject: [PATCH 28/67] fix --- python/mscclpp/core_py.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 729e8213a..190a90d7b 100644 --- a/python/mscclpp/core_py.cpp +++ 
b/python/mscclpp/core_py.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -125,6 +126,8 @@ void register_core(nb::module_& m) { .def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); + nb::class_(m, "NvlsConnection"); + nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) .def("serialize", &Endpoint::serialize) From 6d31e6289ad742daa066fa6f5c63d38d874aeb18 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Fri, 19 Jan 2024 23:18:03 +0000 Subject: [PATCH 29/67] wip --- include/mscclpp/gpu_utils.hpp | 79 +++++++++++++++++++++++++++++++++++ nvls/test.cu | 10 +++-- python/test/test_mscclpp.py | 2 +- src/connection.cc | 4 ++ 4 files changed, 90 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index e0cd7c3da..986893bab 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,6 +50,12 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; +template +struct PhysicalCudaMemory { + CUmemGenericAllocationHandle memHandle; + T* devicePtr; +}; + namespace detail { /// A wrapper of cudaMalloc that sets the allocated memory to zero. @@ -67,6 +73,42 @@ T* cudaCalloc(size_t nelem) { return ptr; } +template +PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { + AvoidCudaGraphCaptureGuard cgcGuard; + + int deviceId = -1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + + PhysicalCudaMemory* ret = new PhysicalCudaMemory(); + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = deviceId; + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + size_t bufferSize = sizeof(T) * nelem; + // allocate physical memory + MSCCLPP_CUTHROW(cuMemCreate(&ret->memHandle, bufferSize, &prop, 0 /*flags*/)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Map the device pointer + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&ret->devicePtr, bufferSize, gran, 0U, 0)); + MSCCLPP_CUDATHROW(cudaMemset(ret->devicePtr, 0, bufferSize)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)ret->devicePtr, bufferSize, 0, ret->memHandle, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)ret->devicePtr, bufferSize, &accessDesc, 1)); + + CudaStreamWithFlags stream(cudaStreamNonBlocking); + MSCCLPP_CUDATHROW(cudaMemsetAsync(ret->devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); + + return ret; +} + template T* cudaExtCalloc(size_t nelem) { AvoidCudaGraphCaptureGuard cgcGuard; @@ -118,6 +160,20 @@ Memory safeAlloc(size_t nelem) { return Memory(ptr, Deleter()); } +template +Memory safeAlloc(size_t nelem, size_t gran) { + T* ptr = nullptr; + try { + ptr = alloc(nelem, gran); + } catch (...) { + if (ptr) { + Deleter()(ptr); + } + throw; + } + return Memory(ptr, Deleter()); +} + } // namespace detail /// A deleter that calls cudaFree for use with std::unique_ptr or std::shared_ptr. @@ -131,6 +187,16 @@ struct CudaDeleter { } }; +template +struct CudaPhysicalDeleter { + using TPtrOrArray = + std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; + void operator()(TPtrOrArray ptr) { + AvoidCudaGraphCaptureGuard cgcGuard; + // TODO: adding free'ing stuff here + } +}; + /// A deleter that calls cudaFreeHost for use with std::unique_ptr or std::shared_ptr. 
/// @tparam T Type of each element in the allocated memory. template @@ -151,6 +217,13 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); } +/// TODO: docs... +template +std::shared_ptr allocSharedPhysicalCuda(size_t count, size_t gran) { + return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, + std::shared_ptr>>(count, gran); +} + /// Allocates memory on the device and returns a std::shared_ptr to it. The memory is zeroed out. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. @@ -174,6 +247,12 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); } +template +std::shared_ptr allocUniquePhysicalCuda(size_t count, size_t gran) { + return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, + std::unique_ptr, CudaDeleter>>>(count, gran); +} + /// Allocates memory on the device and returns a std::unique_ptr to it. The memory is zeroed out. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. diff --git a/nvls/test.cu b/nvls/test.cu index bfb29c15c..b84f19519 100644 --- a/nvls/test.cu +++ b/nvls/test.cu @@ -135,10 +135,6 @@ int main() { // allocate physical memory (data buffer) CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); - // everyone binds memory to the multicast - CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); - MPI_Barrier(MPI_COMM_WORLD); - // usual VA business: map both MC and PA to two different VA addresses void* uc_va; void* mc_va; CUmemAccessDesc accessDesc = {}; @@ -153,6 +149,12 @@ int main() { // set access on UC address CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); + + // everyone binds memory to the multicast + CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); + MPI_Barrier(MPI_COMM_WORLD); + // usual VA business: map both MC and PA to two different VA addresses + // Map a VA to MC space CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 9d4456695..f558dda48 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -528,6 +528,6 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(2, 4, 8, 16) +@parametrize_mpi_groups(2, 4, 8) def test_nvls(mpi_group: MpiGroup): group, connections = create_and_connect(mpi_group, "NVLS") diff --git a/src/connection.cc b/src/connection.cc index 399430581..a7d1bbf2b 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -123,6 +123,7 @@ struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + // TODO: we need proper throw in here. 
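  // The multicast handle is exported here as a POSIX file descriptor (mcFileDesc_) owned by
  // the root process. Non-root ranks re-create that descriptor inside their own process with
  // pidfd_open + pidfd_getfd (see the importing constructor below) before calling
  // cuMemImportFromShareableHandle; this path needs a kernel with pidfd_getfd support
  // (Linux 5.6 or newer) and ptrace-level permission on the root process.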
rootPid_ = getpid(); INFO(MSCCLPP_COLL, "NVLS handle created on root"); @@ -132,6 +133,7 @@ struct NvlsConnection::Impl { auto it = data.begin(); std::copy_n(it, sizeof(*this), reinterpret_cast(this)); + // TODO: proper throw int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), @@ -140,6 +142,8 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + + // TODO: close all FDs and deallocate all handles. }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) From 3011707d8e4df86f18d18129a4c720f319f30260 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 06:15:50 +0000 Subject: [PATCH 30/67] memalloc added --- include/mscclpp/core.hpp | 15 ++++++++--- include/mscclpp/gpu_utils.hpp | 32 +++++++++++++--------- src/connection.cc | 50 +++++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 18 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 183d8c815..8910a03e4 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -449,18 +450,26 @@ class Connection { }; class NvlsConnection { - CUmemGenericAllocationHandle mcHandle_; - size_t bufferSize_; - public: NvlsConnection(size_t bufferSize, int numDevices); NvlsConnection(const std::vector& data); + NvlsConnection() = delete; + // TODO: Clean up after yourself! + // ~NvlsConnection(); std::vector serialize(); // Everyone needs to synchronize after creating a NVLS connection before adding devices void addDevice(); void addDevice(int cudaDeviceId); + struct DeviceMulticastPointer { + public: + std::shared_ptr devicePtr_; + std::shared_ptr mcPtr_; + }; + + std::shared_ptr allocateAndBindCuda(size_t size); + private: struct Impl; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 986893bab..7099b59aa 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,10 +50,14 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; +template struct CudaDeleter; + template struct PhysicalCudaMemory { - CUmemGenericAllocationHandle memHandle; - T* devicePtr; + CUmemGenericAllocationHandle memHandle_; + std::shared_ptr devicePtr_; + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr) + : memHandle_(memHandle), devicePtr_(std::shared_ptr(devicePtr, CudaDeleter())) {} }; namespace detail { @@ -80,33 +84,35 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { int deviceId = -1; MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - PhysicalCudaMemory* ret = new PhysicalCudaMemory(); CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = deviceId; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + CUmemGenericAllocationHandle memHandle; size_t bufferSize = sizeof(T) * nelem; // allocate physical memory - MSCCLPP_CUTHROW(cuMemCreate(&ret->memHandle, bufferSize, &prop, 0 /*flags*/)); + MSCCLPP_CUTHROW(cuMemCreate(&memHandle, bufferSize, &prop, 0 /*flags*/)); CUmemAccessDesc accessDesc = {}; accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + T* devicePtr; // Map the device pointer - 
MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&ret->devicePtr, bufferSize, gran, 0U, 0)); - MSCCLPP_CUDATHROW(cudaMemset(ret->devicePtr, 0, bufferSize)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)ret->devicePtr, bufferSize, 0, ret->memHandle, 0)); - MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)ret->devicePtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); + MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemsetAsync(ret->devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return ret; + + return new PhysicalCudaMemory(memHandle, devicePtr); } template @@ -160,7 +166,7 @@ Memory safeAlloc(size_t nelem) { return Memory(ptr, Deleter()); } -template +template Memory safeAlloc(size_t nelem, size_t gran) { T* ptr = nullptr; try { @@ -219,7 +225,7 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// TODO: docs... template -std::shared_ptr allocSharedPhysicalCuda(size_t count, size_t gran) { +std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } @@ -248,7 +254,7 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { } template -std::shared_ptr allocUniquePhysicalCuda(size_t count, size_t gran) { +std::shared_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::unique_ptr, CudaDeleter>>>(count, gran); } diff --git a/src/connection.cc b/src/connection.cc index a7d1bbf2b..8d8d86a17 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -107,9 +107,11 @@ struct NvlsConnection::Impl { // These are only defined for multicast (NVLS) capability pid_t rootPid_; int mcFileDesc_; + size_t offset_; + std::vector>> physicalMemoryStorage; // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) { + Impl(size_t bufferSize, int numDevices) : offset_(0) { minMcGran_ = 0; mcGran_ = 0; mcProp_ = {}; @@ -129,7 +131,7 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle created on root"); } - Impl(const std::vector& data) { + Impl(const std::vector& data) : offset_(0) { auto it = data.begin(); std::copy_n(it, sizeof(*this), reinterpret_cast(this)); @@ -143,6 +145,41 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + struct MultiCastDeleter { + void operator()(char* ptr) { + // TODO: do something in here + } + }; + + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { + if (offset_ > bufferSize_) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); + } + if (bufferSize_ - offset_ < devBuffSize) { + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + } + + physicalMemoryStorage.push_back(physicalMem); + + MSCCLPP_CUTHROW( + cuMulticastBindMem(mcHandle_, offset_ /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + + char* mcPtr; + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + int deviceId = 
-1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + offset_ += devBuffSize; + + return std::shared_ptr(mcPtr, MultiCastDeleter()); + } + // TODO: close all FDs and deallocate all handles. }; @@ -171,6 +208,15 @@ std::vector NvlsConnection::serialize() { return result; } +std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { + auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); + auto mcPtr = pimpl_->bindMemory(mem, size); + auto ret = std::make_shared(); + ret->devicePtr_ = mem->devicePtr_; + ret->mcPtr_ = mcPtr; + return ret; +} + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From e449835808193b5b20d7c2767389faadf826ae90 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 09:21:05 +0000 Subject: [PATCH 31/67] looks like it is working --- include/mscclpp/core.hpp | 10 +++++++++- include/mscclpp/gpu_utils.hpp | 8 ++++---- python/mscclpp/core_py.cpp | 14 +++++++++++++- python/test/test_mscclpp.py | 22 +++++++++++++++++++++- src/connection.cc | 12 ++++++++++++ 5 files changed, 59 insertions(+), 7 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 8910a03e4..08746b974 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -463,9 +464,16 @@ class NvlsConnection { void addDevice(int cudaDeviceId); struct DeviceMulticastPointer { - public: + private: std::shared_ptr devicePtr_; std::shared_ptr mcPtr_; + size_t bufferSize_; + + public: + using DeviceHandle = DeviceMulticastPointerDeviceHandle; + DeviceHandle deviceHandle(); + + friend class NvlsConnection; }; std::shared_ptr allocateAndBindCuda(size_t size); diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 7099b59aa..10a307188 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -50,7 +50,8 @@ struct CudaStreamWithFlags { cudaStream_t stream_; }; -template struct CudaDeleter; +template +struct CudaDeleter; template struct PhysicalCudaMemory { @@ -100,18 +101,17 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - T* devicePtr; + T* devicePtr = NULL; // Map the device pointer MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); - MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); + MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); CudaStreamWithFlags stream(cudaStreamNonBlocking); MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return new PhysicalCudaMemory(memHandle, devicePtr); } diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 190a90d7b..a988151ef 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -126,7 +126,19 @@ void register_core(nb::module_& m) { 
.def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); - nb::class_(m, "NvlsConnection"); + nb::class_ deviceMulticastPointer(m, "DeviceMulticastPointer"); + deviceMulticastPointer.def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); + + nb::class_(deviceMulticastPointer, "DeviceHandle") + .def(nb::init<>()) + .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) + .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) + .def_rw("size", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::bufferSize) + .def_prop_ro("raw", [](const NvlsConnection::DeviceMulticastPointer::DeviceHandle& self) -> nb::bytes { + return nb::bytes(reinterpret_cast(&self), sizeof(self)); + }); + + nb::class_(m, "NvlsConnection").def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda); nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index f558dda48..aa63a3d20 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -283,6 +283,7 @@ def __init__( use_packet=False, scratch=None, fifo=None, + nvls_mem_handle=None ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -321,6 +322,12 @@ def __init__( ).get_compiled_kernel() self.nblocks = 1 self.nthreads = 1024 + elif test_name == "nvls": + self._kernel = KernelBuilder( + file="nvls_test.cu", kernel_name="nvls_test", file_dir=file_dir + ).get_compiled_kernel() + self.nblocks = 1 + self.nthreads = 1 else: assert False @@ -349,6 +356,8 @@ def __init__( semaphore_device_handles = [semaphore.device_handle().raw for semaphore in semaphore_or_channels] self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) + elif test_name == "nvls": + self.params = nvls_mem_handle.device_handle().raw + pack(my_rank, nranks) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, None) @@ -530,4 +539,15 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(2, 4, 8) def test_nvls(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "NVLS") + group, connection = create_and_connect(mpi_group, "NVLS") + nelem = 2**29 + mem_handle = connection.allocate_bind_memory(nelem) + + kernel = MscclppKernel( + "nvls", + my_rank=group.my_rank, + nranks=group.nranks, + nvls_mem_handle=mem_handle + ) + + kernel() diff --git a/src/connection.cc b/src/connection.cc index 8d8d86a17..8ca55cc71 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -10,6 +10,7 @@ #include #include +#include "api.h" #include "debug.h" #include "endpoint.hpp" #include "infiniband/verbs.h" @@ -121,6 +122,7 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + bufferSize_ = mcProp_.size; MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; MSCCLPP_CUTHROW( @@ -214,9 +216,19 @@ std::shared_ptr NvlsConnection::allocate auto ret = std::make_shared(); ret->devicePtr_ = mem->devicePtr_; ret->mcPtr_ = mcPtr; + ret->bufferSize_ = size; return 
ret; } +MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle +NvlsConnection::DeviceMulticastPointer::deviceHandle() { + NvlsConnection::DeviceMulticastPointer::DeviceHandle device; + device.devicePtr = this->devicePtr_.get(); + device.mcPtr = this->mcPtr_.get(); + device.bufferSize = this->bufferSize_; + return device; +}; + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 7715776be3bf57174bf4b0cab5d8f689f2640d3c Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sat, 20 Jan 2024 18:36:19 +0000 Subject: [PATCH 32/67] it works --- python/test/test_mscclpp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index aa63a3d20..cccf522e0 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -537,7 +537,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(2, 4, 8) +@parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, connection = create_and_connect(mpi_group, "NVLS") nelem = 2**29 @@ -549,5 +549,7 @@ def test_nvls(mpi_group: MpiGroup): nranks=group.nranks, nvls_mem_handle=mem_handle ) - + kernel() + cp.cuda.runtime.deviceSynchronize() + group.barrier() kernel() From 855b2ee023e2cf4d3614c63a2b3b0dbea93eaa18 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 07:34:57 +0000 Subject: [PATCH 33/67] missing files --- include/mscclpp/gpu_utils.hpp | 11 ++++++--- include/mscclpp/nvls_device.hpp | 18 ++++++++++++++ python/test/nvls_test.cu | 43 +++++++++++++++++++++++++++++++++ python/test/test_mscclpp.py | 2 ++ 4 files changed, 70 insertions(+), 4 deletions(-) create mode 100644 include/mscclpp/nvls_device.hpp create mode 100644 python/test/nvls_test.cu diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 10a307188..1f2280703 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -106,10 +106,9 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)devicePtr, bufferSize, &accessDesc, 1)); - MSCCLPP_CUDATHROW(cudaMemset(devicePtr, 0, bufferSize)); - CudaStreamWithFlags stream(cudaStreamNonBlocking); - MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, nelem * sizeof(T), stream)); + MSCCLPP_CUDATHROW(cudaMemsetAsync(devicePtr, 0, bufferSize, stream)); + MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); return new PhysicalCudaMemory(memHandle, devicePtr); @@ -188,8 +187,10 @@ template struct CudaDeleter { using TPtrOrArray = std::conditional_t, T, T*>; void operator()(TPtrOrArray ptr) { + printf("QQQQQ %p\n", ptr); AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFree(ptr)); + printf("deletedCuda successfully\n"); } }; @@ -199,7 +200,9 @@ struct CudaPhysicalDeleter { std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; - // TODO: adding free'ing stuff here + printf("IIIIIIIIII %p\n", ptr); + delete ptr; + printf("deleted successfully\n"); } }; diff --git a/include/mscclpp/nvls_device.hpp b/include/mscclpp/nvls_device.hpp new file mode 100644 index 000000000..106420e58 --- /dev/null +++ 
b/include/mscclpp/nvls_device.hpp @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef MSCCLPP_NVLS_DEVICE_HPP_ +#define MSCCLPP_NVLS_DEVICE_HPP_ + +namespace mscclpp { + +/// Device-side handle for @ref Host2DeviceSemaphore. +struct DeviceMulticastPointerDeviceHandle { + void* devicePtr; + void* mcPtr; + size_t bufferSize; +}; + +} // namespace mscclpp + +#endif // MSCCLPP_SEMAPHORE_DEVICE_HPP_ diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu new file mode 100644 index 000000000..cbd1a170e --- /dev/null +++ b/python/test/nvls_test.cu @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + + +extern "C" __global__ void __launch_bounds__(1024, 1) + nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { + int tid = threadIdx.x; + int bid = blockIdx.x; + if (tid == 0 && bid == 0) { + float* devPtr = (float*)nvlsPtrs.devicePtr; + devPtr[0] = 3; + devPtr[1] = 4; + devPtr[2] = 5; + devPtr[3] = 6; + __threadfence_system(); + } + if (tid == 0 && bid == 0 && my_rank == 0) { + float* devPtr = (float*)nvlsPtrs.devicePtr; + + float* mcPtr = (float*)nvlsPtrs.mcPtr; + uint4 val; + MULTIMEM_LD(val, mcPtr); + MULTIMEM_ST(val, mcPtr); + __threadfence_system(); + + float tmp = *(float*)&val.x; + + printf("RRR %f %f\n", *devPtr, tmp); + } +} diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index cccf522e0..83d121c70 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -553,3 +553,5 @@ def test_nvls(mpi_group: MpiGroup): cp.cuda.runtime.deviceSynchronize() group.barrier() kernel() + cp.cuda.runtime.deviceSynchronize() + group.barrier() From 1120070ef8911b129399fb81f4914434ecece922 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 07:37:22 +0000 Subject: [PATCH 34/67] back to a working version --- include/mscclpp/gpu_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 1f2280703..f8319549b 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -201,7 +201,7 @@ struct CudaPhysicalDeleter { void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; printf("IIIIIIIIII %p\n", ptr); - delete ptr; + // delete ptr; printf("deleted successfully\n"); } }; From d2d5ec0c2f242309c918aa80340b1078e3563459 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 08:49:17 +0000 Subject: [PATCH 35/67] clean up - wip --- include/mscclpp/gpu_utils.hpp | 26 +++++++++++++++++--------- python/test/nvls_test.cu | 1 - 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index f8319549b..031b08012 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -57,8 +57,14 @@ template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; std::shared_ptr devicePtr_; - PhysicalCudaMemory(CUmemGenericAllocationHandle 
memHandle, T* devicePtr) - : memHandle_(memHandle), devicePtr_(std::shared_ptr(devicePtr, CudaDeleter())) {} + size_t bufferSize_; + // The deallocator for devicePtr will only unmap and free the address range. The physical memory + // deallocation will happen with CudaPhysicalDeleter. + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) + : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); + })) {} }; namespace detail { @@ -111,7 +117,7 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { MSCCLPP_CUDATHROW(cudaStreamSynchronize(stream)); - return new PhysicalCudaMemory(memHandle, devicePtr); + return new PhysicalCudaMemory(memHandle, devicePtr, bufferSize); } template @@ -187,10 +193,8 @@ template struct CudaDeleter { using TPtrOrArray = std::conditional_t, T, T*>; void operator()(TPtrOrArray ptr) { - printf("QQQQQ %p\n", ptr); AvoidCudaGraphCaptureGuard cgcGuard; MSCCLPP_CUDATHROW(cudaFree(ptr)); - printf("deletedCuda successfully\n"); } }; @@ -200,9 +204,8 @@ struct CudaPhysicalDeleter { std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; void operator()(TPtrOrArray ptr) { AvoidCudaGraphCaptureGuard cgcGuard; - printf("IIIIIIIIII %p\n", ptr); - // delete ptr; - printf("deleted successfully\n"); + MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_)); + delete ptr; } }; @@ -226,7 +229,12 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, std::shared_ptr>(count); } -/// TODO: docs... +/// Allocated physical memory on the device and returns a memory handle along with a memory handle for it. +/// The deallocation only happens PhysicalCudaMemory goes out of scope. +/// @tparam T Type of each element in the allocated memory. +/// @param count Number of elements to allocate. +/// @param gran the granularity forof the allocation. +/// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index cbd1a170e..d2f3ada98 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -14,7 +14,6 @@ : "l"(ptr) \ : "memory"); - extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { int tid = threadIdx.x; From 08e077ac21470c1da1ae11f1e9755fbc08126f29 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:08:45 +0000 Subject: [PATCH 36/67] clean up -- wip --- include/mscclpp/gpu_utils.hpp | 2 +- src/connection.cc | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 031b08012..032e375c5 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -56,8 +56,8 @@ struct CudaDeleter; template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; - std::shared_ptr devicePtr_; size_t bufferSize_; + std::shared_ptr devicePtr_; // The deallocator for devicePtr will only unmap and free the address range. The physical memory // deallocation will happen with CudaPhysicalDeleter. 
PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) diff --git a/src/connection.cc b/src/connection.cc index 8ca55cc71..2b19c67cc 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -147,9 +147,17 @@ struct NvlsConnection::Impl { INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } - struct MultiCastDeleter { + struct MultiCastBindDeleter { + CUmemGenericAllocationHandle mcHandle_; + int deviceId_; + size_t offset_; + size_t bufferSize_; + MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) + : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { - // TODO: do something in here + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); } }; @@ -177,9 +185,10 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); offset_ += devBuffSize; - return std::shared_ptr(mcPtr, MultiCastDeleter()); + return std::shared_ptr(mcPtr, deleter); } // TODO: close all FDs and deallocate all handles. From 887790af88061cd3b111ce0a791723aa8ffd7ca9 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:28:35 +0000 Subject: [PATCH 37/67] ok cleaned up --- include/mscclpp/core.hpp | 2 -- src/connection.cc | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 08746b974..f22f9009c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -455,8 +455,6 @@ class NvlsConnection { NvlsConnection(size_t bufferSize, int numDevices); NvlsConnection(const std::vector& data); NvlsConnection() = delete; - // TODO: Clean up after yourself! - // ~NvlsConnection(); std::vector serialize(); // Everyone needs to synchronize after creating a NVLS connection before adding devices diff --git a/src/connection.cc b/src/connection.cc index 2b19c67cc..9a6007b70 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -127,8 +127,11 @@ struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - // TODO: we need proper throw in here. + rootPid_ = getpid(); + if (rootPid_ < 0) { + throw mscclpp::SysError("getpid() failed", errno); + } INFO(MSCCLPP_COLL, "NVLS handle created on root"); } @@ -139,14 +142,25 @@ struct NvlsConnection::Impl { // TODO: proper throw int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + if (rootPidFd < 0) { + throw mscclpp::SysError("pidfd_open() failed", errno); + } int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + if (mcRootFileDescFd < 0) { + throw mscclpp::SysError("pidfd_getfd() failed", errno); + } MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); close(rootPidFd); + close(mcRootFileDescFd); INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); } + ~Impl() { + // we don't need to free multicast handle object according to NCCL. 
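  // (If cleanup were added later, cuMemRelease(mcHandle_) and close(mcFileDesc_) on the root
  // would be the likely candidates; following NCCL, the patch leaves them to process teardown.)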
+ } + struct MultiCastBindDeleter { CUmemGenericAllocationHandle mcHandle_; int deviceId_; @@ -169,6 +183,7 @@ struct NvlsConnection::Impl { throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); } + // keepin a copy physicalMem around so that the user doesn't accidentally get rids of all of them. physicalMemoryStorage.push_back(physicalMem); MSCCLPP_CUTHROW( @@ -190,8 +205,6 @@ struct NvlsConnection::Impl { return std::shared_ptr(mcPtr, deleter); } - - // TODO: close all FDs and deallocate all handles. }; NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) From d570916030df76a17a86a16ae8e92b0104589dc3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 09:55:29 +0000 Subject: [PATCH 38/67] starting to look good --- include/mscclpp/gpu_utils.hpp | 3 +++ python/test/test_mscclpp.py | 8 ++++++-- src/connection.cc | 7 ++++--- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 032e375c5..857a5cea3 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -237,6 +237,9 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { + if (count % gran) { + throw Error("The request allocation size is not divisible by the required granularity", ErrorCode::InvalidUsage); + } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 83d121c70..aa282e9c1 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -540,8 +540,11 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, connection = create_and_connect(mpi_group, "NVLS") - nelem = 2**29 - mem_handle = connection.allocate_bind_memory(nelem) + nbytes = 2**21 + mem_handle = connection.allocate_bind_memory(nbytes) + + nbytes = 2**21 + mem_handle2 = connection.allocate_bind_memory(nbytes) kernel = MscclppKernel( "nvls", @@ -555,3 +558,4 @@ def test_nvls(mpi_group: MpiGroup): kernel() cp.cuda.runtime.deviceSynchronize() group.barrier() + time.sleep(100) diff --git a/src/connection.cc b/src/connection.cc index 9a6007b70..2235f6e4f 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -121,7 +121,7 @@ struct NvlsConnection::Impl { mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - mcProp_.size = ((mcProp_.size + mcGran_ - 1) / mcGran_) * mcGran_; + mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; bufferSize_ = mcProp_.size; MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); mcFileDesc_ = 0; @@ -133,7 +133,8 @@ struct NvlsConnection::Impl { throw mscclpp::SysError("getpid() failed", errno); } - INFO(MSCCLPP_COLL, "NVLS handle created on root"); + INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", + mcProp_.size, minMcGran_, mcGran_); } Impl(const std::vector& data) : offset_(0) { @@ -198,7 +199,7 @@ struct NvlsConnection::Impl { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, offset_)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); offset_ += devBuffSize; From 9d43d4d6c1858e1d259bad9a9415a6175cc43c63 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 10:25:22 +0000 Subject: [PATCH 39/67] wip --- python/test/nvls_test.cu | 4 ++- python/test/test_mscclpp.py | 69 ++++++++++++++++++++----------------- src/connection.cc | 2 ++ 3 files changed, 42 insertions(+), 33 deletions(-) diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index d2f3ada98..20353bdd0 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -2,6 +2,7 @@ // Licensed under the MIT license. #include +#include #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ @@ -15,7 +16,8 @@ : "memory"); extern "C" __global__ void __launch_bounds__(1024, 1) - nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, int my_rank, int nranks) { + nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, + mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { int tid = threadIdx.x; int bid = blockIdx.x; if (tid == 0 && bid == 0) { diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index aa282e9c1..06ef57a7c 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -116,19 +116,15 @@ def init_target(): mpi_group.comm.barrier() - -def create_and_connect(mpi_group: MpiGroup, transport: str): - if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: - pytest.skip("cannot use nvlink/nvls for cross node") - group = mscclpp_comm.CommGroup(mpi_group.comm) +def create_connection(group: mscclpp_comm.CommGroup, transport: str): if transport == "NVLS": - all_ranks = list(range(mpi_group.comm.size)) + all_ranks = list(range(group.nranks)) tran = Transport.Nvls connection = group.make_connection(all_ranks, tran) - return group, connection + return connection - remote_nghrs = list(range(mpi_group.comm.size)) - remote_nghrs.remove(mpi_group.comm.rank) + remote_nghrs = list(range(group.nranks)) + remote_nghrs.remove(group.my_rank) if transport == "NVLink": tran = Transport.CudaIpc elif transport == "IB": @@ -136,20 +132,27 @@ def create_and_connect(mpi_group: MpiGroup, transport: str): else: assert False connections = group.make_connection(remote_nghrs, tran) - return group, connections + return connections + +def create_group_and_connection(mpi_group: MpiGroup, transport: str): + if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvlink/nvls for cross node") + group = mscclpp_comm.CommGroup(mpi_group.comm) + connection = create_connection(group, transport) + return group, connection @parametrize_mpi_groups(2, 4, 8, 16) 
@pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_group_with_connections(mpi_group: MpiGroup, transport: str): - create_and_connect(mpi_group, transport) + create_group_and_connection(mpi_group, transport) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks sizePerRank = nelemPerRank * memory.itemsize @@ -190,7 +193,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, if device == "cpu" and transport == "NVLink": pytest.skip("nvlink doesn't work with host allocated memory") - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) xp = cp if device == "cuda" else np if group.my_rank == 0: memory = xp.random.randn(nelem) @@ -234,7 +237,7 @@ def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "IB") + group, connections = create_group_and_connection(mpi_group, "IB") semaphores = group.make_semaphore(connections, Host2HostSemaphore) for rank in connections: @@ -247,7 +250,7 @@ def test_h2h_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) def test_h2h_semaphores_gil_release(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "IB") + group, connections = create_group_and_connection(mpi_group, "IB") semaphores = group.make_semaphore(connections, Host2HostSemaphore) @@ -283,7 +286,8 @@ def __init__( use_packet=False, scratch=None, fifo=None, - nvls_mem_handle=None + nvls_mem_handle=None, + nvls_buffer_size=None ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -332,7 +336,7 @@ def __init__( assert False self.params = b"" - if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "simple_proxy_channel"]: + if semaphore_or_channels != None: first_arg = next(iter(semaphore_or_channels.values())) size_of_semaphore_or_channels = len(first_arg.device_handle().raw) device_handles = [] @@ -345,6 +349,8 @@ def __init__( device_handles.append(semaphore_or_channels[rank].device_handle().raw) # keep a reference to the device handles so that they don't get garbage collected self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(device_handles)), dtype=cp.uint8) + + if test_name in ["h2d_semaphore", "d2d_semaphore", "sm_channel", "simple_proxy_channel"]: self.params += pack(self._d_semaphore_or_channels, my_rank, nranks) if test_name == "sm_channel": self.params += pack(tensor.size, use_packet) @@ -357,7 +363,7 @@ def __init__( self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": - self.params = nvls_mem_handle.device_handle().raw + pack(my_rank, nranks) + self.params = nvls_mem_handle.device_handle().raw + pack(self._d_semaphore_or_channels) + pack(my_rank, nranks, nvls_buffer_size) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, 
self.nthreads, 0, None) @@ -370,7 +376,7 @@ def signal(semaphores): for rank in semaphores: semaphores[rank].signal() - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) semaphores = group.make_semaphore(connections, Host2DeviceSemaphore) kernel = MscclppKernel("h2d_semaphore", group.my_rank, group.nranks, semaphores) @@ -386,7 +392,7 @@ def signal(semaphores): @parametrize_mpi_groups(2, 4, 8, 16) def test_d2d_semaphores(mpi_group: MpiGroup): - group, connections = create_and_connect(mpi_group, "NVLink") + group, connections = create_group_and_connection(mpi_group, "NVLink") semaphores = group.make_semaphore(connections, SmDevice2DeviceSemaphore) group.barrier() @@ -400,7 +406,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): - group, connections = create_and_connect(mpi_group, "NVLink") + group, connections = create_group_and_connection(mpi_group, "NVLink") memory = cp.zeros(nelem, dtype=cp.int32) if use_packet: @@ -448,7 +454,7 @@ def test_fifo( @pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) nelemPerRank = nelem // group.nranks @@ -498,7 +504,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): - group, connections = create_and_connect(mpi_group, transport) + group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) if use_packet: @@ -539,23 +545,22 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): - group, connection = create_and_connect(mpi_group, "NVLS") + group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") nbytes = 2**21 - mem_handle = connection.allocate_bind_memory(nbytes) + mem_handle = nvls_connection.allocate_bind_memory(nbytes) - nbytes = 2**21 - mem_handle2 = connection.allocate_bind_memory(nbytes) + nvlinks_connections = create_connection(group, "NVLink") + semaphores = group.make_semaphore(nvlinks_connections, SmDevice2DeviceSemaphore) kernel = MscclppKernel( "nvls", my_rank=group.my_rank, nranks=group.nranks, - nvls_mem_handle=mem_handle + nvls_mem_handle=mem_handle, + nvls_buffer_size=nbytes, + semaphore_or_channels=semaphores ) + kernel() cp.cuda.runtime.deviceSynchronize() group.barrier() - kernel() - cp.cuda.runtime.deviceSynchronize() - group.barrier() - time.sleep(100) diff --git a/src/connection.cc b/src/connection.cc index 2235f6e4f..0fe8b228c 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -173,6 +173,8 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); + + INFO(MSCCLPP_COLL, "NVLS unbound pointer 
%p.", ptr); } }; From d3f4243201feb4f4945b9086ea799a5e037ee4dd Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 10:52:19 +0000 Subject: [PATCH 40/67] correctness check passes! --- python/test/nvls_test.cu | 60 +++++++++++++++++++++++++++++-------- python/test/test_mscclpp.py | 4 +-- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 20353bdd0..6f648b1f6 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -3,6 +3,10 @@ #include #include +#include +#include + +__device__ mscclpp::DeviceSyncer deviceSyncer; #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ @@ -18,27 +22,57 @@ extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { + + int nelem = nbytes/sizeof(float); + float* dev_ptr = (float*)nvlsPtrs.devicePtr; + float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; int bid = blockIdx.x; - if (tid == 0 && bid == 0) { - float* devPtr = (float*)nvlsPtrs.devicePtr; - devPtr[0] = 3; - devPtr[1] = 4; - devPtr[2] = 5; - devPtr[3] = 6; + + for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + dev_ptr[idx] = my_rank; + } + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0){ __threadfence_system(); } - if (tid == 0 && bid == 0 && my_rank == 0) { - float* devPtr = (float*)nvlsPtrs.devicePtr; - float* mcPtr = (float*)nvlsPtrs.mcPtr; + if (bid == 0){ + if (tid < nranks && tid != my_rank) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); + + int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; + int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; + + int my_offset = (tid + bid * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mcPtr); - MULTIMEM_ST(val, mcPtr); + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } + + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0){ __threadfence_system(); + } - float tmp = *(float*)&val.x; + if (bid == 0){ + if (tid < nranks && tid != my_rank) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); - printf("RRR %f %f\n", *devPtr, tmp); + for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + if (dev_ptr[idx] != ((nranks * (nranks-1))/2)){ + __assert_fail("dev_ptr[idx] != nranks", __FILE__, __LINE__, __PRETTY_FUNCTION__); + } } } diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 06ef57a7c..e7437c7ec 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -330,8 +330,8 @@ def __init__( self._kernel = KernelBuilder( file="nvls_test.cu", kernel_name="nvls_test", file_dir=file_dir ).get_compiled_kernel() - self.nblocks = 1 - self.nthreads = 1 + self.nblocks = 64 + self.nthreads = 1024 else: assert False From d30f557145fece4cbf69db21d03a383cb84bd6c3 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Sun, 21 Jan 2024 11:52:19 +0000 Subject: [PATCH 41/67] debugging --- include/mscclpp/gpu_utils.hpp | 5 +- python/mscclpp_benchmark/allreduce.cu | 64 +++++++++++++++++++ python/mscclpp_benchmark/allreduce_bench.py | 13 ++-- 
python/mscclpp_benchmark/mscclpp_op.py | 70 ++++++++++++++++++++- python/test/nvls_test.cu | 21 +++---- src/connection.cc | 7 ++- 6 files changed, 157 insertions(+), 23 deletions(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 857a5cea3..7580771f9 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -62,8 +62,9 @@ struct PhysicalCudaMemory { // deallocation will happen with CudaPhysicalDeleter. PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); + // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); + // printf("MMMMMMMMM %p\n", ptr); + // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); })) {} }; diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index e86047283..f7045878a 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -775,3 +776,66 @@ extern "C" __global__ void __launch_bounds__(1024, 1) globalFlag += 1; } } + +// ------------------------------------------- +// AllReduce6 +// NVLS +// ------------------------------------------- + +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); + +extern "C" __global__ void __launch_bounds__(1024, 1) + allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, + mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, TYPE* buff, int my_rank, int nranks, + size_t nbytes) { + int nelem = nbytes / sizeof(float); + float* dev_ptr = (float*)nvlsPtrs.devicePtr; + float* mc_ptr = (float*)nvlsPtrs.mcPtr; + int tid = threadIdx.x; + int bid = blockIdx.x; + + if (tid == 0 && bid == 0) { + __threadfence_system(); + } + if (bid == 0) { + if (tid < nranks - 1) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); + + int my_st = ((int64_t)nelem * (int64_t)my_rank) / (int64_t)nranks; + int my_en = ((int64_t)nelem * (int64_t)(my_rank + 1)) / (int64_t)nranks; + + int my_offset = (tid + bid * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } + + deviceSyncer.sync(gridDim.x); + if (tid == 0 && bid == 0) { + __threadfence_system(); + } + + if (bid == 0) { + if (tid < nranks - 1) { + semaphores[tid].signal(); + semaphores[tid].wait(); + } + } + deviceSyncer.sync(gridDim.x); +} \ No newline at end of file diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 5a3987cd3..bebbf1c47 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
import cupy as cp -from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5 +from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5, MscclppAllReduce6 from nccl_op import NcclAllReduce from mpi4py import MPI import cupy.cuda.nccl as nccl @@ -145,10 +145,11 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - if memory.nbytes < 2**20: - mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - else: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + mscclpp_call = MscclppAllReduce6(mscclpp_group, memory) + # if memory.nbytes < 2**20: + # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) + # else: + # mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) # else: # proxy_service = ProxyService() # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) @@ -171,7 +172,7 @@ def run_benchmark( memory_nbytes = memory.nbytes mscclpp_time = bench_time(niter, mscclpp_call) mscclpp_algBw = memory_nbytes / mscclpp_time / 1e3 - mscclpp_check = "PASS" if check_correctness(memory, mscclpp_call) else "FAIL" + mscclpp_check = "PASS" #if check_correctness(memory, mscclpp_call) else "FAIL" nccl_time = bench_time(niter, nccl_call) nccl_algBw = memory_nbytes / nccl_time / 1e3 diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index ab51f7c84..662fc5bc0 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -1,7 +1,7 @@ import os import cupy as cp import ctypes -from mscclpp import Transport, ProxyService +from mscclpp import Transport, ProxyService, SmDevice2DeviceSemaphore import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack @@ -418,3 +418,71 @@ def auto_tune(self): for block_size in block_size_to_try: self.set_params(nblocks, block_size) yield nblocks, block_size + + + +class MscclppAllReduce6: + def __init__( + self, + group: mscclpp_comm.CommGroup, + memory: cp.ndarray, + block_size: int = 1024, + nblocks: int = 32, + ): + self.group = group + self.memory = memory + type_str = type_to_str(memory.dtype) + all_ranks = list(range(group.nranks)) + remote_nghrs = all_ranks.copy() + remote_nghrs.remove(self.group.my_rank) + + self.group.barrier() + # create a connection for each remote neighbor + self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) + self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls) + self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(2**29) # just using recommended size for now + + # create a sm_channel for each remote neighbor + self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) + file_dir = os.path.dirname(os.path.abspath(__file__)) + self.kernel = KernelBuilder( + file="allreduce.cu", + kernel_name="allreduce6", + file_dir=file_dir, + macro_dict={"TYPE": type_str}, + ).get_compiled_kernel() + self.device_handles = [] + for rank in range(self.group.nranks): + if rank != self.group.my_rank: + self.device_handles.append(self.semaphores[rank].device_handle().raw) + + self.device_handles_cp = cp.asarray(memoryview(b"".join(self.device_handles)), dtype=cp.uint8) + self.nvls_handle = self.nvls_mem_handle.device_handle().raw + + self.set_params(nblocks, block_size) + + def __call__(self, stream_ptr): + self.kernel.launch_kernel(self.params, self.nblocks, 
self.block_size, 0, stream_ptr) + return self.memory + + def set_params(self, nblocks, block_size): + self.nblocks = nblocks + self.block_size = block_size + self.params = b"" + self.params += pack( + self.device_handles_cp, + self.nvls_handle, + # self.memory, + self.group.my_rank, + self.group.nranks, + # ctypes.c_size_t(self.memory.size), + ) + + def auto_tune(self): + nblocks_to_try = [8, 12, 16, 24, 32, 48, 64, 72, 96, 108] + block_size_to_try = [256, 512, 1024] + for nblocks in nblocks_to_try: + for block_size in block_size_to_try: + self.set_params(nblocks, block_size) + yield nblocks, block_size + diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 6f648b1f6..5001072ac 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include -#include #include +#include #include +#include __device__ mscclpp::DeviceSyncer deviceSyncer; @@ -22,22 +22,21 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { - - int nelem = nbytes/sizeof(float); + int nelem = nbytes / sizeof(float); float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; int bid = blockIdx.x; - for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ + for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { dev_ptr[idx] = my_rank; } deviceSyncer.sync(gridDim.x); - if (tid == 0 && bid == 0){ + if (tid == 0 && bid == 0) { __threadfence_system(); } - if (bid == 0){ + if (bid == 0) { if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); semaphores[tid].wait(); @@ -58,11 +57,11 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } deviceSyncer.sync(gridDim.x); - if (tid == 0 && bid == 0){ + if (tid == 0 && bid == 0) { __threadfence_system(); } - if (bid == 0){ + if (bid == 0) { if (tid < nranks && tid != my_rank) { semaphores[tid].signal(); semaphores[tid].wait(); @@ -70,8 +69,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } deviceSyncer.sync(gridDim.x); - for (int idx = bid*blockDim.x+tid; idx < nelem; idx += blockDim.x*gridDim.x){ - if (dev_ptr[idx] != ((nranks * (nranks-1))/2)){ + for (int idx = bid * blockDim.x + tid; idx < nelem; idx += blockDim.x * gridDim.x) { + if (dev_ptr[idx] != ((nranks * (nranks - 1)) / 2)) { __assert_fail("dev_ptr[idx] != nranks", __FILE__, __LINE__, __PRETTY_FUNCTION__); } } diff --git a/src/connection.cc b/src/connection.cc index 0fe8b228c..ffb249920 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -170,9 +170,10 @@ struct NvlsConnection::Impl { MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); + // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); + // printf("NNNNNN %p\n", ptr); + // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); + // MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); 
INFO(MSCCLPP_COLL, "NVLS unbound pointer %p.", ptr); } From 572b30b3c7f7308a94a6fce572aca6cac1b40c66 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 12:59:01 +0000 Subject: [PATCH 42/67] update --- include/mscclpp/core.hpp | 1 + include/mscclpp/gpu_utils.hpp | 22 ++++++++-------------- src/bootstrap/bootstrap.cc | 14 ++++++++++++++ src/communicator.cc | 19 ++++++------------- src/connection.cc | 10 +++++----- 5 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index f22f9009c..91c82fe3b 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -43,6 +43,7 @@ class Bootstrap { virtual void allGather(void* allData, int size) = 0; virtual void barrier() = 0; + void groupBarrier(const std::vector& ranks); void send(const std::vector& data, int peer, int tag); void recv(std::vector& data, int peer, int tag); }; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 7580771f9..792e754ef 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -56,16 +56,10 @@ struct CudaDeleter; template struct PhysicalCudaMemory { CUmemGenericAllocationHandle memHandle_; - size_t bufferSize_; - std::shared_ptr devicePtr_; - // The deallocator for devicePtr will only unmap and free the address range. The physical memory - // deallocation will happen with CudaPhysicalDeleter. - PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t bufferSize) - : memHandle_(memHandle), bufferSize_(bufferSize), devicePtr_(std::shared_ptr(devicePtr, [this](T* ptr) { - // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, this->bufferSize_)); - // printf("MMMMMMMMM %p\n", ptr); - // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, this->bufferSize_)); - })) {} + T* devicePtr_; + size_t size_; + PhysicalCudaMemory(CUmemGenericAllocationHandle memHandle, T* devicePtr, size_t size) + : memHandle_(memHandle), devicePtr_(devicePtr), size_(size) {} }; namespace detail { @@ -201,12 +195,12 @@ struct CudaDeleter { template struct CudaPhysicalDeleter { - using TPtrOrArray = - std::conditional_t>, PhysicalCudaMemory, PhysicalCudaMemory*>; - void operator()(TPtrOrArray ptr) { + static_assert(!std::is_array_v, "T must not be an array"); + void operator()(PhysicalCudaMemory* ptr) { AvoidCudaGraphCaptureGuard cgcGuard; + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr->devicePtr_, ptr->size_)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr->devicePtr_, ptr->size_)); MSCCLPP_CUTHROW(cuMemRelease(ptr->memHandle_)); - delete ptr; } }; diff --git a/src/bootstrap/bootstrap.cc b/src/bootstrap/bootstrap.cc index 649a1f62e..00a58b992 100644 --- a/src/bootstrap/bootstrap.cc +++ b/src/bootstrap/bootstrap.cc @@ -35,6 +35,20 @@ struct ExtInfo { SocketAddress extAddressListen; }; +MSCCLPP_API_CPP void Bootstrap::groupBarrier(const std::vector& ranks) { + int dummy = 0; + for (auto rank : ranks) { + if (rank != this->getRank()) { + this->send(static_cast(&dummy), sizeof(dummy), rank, 0); + } + } + for (auto rank : ranks) { + if (rank != this->getRank()) { + this->recv(static_cast(&dummy), sizeof(dummy), rank, 0); + } + } +} + MSCCLPP_API_CPP void Bootstrap::send(const std::vector& data, int peer, int tag) { size_t size = data.size(); send((void*)&size, sizeof(size_t), peer, tag); diff --git a/src/communicator.cc b/src/communicator.cc index 6c1849aae..d5c3e9ed4 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -3,6 +3,8 @@ #include "communicator.hpp" +#include + #include "api.h" 
#include "debug.h" @@ -111,6 +113,7 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti int myRank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; + std::sort(allRanks.begin(), allRanks.end()); int rootRank = allRanks[0]; for (auto nvlsRank : allRanks) { if (nvlsRank == myRank) amongAllRanks = true; @@ -122,7 +125,6 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti if (rootRank == myRank) isRoot = true; std::shared_ptr conn; - if (isRoot) { conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); @@ -136,21 +138,12 @@ MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollecti } // Now let's synchronize all ranks - int dummy = 0; - for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) { - bootstrap->send(static_cast(&dummy), sizeof(dummy), nvlsRank, 0); - } - } - for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) { - bootstrap->recv(static_cast(&dummy), sizeof(dummy), nvlsRank, 0); - } - } - + bootstrap->groupBarrier(allRanks); // now it is safe to add my device conn->addDevice(); + // sync here to make sure all ranks have added their devices + bootstrap->groupBarrier(allRanks); return conn; } diff --git a/src/connection.cc b/src/connection.cc index ffb249920..fce7404ed 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -167,6 +167,7 @@ struct NvlsConnection::Impl { int deviceId_; size_t offset_; size_t bufferSize_; + MultiCastBindDeleter() = default; MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} void operator()(char* ptr) { @@ -217,14 +218,11 @@ NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) void NvlsConnection::addDevice() { int cudaDeviceId; MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - - INFO(MSCCLPP_COLL, "NVLS connection created"); + this->addDevice(cudaDeviceId); } void NvlsConnection::addDevice(int cudaDeviceId) { MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - INFO(MSCCLPP_COLL, "NVLS connection created"); } @@ -236,11 +234,13 @@ std::vector NvlsConnection::serialize() { return result; } +// TODO: we need to atuo delete the memory we multicast pointer is no used anymore std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); auto mcPtr = pimpl_->bindMemory(mem, size); auto ret = std::make_shared(); - ret->devicePtr_ = mem->devicePtr_; + // hack, need to update + ret->devicePtr_ = std::shared_ptr(mem->devicePtr_, NvlsConnection::Impl::MultiCastBindDeleter()); ret->mcPtr_ = mcPtr; ret->bufferSize_ = size; return ret; From 8a710abd69396b24071f925dd976b20c1cb169f8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 14:26:27 +0000 Subject: [PATCH 43/67] update --- include/mscclpp/core.hpp | 5 ++++- src/connection.cc | 35 +++++++++-------------------------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 91c82fe3b..589dcf30c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -464,12 +464,15 @@ class NvlsConnection { struct DeviceMulticastPointer { private: - std::shared_ptr devicePtr_; + std::shared_ptr> deviceMem_; std::shared_ptr mcPtr_; size_t bufferSize_; public: using DeviceHandle = 
DeviceMulticastPointerDeviceHandle; + DeviceMulticastPointer(std::shared_ptr> deviceMem, std::shared_ptr mcPtr, + size_t bufferSize) + : deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {} DeviceHandle deviceHandle(); friend class NvlsConnection; diff --git a/src/connection.cc b/src/connection.cc index fce7404ed..2933b0659 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -162,24 +162,6 @@ struct NvlsConnection::Impl { // we don't need to free multicast handle object according to NCCL. } - struct MultiCastBindDeleter { - CUmemGenericAllocationHandle mcHandle_; - int deviceId_; - size_t offset_; - size_t bufferSize_; - MultiCastBindDeleter() = default; - MultiCastBindDeleter(CUmemGenericAllocationHandle mcHandle, int deviceId, size_t offset, size_t bufferSize) - : mcHandle_(mcHandle), deviceId_(deviceId), offset_(offset), bufferSize_(bufferSize) {} - void operator()(char* ptr) { - // MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, bufferSize_)); - // printf("NNNNNN %p\n", ptr); - // MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, bufferSize_)); - // MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId_, offset_, bufferSize_)); - - INFO(MSCCLPP_COLL, "NVLS unbound pointer %p.", ptr); - } - }; - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { if (offset_ > bufferSize_) { throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); @@ -205,7 +187,13 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - MultiCastBindDeleter deleter(mcHandle_, deviceId, offset_, devBuffSize); + + // Is this enough? 
Or we should update the offset as well + auto deleter = [=](char* ptr) { + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId, offset_, devBuffSize)); + }; offset_ += devBuffSize; return std::shared_ptr(mcPtr, deleter); @@ -238,18 +226,13 @@ std::vector NvlsConnection::serialize() { std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); auto mcPtr = pimpl_->bindMemory(mem, size); - auto ret = std::make_shared(); - // hack, need to update - ret->devicePtr_ = std::shared_ptr(mem->devicePtr_, NvlsConnection::Impl::MultiCastBindDeleter()); - ret->mcPtr_ = mcPtr; - ret->bufferSize_ = size; - return ret; + return std::make_shared(mem, mcPtr, size); } MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { NvlsConnection::DeviceMulticastPointer::DeviceHandle device; - device.devicePtr = this->devicePtr_.get(); + device.devicePtr = this->deviceMem_->devicePtr_; device.mcPtr = this->mcPtr_.get(); device.bufferSize = this->bufferSize_; return device; From 2aeae96e2697dc5036d4a99bb76d406951a9bc11 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Sun, 21 Jan 2024 15:07:54 +0000 Subject: [PATCH 44/67] fix --- src/connection.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 2933b0659..24b043396 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -189,10 +189,12 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); // Is this enough? Or we should update the offset as well - auto deleter = [=](char* ptr) { + auto deleter = [=, bindOffset = offset_](char* ptr) { + CUdevice device; + MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, deviceId, offset_, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, bindOffset, devBuffSize)); }; offset_ += devBuffSize; From 43d60ae5ec2b929ebbaf940be8c95c80ec8c7c23 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 03:10:49 +0000 Subject: [PATCH 45/67] fix benchmark --- python/mscclpp_benchmark/allreduce.cu | 3 +-- python/mscclpp_benchmark/allreduce_bench.py | 2 +- python/mscclpp_benchmark/mscclpp_op.py | 5 ++--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index f7045878a..56aeb572b 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -796,8 +796,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) extern "C" __global__ void __launch_bounds__(1024, 1) allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, TYPE* buff, int my_rank, int nranks, - size_t nbytes) { - int nelem = nbytes / sizeof(float); + size_t nelem) { float* dev_ptr = (float*)nvlsPtrs.devicePtr; float* mc_ptr = (float*)nvlsPtrs.mcPtr; int tid = threadIdx.x; diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index bebbf1c47..aaea7bc71 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py 
@@ -248,7 +248,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 28): + for i in range(10, 21): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index 662fc5bc0..c5e81c3c4 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -472,10 +472,10 @@ def set_params(self, nblocks, block_size): self.params += pack( self.device_handles_cp, self.nvls_handle, - # self.memory, + self.memory, self.group.my_rank, self.group.nranks, - # ctypes.c_size_t(self.memory.size), + ctypes.c_size_t(self.memory.size), ) def auto_tune(self): @@ -485,4 +485,3 @@ def auto_tune(self): for block_size in block_size_to_try: self.set_params(nblocks, block_size) yield nblocks, block_size - From a9e274b44b18b15dd937ac2537088a6a7268707c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 22 Jan 2024 04:54:11 +0000 Subject: [PATCH 46/67] a bit clean up --- python/mscclpp/core_py.cpp | 6 +++--- python/mscclpp_benchmark/allreduce_bench.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index a988151ef..452f95b21 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -126,10 +126,10 @@ void register_core(nb::module_& m) { .def("transport", &Connection::transport) .def("remote_transport", &Connection::remoteTransport); - nb::class_ deviceMulticastPointer(m, "DeviceMulticastPointer"); - deviceMulticastPointer.def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); + nb::class_(m, "DeviceMulticastPointer") + .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); - nb::class_(deviceMulticastPointer, "DeviceHandle") + nb::class_(m, "DeviceHandle") .def(nb::init<>()) .def_rw("devicePtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::devicePtr) .def_rw("mcPtr", &NvlsConnection::DeviceMulticastPointer::DeviceHandle::mcPtr) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index aaea7bc71..bebbf1c47 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -248,7 +248,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 21): + for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: From c1ebc3e197da8d72662244ad70f473adc9d03426 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:18:27 +0000 Subject: [PATCH 47/67] correctness works now --- include/mscclpp/core.hpp | 3 +++ include/mscclpp/gpu_utils.hpp | 4 ++- python/mscclpp/core_py.cpp | 6 ++++- python/mscclpp_benchmark/allreduce_bench.py | 22 +++++++++++------ python/mscclpp_benchmark/mscclpp_op.py | 22 +++++++++++++---- python/test/test_mscclpp.py | 27 +++++++++++++-------- src/connection.cc | 8 +++--- 7 files changed, 65 insertions(+), 27 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 589dcf30c..d7e6467b0 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -474,12 +474,15 @@ class NvlsConnection { size_t bufferSize) : deviceMem_(deviceMem), mcPtr_(mcPtr), bufferSize_(bufferSize) {} DeviceHandle deviceHandle(); + char* getDevicePtr(); friend class NvlsConnection; }; std::shared_ptr 
allocateAndBindCuda(size_t size); + size_t getMultiCastMinGranularity(); + private: struct Impl; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 792e754ef..b3b8ec7bb 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -233,7 +233,9 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { if (count % gran) { - throw Error("The request allocation size is not divisible by the required granularity", ErrorCode::InvalidUsage); + throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(count) + + " vs " + std::to_string(gran), + ErrorCode::InvalidUsage); } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 452f95b21..996cd3d99 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -127,6 +127,8 @@ void register_core(nb::module_& m) { .def("remote_transport", &Connection::remoteTransport); nb::class_(m, "DeviceMulticastPointer") + .def("get_device_ptr", + [](NvlsConnection::DeviceMulticastPointer* self) { return (uintptr_t)self->getDevicePtr(); }) .def("device_handle", &NvlsConnection::DeviceMulticastPointer::deviceHandle); nb::class_(m, "DeviceHandle") @@ -138,7 +140,9 @@ void register_core(nb::module_& m) { return nb::bytes(reinterpret_cast(&self), sizeof(self)); }); - nb::class_(m, "NvlsConnection").def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda); + nb::class_(m, "NvlsConnection") + .def("allocate_bind_memory", &NvlsConnection::allocateAndBindCuda) + .def("get_multicast_min_granularity", &NvlsConnection::getMultiCastMinGranularity); nb::class_(m, "Endpoint") .def("transport", &Endpoint::transport) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index bebbf1c47..a8faa7a68 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -2,7 +2,14 @@ # Licensed under the MIT license. 
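# Sketch of how the new NvlsConnection bindings are meant to be used (this mirrors MscclppAllReduce6
# in mscclpp_op.py further down; nvls_connection, nbytes, nelem, and dtype are assumed to exist):
# round the requested size up to the multicast minimum granularity before allocate_bind_memory,
# then wrap the returned raw device pointer as a cupy array.
#   min_gran = nvls_connection.get_multicast_min_granularity()
#   aligned = ((nbytes + min_gran - 1) // min_gran) * min_gran
#   handle = nvls_connection.allocate_bind_memory(aligned)
#   memory = cp.ndarray(
#       nelem, dtype, cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(handle.get_device_ptr(), aligned, None), 0)
#   )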
import cupy as cp -from mscclpp_op import MscclppAllReduce1, MscclppAllReduce2, MscclppAllReduce3, MscclppAllReduce4, MscclppAllReduce5, MscclppAllReduce6 +from mscclpp_op import ( + MscclppAllReduce1, + MscclppAllReduce2, + MscclppAllReduce3, + MscclppAllReduce4, + MscclppAllReduce5, + MscclppAllReduce6, +) from nccl_op import NcclAllReduce from mpi4py import MPI import cupy.cuda.nccl as nccl @@ -145,7 +152,8 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - mscclpp_call = MscclppAllReduce6(mscclpp_group, memory) + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() # if memory.nbytes < 2**20: # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) # else: @@ -155,7 +163,7 @@ def run_benchmark( # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) # proxy_service.start_proxy() else: - if memory.nbytes < 2**22: + if memory.nbytes < 2 ** 22: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() @@ -172,7 +180,7 @@ def run_benchmark( memory_nbytes = memory.nbytes mscclpp_time = bench_time(niter, mscclpp_call) mscclpp_algBw = memory_nbytes / mscclpp_time / 1e3 - mscclpp_check = "PASS" #if check_correctness(memory, mscclpp_call) else "FAIL" + mscclpp_check = "PASS" if check_correctness(memory, mscclpp_call) else "FAIL" nccl_time = bench_time(niter, nccl_call) nccl_algBw = memory_nbytes / nccl_time / 1e3 @@ -250,13 +258,13 @@ def run_benchmark( speed_ups = [] for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - nelems = 2**i + nelems = 2 ** i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: - nelems = 3 * 2**i + nelems = 3 * 2 ** i else: raise RuntimeError("Only support one node/two nodes communication") - if nelems * data_type().itemsize > 2**32: + if nelems * data_type().itemsize > 2 ** 32: break # due to trigger bit width limitation, we can only support up to 2**32 size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) diff --git a/python/mscclpp_benchmark/mscclpp_op.py b/python/mscclpp_benchmark/mscclpp_op.py index c5e81c3c4..f36cc5f61 100644 --- a/python/mscclpp_benchmark/mscclpp_op.py +++ b/python/mscclpp_benchmark/mscclpp_op.py @@ -420,18 +420,19 @@ def auto_tune(self): yield nblocks, block_size - class MscclppAllReduce6: def __init__( self, group: mscclpp_comm.CommGroup, - memory: cp.ndarray, + nelem: int, + memory_dtype: cp.dtype, block_size: int = 1024, nblocks: int = 32, ): self.group = group - self.memory = memory - type_str = type_to_str(memory.dtype) + datatype_size = memory_dtype().itemsize + buffer_size = nelem * datatype_size + type_str = type_to_str(memory_dtype) all_ranks = list(range(group.nranks)) remote_nghrs = all_ranks.copy() remote_nghrs.remove(self.group.my_rank) @@ -440,7 +441,15 @@ def __init__( # create a connection for each remote neighbor self.nvlink_connections = self.group.make_connection(remote_nghrs, Transport.CudaIpc) self.nvls_connection = group.make_connection(all_ranks, Transport.Nvls) - self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory(2**29) # just using recommended size for now + min_gran = self.nvls_connection.get_multicast_min_granularity() + aligned_buffer_size = int(((buffer_size + min_gran - 1) // min_gran) * min_gran) + self.nvls_mem_handle = self.nvls_connection.allocate_bind_memory( + aligned_buffer_size + ) # just using recommended 
size for now + self.memory_ptr = self.nvls_mem_handle.get_device_ptr() + + self.cp_memory_ptr = cp.cuda.MemoryPointer(cp.cuda.UnownedMemory(self.memory_ptr, aligned_buffer_size, None), 0) + self.memory = cp.ndarray(nelem, memory_dtype, self.cp_memory_ptr) # create a sm_channel for each remote neighbor self.semaphores = group.make_semaphore(self.nvlink_connections, SmDevice2DeviceSemaphore) @@ -461,6 +470,9 @@ def __init__( self.set_params(nblocks, block_size) + def get_memory(self): + return self.memory + def __call__(self, stream_ptr): self.kernel.launch_kernel(self.params, self.nblocks, self.block_size, 0, stream_ptr) return self.memory diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index e7437c7ec..45f11574c 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -116,13 +116,14 @@ def init_target(): mpi_group.comm.barrier() + def create_connection(group: mscclpp_comm.CommGroup, transport: str): if transport == "NVLS": all_ranks = list(range(group.nranks)) tran = Transport.Nvls connection = group.make_connection(all_ranks, tran) return connection - + remote_nghrs = list(range(group.nranks)) remote_nghrs.remove(group.my_rank) if transport == "NVLink": @@ -134,6 +135,7 @@ def create_connection(group: mscclpp_comm.CommGroup, transport: str): connections = group.make_connection(remote_nghrs, tran) return connections + def create_group_and_connection(mpi_group: MpiGroup, transport: str): if (transport == "NVLink" or transport == "NVLS") and all_ranks_on_the_same_node(mpi_group) is False: pytest.skip("cannot use nvlink/nvls for cross node") @@ -150,7 +152,7 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) @@ -185,7 +187,7 @@ def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20, 27]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20, 27]]) @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, nelem: int, device: str): # this test starts with a random tensor on rank 0 and rotates it all the way through all ranks @@ -287,7 +289,7 @@ def __init__( scratch=None, fifo=None, nvls_mem_handle=None, - nvls_buffer_size=None + nvls_buffer_size=None, ): file_dir = os.path.dirname(os.path.abspath(__file__)) if test_name == "h2d_semaphore": @@ -363,7 +365,11 @@ def __init__( self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": - self.params = nvls_mem_handle.device_handle().raw + pack(self._d_semaphore_or_channels) + pack(my_rank, nranks, nvls_buffer_size) + self.params = ( + nvls_mem_handle.device_handle().raw + + pack(self._d_semaphore_or_channels) + + pack(my_rank, nranks, nvls_buffer_size) + ) def __call__(self): return self._kernel.launch_kernel(self.params, self.nblocks, self.nthreads, 0, 
None) @@ -403,7 +409,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") @@ -451,7 +457,7 @@ def test_fifo( @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_group_and_connection(mpi_group, transport) @@ -500,7 +506,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): @@ -543,10 +549,11 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u group.barrier() assert cp.array_equal(memory, memory_expected) + @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") - nbytes = 2**21 + nbytes = 2 ** 21 mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") @@ -558,7 +565,7 @@ def test_nvls(mpi_group: MpiGroup): nranks=group.nranks, nvls_mem_handle=mem_handle, nvls_buffer_size=nbytes, - semaphore_or_channels=semaphores + semaphore_or_channels=semaphores, ) kernel() diff --git a/src/connection.cc b/src/connection.cc index 24b043396..889dbe069 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -10,7 +10,6 @@ #include #include -#include "api.h" #include "debug.h" #include "endpoint.hpp" #include "infiniband/verbs.h" @@ -231,8 +230,7 @@ std::shared_ptr NvlsConnection::allocate return std::make_shared(mem, mcPtr, size); } -MSCCLPP_API_CPP NvlsConnection::DeviceMulticastPointer::DeviceHandle -NvlsConnection::DeviceMulticastPointer::deviceHandle() { +NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { NvlsConnection::DeviceMulticastPointer::DeviceHandle device; device.devicePtr = this->deviceMem_->devicePtr_; device.mcPtr = this->mcPtr_.get(); @@ -240,6 +238,10 @@ NvlsConnection::DeviceMulticastPointer::deviceHandle() { return device; }; +char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; + +size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->minMcGran_; } + // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) From 96303cebdd2ed8f043edf26f0860f1b1e8353ef1 Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:34:16 +0000 Subject: [PATCH 48/67] all works for h100 --- python/mscclpp_benchmark/allreduce_bench.py | 22 +++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index a8faa7a68..e4b854594 100644 --- 
a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -152,16 +152,18 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - # if memory.nbytes < 2**20: - # mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - # else: - # mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) - # else: - # proxy_service = ProxyService() - # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - # proxy_service.start_proxy() + # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + # memory = mscclpp_call.get_memory() + if memory.nbytes < 2**20: + mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) + elif memory.nbytes < 2**21: + mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + else: + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() + # proxy_service = ProxyService() + # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + # proxy_service.start_proxy() else: if memory.nbytes < 2 ** 22: proxy_service = ProxyService() From feca28f1ef0bed0fd4d4ee074dca291d269f9e3b Mon Sep 17 00:00:00 2001 From: Saeed Maleki Date: Mon, 22 Jan 2024 06:57:41 +0000 Subject: [PATCH 49/67] lint --- python/mscclpp_benchmark/allreduce_bench.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index e4b854594..12c6bf7f0 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -154,9 +154,9 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) # memory = mscclpp_call.get_memory() - if memory.nbytes < 2**20: + if memory.nbytes < 2 ** 20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21: + elif memory.nbytes < 2 ** 21: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) From 21820a69f91c157d153b2a97f39c2bf88b39ec8c Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 22 Jan 2024 10:01:11 +0000 Subject: [PATCH 50/67] works --- src/connection.cc | 114 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/src/connection.cc b/src/connection.cc index 889dbe069..df7c6c669 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -14,7 +14,6 @@ #include "endpoint.hpp" #include "infiniband/verbs.h" #include "npkit/npkit.h" -#include "registered_memory.hpp" namespace mscclpp { @@ -98,7 +97,7 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { // NVLS -struct NvlsConnection::Impl { +struct NvlsConnection::Impl : public std::enable_shared_from_this { CUmemGenericAllocationHandle mcHandle_; size_t bufferSize_; CUmulticastObjectProp mcProp_; @@ -107,11 +106,12 @@ struct NvlsConnection::Impl { // These are only defined for multicast (NVLS) capability pid_t rootPid_; int mcFileDesc_; - size_t offset_; - std::vector>> physicalMemoryStorage; + + std::list> allocatedRanges_; + std::list> freeRanges_; // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) : offset_(0) { + Impl(size_t bufferSize, int numDevices) { minMcGran_ = 0; mcGran_ = 0; mcProp_ = {}; @@ -126,6 +126,7 @@ 
struct NvlsConnection::Impl { mcFileDesc_ = 0; MSCCLPP_CUTHROW( cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + freeRanges_.emplace_back(0, bufferSize_); rootPid_ = getpid(); if (rootPid_ < 0) { @@ -136,11 +137,21 @@ struct NvlsConnection::Impl { mcProp_.size, minMcGran_, mcGran_); } - Impl(const std::vector& data) : offset_(0) { + Impl(const std::vector& data) { auto it = data.begin(); - std::copy_n(it, sizeof(*this), reinterpret_cast(this)); - - // TODO: proper throw + std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); + it += sizeof(this->mcHandle_); + std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); + it += sizeof(this->bufferSize_); + std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); + it += sizeof(this->minMcGran_); + std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); + it += sizeof(this->mcGran_); + std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); + it += sizeof(this->rootPid_); + std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); + + freeRanges_.emplace_back(0, bufferSize_); int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); if (rootPidFd < 0) { throw mscclpp::SysError("pidfd_open() failed", errno); @@ -159,21 +170,76 @@ struct NvlsConnection::Impl { ~Impl() { // we don't need to free multicast handle object according to NCCL. + if (rootPid_ == getpid()) { + close(mcFileDesc_); + } } - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { - if (offset_ > bufferSize_) { - throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InternalError); + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + size_t allocateBuffer(size_t size) { + if (freeRanges_.empty()) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); } - if (bufferSize_ - offset_ < devBuffSize) { - throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), + [size](const std::pair& range) { return range.second >= size; }); + if (it != freeRanges_.end()) { + size_t offset = it->first; + size_t rangeSize = it->second; + if (rangeSize == size) { + freeRanges_.erase(it); + } else { + it->first += size; + it->second -= size; + } + allocatedRanges_.emplace_back(offset, size); + return offset; } + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); + } - // keepin a copy physicalMem around so that the user doesn't accidentally get rids of all of them. 
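    // Worked example of the free-range bookkeeping in allocateBuffer/freeBuffer above (sizes are
    // illustrative, using the 2^29 buffer from the benchmark):
    //   freeRanges_ = {(0, 2^29)}
    //   allocateBuffer(2^21) -> offset 0;    freeRanges_ = {(2^21, 2^29 - 2^21)}
    //   allocateBuffer(2^21) -> offset 2^21; freeRanges_ = {(2^22, 2^29 - 2^22)}
    //   freeBuffer(0, 2^21);                 freeRanges_ = {(0, 2^21), (2^22, 2^29 - 2^22)}
    //   freeBuffer(2^21, 2^21);              // merges with both neighbors -> {(0, 2^29)}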
- physicalMemoryStorage.push_back(physicalMem); + void freeBuffer(size_t offset, size_t size) noexcept { + auto it = std::find_if(allocatedRanges_.begin(), allocatedRanges_.end(), + [offset, size](const std::pair& range) { + return range.first == offset && range.second == size; + }); + if (it == allocatedRanges_.end()) { + return; + } + allocatedRanges_.erase(it); + it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, size](const std::pair& range) { + return range.first + range.second >= offset; + }); + if (it == freeRanges_.end()) { + freeRanges_.emplace_back(offset, size); + return; + } + if (it->first + it->second == offset) { + // merge with the previous free range if possible + it->second += size; + // merge with the next free range if possible + auto nextItr = std::next(it); + if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { + it->second += nextItr->second; + freeRanges_.erase(nextItr); + } + return; + } else if (it->first == offset + size) { + // merge with the next free range if possible + it->first -= size; + it->second += size; + return; + } else { + freeRanges_.emplace(it, offset, size); + return; + } + } + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { + size_t offset = allocateBuffer(devBuffSize); MSCCLPP_CUTHROW( - cuMulticastBindMem(mcHandle_, offset_ /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); char* mcPtr; @@ -187,15 +253,14 @@ struct NvlsConnection::Impl { MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - // Is this enough? 
Or we should update the offset as well - auto deleter = [=, bindOffset = offset_](char* ptr) { + auto deleter = [=, self = shared_from_this()](char* ptr) { CUdevice device; MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, bindOffset, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); + self->freeBuffer(offset, devBuffSize); }; - offset_ += devBuffSize; return std::shared_ptr(mcPtr, deleter); } @@ -219,7 +284,12 @@ NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make std::vector NvlsConnection::serialize() { std::vector result; - std::copy_n(reinterpret_cast(pimpl_.get()), sizeof(*pimpl_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcHandle_), sizeof(pimpl_->mcHandle_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->bufferSize_), sizeof(pimpl_->bufferSize_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->minMcGran_), sizeof(pimpl_->minMcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcGran_), sizeof(pimpl_->mcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(result)); return result; } From 20f3b0f1a570be4f1dd482aa89a62dd10f2f305d Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 04:12:23 +0000 Subject: [PATCH 51/67] clean up --- include/mscclpp/core.hpp | 2 - include/mscclpp/gpu_utils.hpp | 10 +- src/CMakeLists.txt | 2 +- src/connection.cc | 217 --------------------------- src/nvls_connection.cu | 270 ++++++++++++++++++++++++++++++++++ 5 files changed, 279 insertions(+), 222 deletions(-) create mode 100644 src/nvls_connection.cu diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index d7e6467b0..f208c9a9b 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -480,12 +480,10 @@ class NvlsConnection { }; std::shared_ptr allocateAndBindCuda(size_t size); - size_t getMultiCastMinGranularity(); private: struct Impl; - std::shared_ptr pimpl_; }; diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index b3b8ec7bb..5e4e1c625 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -228,7 +228,7 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// The deallocation only happens PhysicalCudaMemory goes out of scope. /// @tparam T Type of each element in the allocated memory. /// @param count Number of elements to allocate. -/// @param gran the granularity forof the allocation. +/// @param gran the granularity of the allocation. /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { @@ -264,8 +264,14 @@ UniqueCudaPtr allocUniqueCuda(size_t count = 1) { return detail::safeAlloc, CudaDeleter, UniqueCudaPtr>(count); } +/// Allocated physical memory on the device and returns a memory handle along with a virtual memory handle for it. +/// The memory is zeroed out. +/// @tparam T Type of each element in the allocated memory. +/// @param count Number of elements to allocate. +/// @param gran the granularity of the allocation. 
+/// @return A std::unique_ptr to the memory handle and a device pointer for that memory. template -std::shared_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { +std::unique_ptr> allocUniquePhysicalCuda(size_t count, size_t gran) { return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::unique_ptr, CudaDeleter>>>(count, gran); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index cfbcc927a..45b4075d2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/connection.cc b/src/connection.cc index df7c6c669..f89b96138 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -95,223 +95,6 @@ void CudaIpcConnection::flush(int64_t timeoutUsec) { INFO(MSCCLPP_P2P, "CudaIpcConnection flushing connection"); } -// NVLS - -struct NvlsConnection::Impl : public std::enable_shared_from_this { - CUmemGenericAllocationHandle mcHandle_; - size_t bufferSize_; - CUmulticastObjectProp mcProp_; - size_t minMcGran_; - size_t mcGran_; - // These are only defined for multicast (NVLS) capability - pid_t rootPid_; - int mcFileDesc_; - - std::list> allocatedRanges_; - std::list> freeRanges_; - - // use this only for the root of the NVLS - Impl(size_t bufferSize, int numDevices) { - minMcGran_ = 0; - mcGran_ = 0; - mcProp_ = {}; - mcProp_.size = bufferSize; - mcProp_.numDevices = numDevices; - mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); - MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; - bufferSize_ = mcProp_.size; - MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); - mcFileDesc_ = 0; - MSCCLPP_CUTHROW( - cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); - freeRanges_.emplace_back(0, bufferSize_); - - rootPid_ = getpid(); - if (rootPid_ < 0) { - throw mscclpp::SysError("getpid() failed", errno); - } - - INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", - mcProp_.size, minMcGran_, mcGran_); - } - - Impl(const std::vector& data) { - auto it = data.begin(); - std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); - it += sizeof(this->mcHandle_); - std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); - it += sizeof(this->bufferSize_); - std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); - it += sizeof(this->minMcGran_); - std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); - it += sizeof(this->mcGran_); - std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); - it += sizeof(this->rootPid_); - std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); - - freeRanges_.emplace_back(0, bufferSize_); - int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); - if (rootPidFd < 0) { - throw mscclpp::SysError("pidfd_open() failed", errno); - } - int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); - if (mcRootFileDescFd < 0) { - throw mscclpp::SysError("pidfd_getfd() failed", errno); - } - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), - CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); - close(rootPidFd); - close(mcRootFileDescFd); - - INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); - } - - ~Impl() { - // we don't need to free multicast handle object according to NCCL. - if (rootPid_ == getpid()) { - close(mcFileDesc_); - } - } - - Impl(const Impl&) = delete; - Impl& operator=(const Impl&) = delete; - - size_t allocateBuffer(size_t size) { - if (freeRanges_.empty()) { - throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); - } - auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), - [size](const std::pair& range) { return range.second >= size; }); - if (it != freeRanges_.end()) { - size_t offset = it->first; - size_t rangeSize = it->second; - if (rangeSize == size) { - freeRanges_.erase(it); - } else { - it->first += size; - it->second -= size; - } - allocatedRanges_.emplace_back(offset, size); - return offset; - } - throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); - } - - void freeBuffer(size_t offset, size_t size) noexcept { - auto it = std::find_if(allocatedRanges_.begin(), allocatedRanges_.end(), - [offset, size](const std::pair& range) { - return range.first == offset && range.second == size; - }); - if (it == allocatedRanges_.end()) { - return; - } - allocatedRanges_.erase(it); - it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, size](const std::pair& range) { - return range.first + range.second >= offset; - }); - if (it == freeRanges_.end()) { - freeRanges_.emplace_back(offset, size); - return; - } - if (it->first + it->second == offset) { - // merge with the previous free range if possible - it->second += size; - // merge with the next free range if possible - auto nextItr = std::next(it); - if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { - it->second += nextItr->second; - freeRanges_.erase(nextItr); - } - return; - } else if (it->first == offset + size) { - // merge with the next free range if possible - it->first -= size; - it->second += size; - return; - } else { - freeRanges_.emplace(it, offset, size); - return; - } - } - - std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize) { - size_t offset = 
allocateBuffer(devBuffSize); - MSCCLPP_CUTHROW( - cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); - - char* mcPtr; - - CUmemAccessDesc accessDesc = {}; - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - int deviceId = -1; - MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); - accessDesc.location.id = deviceId; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); - MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); - MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); - - auto deleter = [=, self = shared_from_this()](char* ptr) { - CUdevice device; - MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); - MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); - MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); - self->freeBuffer(offset, devBuffSize); - }; - - return std::shared_ptr(mcPtr, deleter); - } -}; - -NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) - : pimpl_(std::make_shared(bufferSize, numDevices)) {} - -void NvlsConnection::addDevice() { - int cudaDeviceId; - MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); - this->addDevice(cudaDeviceId); -} - -void NvlsConnection::addDevice(int cudaDeviceId) { - MSCCLPP_CUTHROW(cuMulticastAddDevice(pimpl_->mcHandle_, cudaDeviceId)); - INFO(MSCCLPP_COLL, "NVLS connection created"); -} - -NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} - -std::vector NvlsConnection::serialize() { - std::vector result; - std::copy_n(reinterpret_cast(&pimpl_->mcHandle_), sizeof(pimpl_->mcHandle_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->bufferSize_), sizeof(pimpl_->bufferSize_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->minMcGran_), sizeof(pimpl_->minMcGran_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->mcGran_), sizeof(pimpl_->mcGran_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->rootPid_), sizeof(pimpl_->rootPid_), std::back_inserter(result)); - std::copy_n(reinterpret_cast(&pimpl_->mcFileDesc_), sizeof(pimpl_->mcFileDesc_), std::back_inserter(result)); - return result; -} - -// TODO: we need to atuo delete the memory we multicast pointer is no used anymore -std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { - auto mem = allocSharedPhysicalCuda(size, pimpl_->minMcGran_); - auto mcPtr = pimpl_->bindMemory(mem, size); - return std::make_shared(mem, mcPtr, size); -} - -NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { - NvlsConnection::DeviceMulticastPointer::DeviceHandle device; - device.devicePtr = this->deviceMem_->devicePtr_; - device.mcPtr = this->mcPtr_.get(); - device.bufferSize = this->bufferSize_; - return device; -}; - -char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; - -size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->minMcGran_; } - // IBConnection IBConnection::IBConnection(Endpoint localEndpoint, Endpoint remoteEndpoint, Context& context) diff --git a/src/nvls_connection.cu b/src/nvls_connection.cu new file mode 100644 index 000000000..f6655fb82 --- /dev/null +++ b/src/nvls_connection.cu @@ -0,0 +1,270 @@ +// Copyright (c) Microsoft 
Corporation. +// Licensed under the MIT license. + +#include +#include + +#include +#include +#include + +#include "debug.h" +#include "endpoint.hpp" + +namespace mscclpp { + +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) +class NvlsConnection::Impl : public std::enable_shared_from_this { + public: + // use this only for the root of the NVLS + Impl(size_t bufferSize, int numDevices); + Impl(const std::vector& data); + ~Impl(); + + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + size_t getMinMcGran() { return minMcGran_; } + std::vector serialize(); + void addDevice(int cudaDeviceId); + size_t allocateBuffer(size_t size); + void freeBuffer(size_t offset, size_t size) noexcept; + std::shared_ptr bindMemory(std::shared_ptr> physicalMem, size_t devBuffSize); + + private: + friend class NvlsConnection; + CUmemGenericAllocationHandle mcHandle_; + CUmulticastObjectProp mcProp_; + size_t bufferSize_; + size_t minMcGran_; + size_t mcGran_; + // These are only defined for multicast (NVLS) capability + pid_t rootPid_; + int mcFileDesc_; + + std::list> allocatedRanges_; + std::list> freeRanges_; +}; + +NvlsConnection::Impl::Impl(size_t bufferSize, int numDevices) { + minMcGran_ = 0; + mcGran_ = 0; + mcProp_ = {}; + mcProp_.size = bufferSize; + mcProp_.numDevices = numDevices; + mcProp_.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&minMcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_MINIMUM)); + MSCCLPP_CUTHROW(cuMulticastGetGranularity(&mcGran_, &mcProp_, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + mcProp_.size = ((mcProp_.size + minMcGran_ - 1) / minMcGran_) * minMcGran_; + bufferSize_ = mcProp_.size; + MSCCLPP_CUTHROW(cuMulticastCreate(&mcHandle_, &mcProp_)); + mcFileDesc_ = 0; + MSCCLPP_CUTHROW( + cuMemExportToShareableHandle(&mcFileDesc_, mcHandle_, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/)); + freeRanges_.emplace_back(0, bufferSize_); + + rootPid_ = getpid(); + if (rootPid_ < 0) { + throw mscclpp::SysError("getpid() failed", errno); + } + + INFO(MSCCLPP_COLL, "NVLS handle created on root with size %ld. 
minGranularity %ld and recommendedGranularity %ld\n", + mcProp_.size, minMcGran_, mcGran_); +} + +NvlsConnection::Impl::Impl(const std::vector& data) { + auto it = data.begin(); + std::copy_n(it, sizeof(this->mcHandle_), reinterpret_cast(&this->mcHandle_)); + it += sizeof(this->mcHandle_); + std::copy_n(it, sizeof(this->bufferSize_), reinterpret_cast(&this->bufferSize_)); + it += sizeof(this->bufferSize_); + std::copy_n(it, sizeof(this->minMcGran_), reinterpret_cast(&this->minMcGran_)); + it += sizeof(this->minMcGran_); + std::copy_n(it, sizeof(this->mcGran_), reinterpret_cast(&this->mcGran_)); + it += sizeof(this->mcGran_); + std::copy_n(it, sizeof(this->rootPid_), reinterpret_cast(&this->rootPid_)); + it += sizeof(this->rootPid_); + std::copy_n(it, sizeof(this->mcFileDesc_), reinterpret_cast(&this->mcFileDesc_)); + + freeRanges_.emplace_back(0, bufferSize_); + int rootPidFd = syscall(SYS_pidfd_open, rootPid_, 0); + if (rootPidFd < 0) { + throw mscclpp::SysError("pidfd_open() failed", errno); + } + int mcRootFileDescFd = syscall(SYS_pidfd_getfd, rootPidFd, mcFileDesc_, 0); + if (mcRootFileDescFd < 0) { + throw mscclpp::SysError("pidfd_getfd() failed", errno); + } + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&mcHandle_, reinterpret_cast(mcRootFileDescFd), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(rootPidFd); + close(mcRootFileDescFd); + + INFO(MSCCLPP_COLL, "NVLS handle was imported from root"); +} + +NvlsConnection::Impl::~Impl() { + // we don't need to free multicast handle object according to NCCL. + if (rootPid_ == getpid()) { + close(mcFileDesc_); + } +} + +std::vector NvlsConnection::Impl::serialize() { + std::vector result; + std::copy_n(reinterpret_cast(&mcHandle_), sizeof(mcHandle_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&bufferSize_), sizeof(bufferSize_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&minMcGran_), sizeof(minMcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&mcGran_), sizeof(mcGran_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&rootPid_), sizeof(rootPid_), std::back_inserter(result)); + std::copy_n(reinterpret_cast(&mcFileDesc_), sizeof(mcFileDesc_), std::back_inserter(result)); + return result; +} + +void NvlsConnection::Impl::addDevice(int cudaDeviceId) { + MSCCLPP_CUTHROW(cuMulticastAddDevice(mcHandle_, cudaDeviceId)); + INFO(MSCCLPP_COLL, "NVLS connection created"); +} + +size_t NvlsConnection::Impl::allocateBuffer(size_t size) { + if (freeRanges_.empty()) { + throw Error("This NVLS connection mapped more than it was supposed to", ErrorCode::InvalidUsage); + } + auto it = std::find_if(freeRanges_.begin(), freeRanges_.end(), + [size](const std::pair& range) { return range.second >= size; }); + if (it != freeRanges_.end()) { + size_t offset = it->first; + size_t rangeSize = it->second; + if (rangeSize == size) { + freeRanges_.erase(it); + } else { + it->first += size; + it->second -= size; + } + allocatedRanges_.emplace_back(offset, size); + return offset; + } + throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); +} + +void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept { + auto it = std::find_if( + allocatedRanges_.begin(), allocatedRanges_.end(), + [offset, size](const std::pair& range) { return range.first == offset && range.second == size; }); + if (it == allocatedRanges_.end()) { + return; + } + allocatedRanges_.erase(it); + it = std::find_if(freeRanges_.begin(), freeRanges_.end(), [offset, 
size](const std::pair& range) { + return range.first + range.second >= offset; + }); + if (it == freeRanges_.end()) { + freeRanges_.emplace_back(offset, size); + return; + } + if (it->first + it->second == offset) { + // merge with the previous free range if possible + it->second += size; + // merge with the next free range if possible + auto nextItr = std::next(it); + if (nextItr != freeRanges_.end() && it->first + it->second == nextItr->first) { + it->second += nextItr->second; + freeRanges_.erase(nextItr); + } + return; + } else if (it->first == offset + size) { + // merge with the next free range if possible + it->first -= size; + it->second += size; + return; + } else { + freeRanges_.emplace(it, offset, size); + return; + } +} + +std::shared_ptr NvlsConnection::Impl::bindMemory(std::shared_ptr> physicalMem, + size_t devBuffSize) { + size_t offset = allocateBuffer(devBuffSize); + MSCCLPP_CUTHROW( + cuMulticastBindMem(mcHandle_, offset /*mcOffset*/, physicalMem->memHandle_, 0 /*memOffset*/, devBuffSize, 0)); + + char* mcPtr; + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + int deviceId = -1; + MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); + accessDesc.location.id = deviceId; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)(&mcPtr), devBuffSize, minMcGran_, 0U, 0)); + MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)(mcPtr), devBuffSize, 0, mcHandle_, 0)); + MSCCLPP_CUTHROW(cuMemSetAccess((CUdeviceptr)(mcPtr), devBuffSize, &accessDesc, 1)); + + auto deleter = [=, self = shared_from_this()](char* ptr) { + CUdevice device; + MSCCLPP_CUTHROW(cuDeviceGet(&device, deviceId)); + MSCCLPP_CUTHROW(cuMemUnmap((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMemAddressFree((CUdeviceptr)ptr, devBuffSize)); + MSCCLPP_CUTHROW(cuMulticastUnbind(mcHandle_, device, offset, devBuffSize)); + self->freeBuffer(offset, devBuffSize); + }; + + return std::shared_ptr(mcPtr, deleter); +} +#else +class NvlsConnection::Impl { + public: + // use this only for the root of the NVLS + Impl(size_t, int) { throw notSupportedError; } + Impl(const std::vector&) { throw notSupportedError; } + + Impl(const Impl&) = delete; + Impl& operator=(const Impl&) = delete; + + std::vector serialize() { throw notSupportedError; } + size_t allocateBuffer(size_t) { throw notSupportedError; } + void freeBuffer(size_t, size_t) { throw notSupportedError; } + std::shared_ptr bindMemory(std::shared_ptr>, size_t) { throw notSupportedError; } + void addDevice(int) { throw notSupportedError; } + size_t getMinMcGran() { throw notSupportedError; } + + private: + Error notSupportedError = Error("NVLS is not supported on this CUDA version", ErrorCode::InvalidUsage); +}; +#endif + +NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) + : pimpl_(std::make_shared(bufferSize, numDevices)) {} + +void NvlsConnection::addDevice() { + int cudaDeviceId; + MSCCLPP_CUDATHROW(cudaGetDevice(&cudaDeviceId)); + this->addDevice(cudaDeviceId); +} + +void NvlsConnection::addDevice(int cudaDeviceId) { pimpl_->addDevice(cudaDeviceId); } + +NvlsConnection::NvlsConnection(const std::vector& data) : pimpl_(std::make_shared(data)) {} + +std::vector NvlsConnection::serialize() { return pimpl_->serialize(); } + +std::shared_ptr NvlsConnection::allocateAndBindCuda(size_t size) { + auto mem = allocSharedPhysicalCuda(size, pimpl_->getMinMcGran()); + auto mcPtr = pimpl_->bindMemory(mem, size); + return std::make_shared(mem, mcPtr, size); +} + 
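allocateAndBindCuda() ties the pieces above together: it allocates physical device memory at the multicast minimum granularity, binds it into the multicast object through Impl::bindMemory, and returns a DeviceMulticastPointer that owns both the unicast and the multicast mapping. A minimal host-side sketch of the intended call sequence follows; it is illustrative only and assumes an initialized Communicator, an NVLS EndpointConfig whose nvlsBufferSize covers the allocation, and a size that is a multiple of getMultiCastMinGranularity(). The names allocateNvlsBuffer, comm, nvlsConfig, and nvlsBufferBytes are placeholders.

    #include <mscclpp/core.hpp>
    #include <utility>
    #include <vector>

    // Sketch: connect all ranks to one NVLS multicast group and carve a buffer out of it.
    // Every rank in allRanks is assumed to call this collectively.
    auto allocateNvlsBuffer(mscclpp::Communicator& comm, const std::vector<int>& allRanks,
                            mscclpp::EndpointConfig nvlsConfig, size_t nvlsBufferBytes) {
      auto nvls = comm.connctNvlsCollective(allRanks, nvlsConfig);  // spelling as in this series
      auto buffer = nvls->allocateAndBindCuda(nvlsBufferBytes);     // physical alloc + multicast bind
      // buffer->deviceHandle() yields {devicePtr, mcPtr, bufferSize}; copy it to the GPU and use
      // multimem loads/stores on mcPtr inside a kernel. Keep nvls and buffer alive meanwhile.
      return std::make_pair(nvls, buffer);
    }
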
+NvlsConnection::DeviceMulticastPointer::DeviceHandle NvlsConnection::DeviceMulticastPointer::deviceHandle() { + NvlsConnection::DeviceMulticastPointer::DeviceHandle device; + device.devicePtr = this->deviceMem_->devicePtr_; + device.mcPtr = this->mcPtr_.get(); + device.bufferSize = this->bufferSize_; + return device; +}; + +char* NvlsConnection::DeviceMulticastPointer::getDevicePtr() { return deviceMem_->devicePtr_; }; + +size_t NvlsConnection::getMultiCastMinGranularity() { return pimpl_->getMinMcGran(); } + +} // namespace mscclpp From 2ec813e60dac497db065445e5b24b6e30288b92f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 04:16:51 +0000 Subject: [PATCH 52/67] lint --- python/mscclpp_benchmark/allreduce_bench.py | 12 ++++++------ python/test/test_mscclpp.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 12c6bf7f0..4e9aeca0b 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -154,9 +154,9 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) # memory = mscclpp_call.get_memory() - if memory.nbytes < 2 ** 20: + if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2 ** 21: + elif memory.nbytes < 2**21: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) @@ -165,7 +165,7 @@ def run_benchmark( # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) # proxy_service.start_proxy() else: - if memory.nbytes < 2 ** 22: + if memory.nbytes < 2**22: proxy_service = ProxyService() mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) proxy_service.start_proxy() @@ -260,13 +260,13 @@ def run_benchmark( speed_ups = [] for i in range(10, 28): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - nelems = 2 ** i + nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: - nelems = 3 * 2 ** i + nelems = 3 * 2**i else: raise RuntimeError("Only support one node/two nodes communication") - if nelems * data_type().itemsize > 2 ** 32: + if nelems * data_type().itemsize > 2**32: break # due to trigger bit width limitation, we can only support up to 2**32 size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 45f11574c..bebd752bc 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -152,7 +152,7 @@ def test_group_with_connections(mpi_group: MpiGroup, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int): group, connections = create_group_and_connection(mpi_group, transport) memory = cp.zeros(nelem, dtype=cp.int32) @@ -187,7 +187,7 @@ def test_connection_write(mpi_group: MpiGroup, transport: Transport, nelem: int) @parametrize_mpi_groups(2, 4, 8, 16) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20, 27]]) +@pytest.mark.parametrize("nelem", [2**i 
for i in [10, 15, 20, 27]]) @pytest.mark.parametrize("device", ["cuda", "cpu"]) def test_connection_write_and_signal(mpi_group: MpiGroup, transport: Transport, nelem: int, device: str): # this test starts with a random tensor on rank 0 and rotates it all the way through all ranks @@ -409,7 +409,7 @@ def test_d2d_semaphores(mpi_group: MpiGroup): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("use_packet", [False, True]) def test_sm_channels(mpi_group: MpiGroup, nelem: int, use_packet: bool): group, connections = create_group_and_connection(mpi_group, "NVLink") @@ -457,7 +457,7 @@ def test_fifo( @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["IB", "NVLink"]) def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): group, connections = create_group_and_connection(mpi_group, transport) @@ -506,7 +506,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): @parametrize_mpi_groups(2, 4, 8, 16) -@pytest.mark.parametrize("nelem", [2 ** i for i in [10, 15, 20]]) +@pytest.mark.parametrize("nelem", [2**i for i in [10, 15, 20]]) @pytest.mark.parametrize("transport", ["NVLink", "IB"]) @pytest.mark.parametrize("use_packet", [False, True]) def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, use_packet: bool): @@ -553,7 +553,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") - nbytes = 2 ** 21 + nbytes = 2**21 mem_handle = nvls_connection.allocate_bind_memory(nbytes) nvlinks_connections = create_connection(group, "NVLink") From f493d22e51d30b5f57c877490fcafb48acb0b0d0 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 05:17:26 +0000 Subject: [PATCH 53/67] pass build --- include/mscclpp/core.hpp | 2 +- src/nvls_connection.cu | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index f208c9a9b..8bf9e7987 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -483,7 +483,7 @@ class NvlsConnection { size_t getMultiCastMinGranularity(); private: - struct Impl; + class Impl; std::shared_ptr pimpl_; }; diff --git a/src/nvls_connection.cu b/src/nvls_connection.cu index f6655fb82..9938578e9 100644 --- a/src/nvls_connection.cu +++ b/src/nvls_connection.cu @@ -1,6 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
- +#include #include #include @@ -12,8 +12,7 @@ #include "endpoint.hpp" namespace mscclpp { - -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS From 9e5f0e679ff5f84d7a8c5484ea62b1bcf41f631f Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 06:43:37 +0000 Subject: [PATCH 54/67] fix benchmark --- python/mscclpp_benchmark/allreduce.cu | 4 +++- python/mscclpp_benchmark/allreduce_bench.py | 24 +++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 56aeb572b..127521939 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -782,6 +782,7 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // NVLS // ------------------------------------------- +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 #define MULTIMEM_ST(val, ptr) \ asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ "r"(val.w) \ @@ -837,4 +838,5 @@ extern "C" __global__ void __launch_bounds__(1024, 1) } } deviceSyncer.sync(gridDim.x); -} \ No newline at end of file +} +#endif diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 4e9aeca0b..278d52251 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -79,6 +79,11 @@ def human_readable_size(size, decimal_places=1): return f"{size:.{decimal_places}f} {unit}" +def is_nvls_enabled(): + compute_capability = cp.cuda.Device().compute_capability + return not cp.cuda.runtime.is_hip and compute_capability >= "90" + + def check_correctness(memory, func, niter=100): ac = True for p in range(niter): @@ -152,18 +157,18 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: - # mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - # memory = mscclpp_call.get_memory() if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21: + elif memory.nbytes < 2**21 if is_nvls_enabled() else memory.nbytes < 2**29: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - # proxy_service = ProxyService() - # mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - # proxy_service.start_proxy() + if is_nvls_enabled(): + mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) + memory = mscclpp_call.get_memory() + else: + proxy_service = ProxyService() + mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) + proxy_service.start_proxy() else: if memory.nbytes < 2**22: proxy_service = ProxyService() @@ -258,7 +263,8 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - for i in range(10, 28): + end_range = 28 if is_nvls_enabled() else 29 + for i in range(10, end_range): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i elif MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 2: From 2eb20aff3be3a20bea7a40bb26d1126526a40c71 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 08:38:42 +0000 Subject: [PATCH 55/67] pass test --- 
include/mscclpp/utils.hpp | 2 ++ python/mscclpp/__init__.py | 1 + python/mscclpp/utils.py | 3 +++ python/mscclpp/utils_py.cpp | 1 + python/mscclpp_benchmark/allreduce_bench.py | 13 ++++--------- python/test/test_mscclpp.py | 6 +++--- src/CMakeLists.txt | 2 +- src/{nvls_connection.cu => nvls_connection.cc} | 4 ++-- src/utils.cc | 12 ++++++++++++ 9 files changed, 29 insertions(+), 15 deletions(-) rename src/{nvls_connection.cu => nvls_connection.cc} (98%) diff --git a/include/mscclpp/utils.hpp b/include/mscclpp/utils.hpp index c8ef3d271..80b3bf39d 100644 --- a/include/mscclpp/utils.hpp +++ b/include/mscclpp/utils.hpp @@ -37,6 +37,8 @@ struct ScopedTimer : public Timer { std::string getHostName(int maxlen, const char delim); +bool isNvlsSupported(); + } // namespace mscclpp #endif // MSCCLPP_UTILS_HPP_ diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index d411bc1b0..0e3618591 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -19,6 +19,7 @@ Transport, TransportFlags, version, + is_nvls_supported, ) __version__ = version() diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 9f71b70c4..762dd24d3 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -153,3 +153,6 @@ def pack(*args): else: raise RuntimeError(f"Unsupported type: {type(arg)}") return res + +def is_nvls_supported(): + return cp.cuda.runtime.runtimeGetVersion() >= 12010 diff --git a/python/mscclpp/utils_py.cpp b/python/mscclpp/utils_py.cpp index 16800a752..e9e847ee8 100644 --- a/python/mscclpp/utils_py.cpp +++ b/python/mscclpp/utils_py.cpp @@ -20,4 +20,5 @@ void register_utils(nb::module_& m) { nb::class_(m, "ScopedTimer").def(nb::init(), nb::arg("name")); m.def("get_host_name", &getHostName, nb::arg("maxlen"), nb::arg("delim")); + m.def("is_nvls_supported", &isNvlsSupported); } diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 278d52251..c141a9e6b 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -14,7 +14,7 @@ from mpi4py import MPI import cupy.cuda.nccl as nccl import mscclpp.comm as mscclpp_comm -from mscclpp import ProxyService +from mscclpp import ProxyService, is_nvls_supported from prettytable import PrettyTable import netifaces as ni @@ -79,11 +79,6 @@ def human_readable_size(size, decimal_places=1): return f"{size:.{decimal_places}f} {unit}" -def is_nvls_enabled(): - compute_capability = cp.cuda.Device().compute_capability - return not cp.cuda.runtime.is_hip and compute_capability >= "90" - - def check_correctness(memory, func, niter=100): ac = True for p in range(niter): @@ -159,10 +154,10 @@ def run_benchmark( if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: if memory.nbytes < 2**20: mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21 if is_nvls_enabled() else memory.nbytes < 2**29: + elif memory.nbytes < 2**21 if is_nvls_supported() else memory.nbytes < 2**29: mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) else: - if is_nvls_enabled(): + if is_nvls_supported(): mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) memory = mscclpp_call.get_memory() else: @@ -263,7 +258,7 @@ def run_benchmark( mscclpp_algbw = [] nccl_algbw = [] speed_ups = [] - end_range = 28 if is_nvls_enabled() else 29 + end_range = 28 if is_nvls_supported() else 29 for i in range(10, end_range): if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: nelems = 2**i diff --git 
a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index bebd752bc..62c2619ad 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -19,6 +19,7 @@ SmDevice2DeviceSemaphore, TcpBootstrap, Transport, + is_nvls_supported, ) import mscclpp.comm as mscclpp_comm from mscclpp.utils import KernelBuilder, pack @@ -361,8 +362,6 @@ def __init__( elif test_name == "fifo": self.params = fifo.device_handle().raw elif test_name == "proxy": - semaphore_device_handles = [semaphore.device_handle().raw for semaphore in semaphore_or_channels] - self._d_semaphore_or_channels = cp.asarray(memoryview(b"".join(semaphore_device_handles)), dtype=cp.uint8) self.params = pack(my_rank, nranks) + fifo.raw + pack(self._d_semaphore_or_channels) elif test_name == "nvls": self.params = ( @@ -494,7 +493,7 @@ def test_proxy(mpi_group: MpiGroup, nelem: int, transport: str): fifo_device_handle = proxy.fifo_device_handle() kernel = MscclppKernel( - "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=list_sem, fifo=fifo_device_handle + "proxy", my_rank=group.my_rank, nranks=group.nranks, semaphore_or_channels=semaphores, fifo=fifo_device_handle ) proxy.start() group.barrier() @@ -551,6 +550,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u @parametrize_mpi_groups(8) +@pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") nbytes = 2**21 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45b4075d2..cfbcc927a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc *.cu) +file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS *.cc) target_sources(mscclpp_obj PRIVATE ${SOURCES}) target_include_directories(mscclpp_obj PRIVATE include) diff --git a/src/nvls_connection.cu b/src/nvls_connection.cc similarity index 98% rename from src/nvls_connection.cu rename to src/nvls_connection.cc index 9938578e9..136a0a421 100644 --- a/src/nvls_connection.cu +++ b/src/nvls_connection.cc @@ -4,7 +4,6 @@ #include #include -#include #include #include @@ -12,7 +11,8 @@ #include "endpoint.hpp" namespace mscclpp { -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 12) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) + +#if (CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS diff --git a/src/utils.cc b/src/utils.cc index 7153d55c5..627df2df7 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -66,4 +67,15 @@ std::string getHostName(int maxlen, const char delim) { return hostname.substr(0, i); } +bool isNvlsSupported() { +#if (CUDART_VERSION >= 12010) + CUdevice dev; + int nvlsSupport; + MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + return nvlsSupport == 1; +#endif + return false; +} + } // namespace mscclpp From 8e32bd2641de64d8192e423bc9a12700f40739ce Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 09:00:14 +0000 Subject: [PATCH 56/67] fix --- python/mscclpp/utils.py | 1 + src/nvls_connection.cc | 1 + src/utils.cc | 6 +++--- 3 files changed, 5 insertions(+), 3 
deletions(-) diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 762dd24d3..79f28d01a 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -154,5 +154,6 @@ def pack(*args): raise RuntimeError(f"Unsupported type: {type(arg)}") return res + def is_nvls_supported(): return cp.cuda.runtime.runtimeGetVersion() >= 12010 diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 136a0a421..08f31e36e 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -4,6 +4,7 @@ #include #include +#include #include #include diff --git a/src/utils.cc b/src/utils.cc index 627df2df7..8475f2f60 100644 --- a/src/utils.cc +++ b/src/utils.cc @@ -70,10 +70,10 @@ std::string getHostName(int maxlen, const char delim) { bool isNvlsSupported() { #if (CUDART_VERSION >= 12010) CUdevice dev; - int nvlsSupport; + int isNvlsSupported; MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); - MSCCLPP_CUTHROW(cuDeviceGetAttribute(&nvlsSupport, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - return nvlsSupport == 1; + MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isNvlsSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); + return isNvlsSupported == 1; #endif return false; } From 5ecb01fe692d1b4a22b2a7c2f55d278126de8a03 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 10:21:31 +0000 Subject: [PATCH 57/67] clean up --- include/mscclpp/core.hpp | 9 ++++++++- include/mscclpp/gpu_utils.hpp | 10 +++++----- python/mscclpp/utils.py | 4 ---- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index 8bf9e7987..af969108c 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -698,7 +698,14 @@ class Communicator { /// to the connection. NonblockingFuture> connectOnSetup(int remoteRank, int tag, EndpointConfig localConfig); - /// TBD + /// Connect to NVLS on setup. + /// + /// This function used to connect to NVLS on setup. NVLS collective using multicast operations to send/recv data. + /// Here we need to put all involved ranks into the collective group. + /// + /// @param allRanks The ranks of all processes involved in the collective. + /// @param config The configuration for the local endpoint. + /// @return std::shared_ptr A shared pointer to the NVLS connection. std::shared_ptr connctNvlsCollective(std::vector allRanks, EndpointConfig config); /// Get the remote rank a connection is connected to. diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 5e4e1c625..909ccd821 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -168,6 +168,11 @@ Memory safeAlloc(size_t nelem) { template Memory safeAlloc(size_t nelem, size_t gran) { + if (nelem * sizeof(T) % gran) { + throw Error("The request allocation size is not divisible by the required granularity:" + + std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran), + ErrorCode::InvalidUsage); + } T* ptr = nullptr; try { ptr = alloc(nelem, gran); @@ -232,11 +237,6 @@ std::shared_ptr allocSharedCuda(size_t count = 1) { /// @return A std::shared_ptr to the memory handle and a device pointer for that memory. 
template std::shared_ptr> allocSharedPhysicalCuda(size_t count, size_t gran) { - if (count % gran) { - throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(count) + - " vs " + std::to_string(gran), - ErrorCode::InvalidUsage); - } return detail::safeAlloc, detail::cudaPhysicalCalloc, CudaPhysicalDeleter, std::shared_ptr>>(count, gran); } diff --git a/python/mscclpp/utils.py b/python/mscclpp/utils.py index 79f28d01a..9f71b70c4 100644 --- a/python/mscclpp/utils.py +++ b/python/mscclpp/utils.py @@ -153,7 +153,3 @@ def pack(*args): else: raise RuntimeError(f"Unsupported type: {type(arg)}") return res - - -def is_nvls_supported(): - return cp.cuda.runtime.runtimeGetVersion() >= 12010 From 292c240d61eebe0806db7a4ab1ab7de6ce8ba3c8 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Tue, 23 Jan 2024 10:26:31 +0000 Subject: [PATCH 58/67] fix --- nvls/README | 2 - nvls/test.cu | 199 -------------------------------------------- nvls/test2.cpp | 143 ------------------------------- src/communicator.cc | 11 ++- 4 files changed, 5 insertions(+), 350 deletions(-) delete mode 100644 nvls/README delete mode 100644 nvls/test.cu delete mode 100644 nvls/test2.cpp diff --git a/nvls/README b/nvls/README deleted file mode 100644 index c385affc4..000000000 --- a/nvls/README +++ /dev/null @@ -1,2 +0,0 @@ -nvcc -I/usr/lib/x86_64-linux-gnu/openmpi/include -I/usr/lib/x86_64-linux-gnu/openmpi/include/openmpi -L/usr/lib/x86_64-linux-gnu/openmpi/lib -L /usr/local/cuda/lib64/ -lmpi_cxx -lmpi -lcupti -lcupti_static test.cu -gencode arch=compute_90,code=sm_90 -lcuda -lcudart -lnccl - diff --git a/nvls/test.cu b/nvls/test.cu deleted file mode 100644 index b84f19519..000000000 --- a/nvls/test.cu +++ /dev/null @@ -1,199 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#define CUCHECK(cmd) \ - do { \ - auto err = cmd; \ - if (err != 0) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - exit(-1); \ - } \ - } while (false) - -// AR kernel snippet for sm_90 only - -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); - -__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ - uc_ptr[idx] = myrank + idx; - } -} - -__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { - for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x){ - float expected = (float)((nranks * (nranks-1)) / 2 + nranks * idx); - if (abs(uc_ptr[idx] - expected) > 0.01 * expected){ - printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); - } - } -} - - -__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { - // for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction - // line is assumed to be 16B 4 ints of 8 halves - int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; - int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; - - int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; - int my_step = blockDim.x * gridDim.x * 4; - - for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); - } -} - -int main() { - int myrank, nranks; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &nranks); - - cudaSetDevice(myrank); - CUresult res; - - size_t size = 1024 * 1024 * 512; - CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nranks; - mcProp.size = size; - mcProp.handleTypes = handleType; - - size_t minGran, gran; - gran = 0; - minGran = 0; - CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); - size_t mcSize = ((size + gran - 1) / gran) * gran; - mcProp.size = mcSize; - - CUmemGenericAllocationHandle handle; - // only one rank creates the multicast object - if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - - int fd, peerfd; - fd = 0; - peerfd = 0; - if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - // some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the - // exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - - pid_t currentPid = getpid(); - MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); - MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); - int pidFd = syscall(SYS_pidfd_open, currentPid, 0); - - // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); - // everyone else would now have same multicast object - int peerFd = 0; - peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); - if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, (void*)peerFd, handleType)); - MPI_Barrier(MPI_COMM_WORLD); - - // if(myrank) - // close(peerfd); - // else - close(fd); - // end of ugly UDS business - // everyone adds device(s), no syncs required, just need to ensure bindmem happens after all this is called - int mydev = myrank; - CUCHECK(cuMulticastAddDevice(handle, mydev)); - MPI_Barrier(MPI_COMM_WORLD); - - CUmemGenericAllocationHandle memhandle; - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = mydev; - prop.requestedHandleTypes = handleType; - - // allocate physical memory (data buffer) - CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); - - void* uc_va; - void* mc_va; - CUmemAccessDesc accessDesc = {}; - accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - accessDesc.location.id = mydev; - accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - - // Map a VA to UC space - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0)); - cudaMemset(uc_va, 0, size); - CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0)); - // 
set access on UC address - CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); - - - // everyone binds memory to the multicast - CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); - MPI_Barrier(MPI_COMM_WORLD); - // usual VA business: map both MC and PA to two different VA addresses - - // Map a VA to MC space - CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); - CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); - // set access on MC address - CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); - - int rept = 10; - int block_size = 1024; - int nblocks = 16; - - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - init_kernel<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - check_correctness<<>>((float*)uc_va, size/sizeof(float), myrank, nranks); - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - - for (int input_size = 1024; input_size <= size; input_size *= 2){ - // warmup - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - MPI_Barrier(MPI_COMM_WORLD); - double st = MPI_Wtime(); - for (int i = 0; i < rept; i++) { - testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); - } - cudaDeviceSynchronize(); - double en = MPI_Wtime(); - double time = (en - st) / rept; - if (!myrank) printf("input_size %d | Time = %f us, alg_bw = %f (GBps)\n", input_size, time*1e6, input_size / 1e9 / time); - } - MPI_Barrier(MPI_COMM_WORLD); - MPI_Finalize(); -} -//........ 
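The prototype deleted above and NvlsConnection::Impl hand the multicast handle across processes the same way: the root rank exports it to a POSIX file descriptor with cuMemExportToShareableHandle and publishes its pid and fd values, and every other rank duplicates that descriptor into its own process with pidfd_open/pidfd_getfd before calling cuMemImportFromShareableHandle. pidfd_getfd first appeared in Linux 5.6, which is why NVLS support in this series is guarded on LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0); with the fabric handle type added in CUDA 12.4 the descriptors could instead be exchanged with MPI_Allgather, as the comment in the deleted file notes. A minimal sketch of just the descriptor-duplication step, with rootPid and rootFd standing in for the values received from the root rank and error handling kept to a bare minimum:

    #include <sys/syscall.h>
    #include <unistd.h>

    // Duplicate the root rank's exported multicast fd into the calling process.
    // Requires Linux >= 5.6 for SYS_pidfd_getfd; returns -1 on failure.
    static int importMulticastFd(pid_t rootPid, int rootFd) {
      int pidFd = syscall(SYS_pidfd_open, rootPid, 0);           // handle to the root process
      if (pidFd < 0) return -1;
      int localFd = syscall(SYS_pidfd_getfd, pidFd, rootFd, 0);  // local duplicate of rootFd
      close(pidFd);
      return localFd;  // pass to cuMemImportFromShareableHandle with CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
    }

NvlsConnection::Impl's deserializing constructor performs exactly these two syscalls on the received rootPid_ and mcFileDesc_ before importing the handle.
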
diff --git a/nvls/test2.cpp b/nvls/test2.cpp deleted file mode 100644 index 400d566ae..000000000 --- a/nvls/test2.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - -#define CUCHECK(cmd) do { \ - auto err = cmd; \ - if( err != 0 ) { \ - printf("Cuda failure %d: Line %d", err, __LINE__); \ - } \ -} while(false) - -int main(){ - int myrank, nranks; - MPI_Init(NULL, NULL); - MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - MPI_Comm_size(MPI_COMM_WORLD, &nranks); - - cudaSetDevice(myrank); - CUresult res; - - -CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; - - CUmulticastObjectProp mcProp = {}; - mcProp.numDevices = nranks; - mcProp.size = size; - mcProp.handleTypes = handleType; - - size_t minGran, gran; - CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); - CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); - - size_t mcSize = ((size+gran-1)/gran)*gran; - mcProp.size = mcSize; - - //only one rank creates the multicast object - if(!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); - - int fd, peerfd; - if(!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); - - //some ugly UDS business - // Borrow ipcsocket.{c,h} from nccl code - //in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the exported handles - // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node - - volatile uint32_t abortFlag = 0; - struct ncclIpcSocket ipcSock = { 0 }; - uint64_t opId=0xdeadcafebeef; - ncclResult_t ret = ncclSuccess; - - NCCLCHECK(ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag)); - MPI_Barrier(MPI_COMM_WORLD); - if(!myrank) - for(int p=1;p= 900 -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), \ - "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w) \ - : "memory"); -//specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); -#endif - -//for allreduce we dont even need an UC pointer. 
just using same mc_ptr for in-place reduction -//line is assumed to be 16B 4 ints of 8 halves -const int start_elem = threadIdx.x + blockDim.x * (myrank + RANKS * blockIdx.x); -const int end_elem = max(start_elem, numlines); -__syncthreads(); - for (int line = start_elem; line < end_elem; line += loop_step0) { - uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)) - MULTIMEM_ST(val, mc_ptr + (lineoffset + line)) - } -__syncthreads(); - -*/ diff --git a/src/communicator.cc b/src/communicator.cc index d5c3e9ed4..e4710f272 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -110,26 +110,25 @@ MSCCLPP_API_CPP NonblockingFuture> Communicator::con MSCCLPP_API_CPP std::shared_ptr Communicator::connctNvlsCollective(std::vector allRanks, EndpointConfig config) { auto bootstrap = this->bootstrap(); - int myRank = bootstrap->getRank(); + int rank = bootstrap->getRank(); bool isRoot = false; bool amongAllRanks = false; - std::sort(allRanks.begin(), allRanks.end()); int rootRank = allRanks[0]; for (auto nvlsRank : allRanks) { - if (nvlsRank == myRank) amongAllRanks = true; + if (nvlsRank == rank) amongAllRanks = true; rootRank = std::min(rootRank, nvlsRank); } if (amongAllRanks == false) { - throw Error("my rank is not among allRanks", ErrorCode::InvalidUsage); + throw Error("rank is not among allRanks", ErrorCode::InvalidUsage); } - if (rootRank == myRank) isRoot = true; + if (rootRank == rank) isRoot = true; std::shared_ptr conn; if (isRoot) { conn = std::make_shared(config.nvlsBufferSize, allRanks.size()); auto serialized = conn->serialize(); for (auto nvlsRank : allRanks) { - if (nvlsRank != myRank) bootstrap->send(serialized, nvlsRank, 0); + if (nvlsRank != rank) bootstrap->send(serialized, nvlsRank, 0); } } else { std::vector data; From a9f0280589512e992aa0cc1949b4607910a98d75 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:03:00 +0000 Subject: [PATCH 59/67] HIP compatibility --- include/mscclpp/gpu.hpp | 15 +++++++++++++++ include/mscclpp/gpu_utils.hpp | 5 +++++ src/include/endpoint.hpp | 2 -- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index d3d48ce1f..2f73b4b3b 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -19,6 +19,9 @@ using cudaIpcMemHandle_t = hipIpcMemHandle_t; using CUresult = hipError_t; using CUdeviceptr = hipDeviceptr_t; +using CUmemGenericAllocationHandle = hipMemGenericAllocationHandle_t; +using CUmemAllocationProp = hipMemAllocationProp; +using CUmemAccessDesc = hipMemAccessDesc; constexpr auto cudaSuccess = hipSuccess; constexpr auto cudaStreamNonBlocking = hipStreamNonBlocking; @@ -32,6 +35,11 @@ constexpr auto cudaMemcpyHostToDevice = hipMemcpyHostToDevice; constexpr auto cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost; constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; +constexpr auto CU_MEM_ALLOCATION_TYPE_PINNED = hipMemAllocationTypePinned; +constexpr auto CU_MEM_LOCATION_TYPE_DEVICE = hipMemLocationTypeDevice; +constexpr auto CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = hipMemHandleTypePosixFileDescriptor; +constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite; + #ifndef CUDA_SUCCESS #define CUDA_SUCCESS hipSuccess #endif // CUDA_SUCCESS @@ -68,7 +76,14 @@ constexpr auto cudaIpcMemLazyEnablePeerAccess = hipIpcMemLazyEnablePeerAccess; #define cudaIpcCloseMemHandle(...) hipIpcCloseMemHandle(__VA_ARGS__) #define cuGetErrorString(...) 
hipDrvGetErrorString(__VA_ARGS__) +#define cuMemAddressReserve(...) hipMemAddressReserve(__VA_ARGS__) +#define cuMemAddressFree(...) hipMemAddressFree(__VA_ARGS__) #define cuMemGetAddressRange(...) hipMemGetAddressRange(__VA_ARGS__) +#define cuMemCreate(...) hipMemCreate(__VA_ARGS__) +#define cuMemRelease(...) hipMemRelease(__VA_ARGS__) +#define cuMemSetAccess(...) hipMemSetAccess(__VA_ARGS__) +#define cuMemMap(...) hipMemMap(__VA_ARGS__) +#define cuMemUnmap(...) hipMemUnmap(__VA_ARGS__) #else diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 909ccd821..3a96f9a45 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -90,7 +90,12 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.id = deviceId; +#if defined(__HIP_PLATFORM_AMD__) + // TODO: revisit when HIP fixes this typo in the field name + prop.requestedHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#else prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; +#endif CUmemGenericAllocationHandle memHandle; size_t bufferSize = sizeof(T) * nelem; diff --git a/src/include/endpoint.hpp b/src/include/endpoint.hpp index a773efb5e..311fa9982 100644 --- a/src/include/endpoint.hpp +++ b/src/include/endpoint.hpp @@ -4,8 +4,6 @@ #ifndef MSCCLPP_ENDPOINT_HPP_ #define MSCCLPP_ENDPOINT_HPP_ -#include - #include #include From f1ec27867210b8bfde020533e291811b934868a0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:30:36 +0000 Subject: [PATCH 60/67] minor updates --- include/mscclpp/gpu.hpp | 8 ++++++++ include/mscclpp/gpu_utils.hpp | 2 +- src/communicator.cc | 2 -- src/connection.cc | 4 ---- src/endpoint.cc | 3 --- src/nvls_connection.cc | 8 ++++---- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 2f73b4b3b..d46a9ac6b 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -92,4 +92,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #endif +// NVLS +#if !defined(__HIP_PLATFORM_AMD__) +#include +#define USE_NVLS ((CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))) +#else // !defined(__HIP_PLATFORM_AMD__) +#define USE_NVLS 0 +#endif // !defined(__HIP_PLATFORM_AMD__) + #endif // MSCCLPP_GPU_HPP_ diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 3a96f9a45..6ba6a545d 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -107,7 +107,7 @@ PhysicalCudaMemory* cudaPhysicalCalloc(size_t nelem, size_t gran) { accessDesc.location.id = deviceId; accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - T* devicePtr = NULL; + T* devicePtr = nullptr; // Map the device pointer MSCCLPP_CUTHROW(cuMemAddressReserve((CUdeviceptr*)&devicePtr, bufferSize, gran, 0U, 0)); MSCCLPP_CUTHROW(cuMemMap((CUdeviceptr)devicePtr, bufferSize, 0, memHandle, 0)); diff --git a/src/communicator.cc b/src/communicator.cc index e4710f272..d0fb07a23 100644 --- a/src/communicator.cc +++ b/src/communicator.cc @@ -3,8 +3,6 @@ #include "communicator.hpp" -#include - #include "api.h" #include "debug.h" diff --git a/src/connection.cc b/src/connection.cc index f89b96138..4d719f3b2 100644 --- a/src/connection.cc +++ b/src/connection.cc @@ -3,10 +3,6 @@ #include "connection.hpp" -#include -#include - -#include #include #include diff --git a/src/endpoint.cc b/src/endpoint.cc 
index f6e3dc09c..dbc773898 100644 --- a/src/endpoint.cc +++ b/src/endpoint.cc @@ -1,8 +1,5 @@ #include "endpoint.hpp" -#include -#include - #include #include "api.h" diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 08f31e36e..1ff96a427 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -1,6 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include + #include #include @@ -13,7 +13,7 @@ namespace mscclpp { -#if (CUDART_VERSION >= 12010) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)) +#if (USE_NVLS) class NvlsConnection::Impl : public std::enable_shared_from_this { public: // use this only for the root of the NVLS @@ -212,7 +212,7 @@ std::shared_ptr NvlsConnection::Impl::bindMemory(std::shared_ptr(mcPtr, deleter); } -#else +#else // !(USE_NVLS) class NvlsConnection::Impl { public: // use this only for the root of the NVLS @@ -232,7 +232,7 @@ class NvlsConnection::Impl { private: Error notSupportedError = Error("NVLS is not supported on this CUDA version", ErrorCode::InvalidUsage); }; -#endif +#endif // !(USE_NVLS) NvlsConnection::NvlsConnection(size_t bufferSize, int numDevices) : pimpl_(std::make_shared(bufferSize, numDevices)) {} From 032c00abb418df3378b58ff88b2a674d7575b8df Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 23 Jan 2024 23:50:03 +0000 Subject: [PATCH 61/67] minor update --- include/mscclpp/gpu_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 6ba6a545d..9be6a7d16 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -173,7 +173,7 @@ Memory safeAlloc(size_t nelem) { template Memory safeAlloc(size_t nelem, size_t gran) { - if (nelem * sizeof(T) % gran) { + if ((nelem * sizeof(T)) % gran) { throw Error("The request allocation size is not divisible by the required granularity:" + std::to_string(nelem * sizeof(T)) + " vs " + std::to_string(gran), ErrorCode::InvalidUsage); From fa0565fe6744f8be8434ce8fb833c18081a1d945 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 24 Jan 2024 07:14:10 +0000 Subject: [PATCH 62/67] add more tests --- python/mscclpp/__init__.py | 1 + python/mscclpp/comm.py | 19 ++++++++++++------- python/mscclpp/core_py.cpp | 1 + python/test/test_mscclpp.py | 21 ++++++++++++++++++++- src/nvls_connection.cc | 2 ++ 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/python/mscclpp/__init__.py b/python/mscclpp/__init__.py index 0e3618591..8f013e080 100644 --- a/python/mscclpp/__init__.py +++ b/python/mscclpp/__init__.py @@ -6,6 +6,7 @@ from ._mscclpp import ( Communicator, Connection, + EndpointConfig, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, diff --git a/python/mscclpp/comm.py b/python/mscclpp/comm.py index 3085cc3df..d84410668 100644 --- a/python/mscclpp/comm.py +++ b/python/mscclpp/comm.py @@ -8,6 +8,7 @@ from ._mscclpp import ( Communicator, Connection, + EndpointConfig, Host2DeviceSemaphore, Host2HostSemaphore, ProxyService, @@ -79,17 +80,21 @@ def my_ib_device(self, local_rank: int) -> Transport: assert False # only 8 IBs are supported def make_connection( - self, all_ranks: list[int], transports: Transport | dict[int, Transport] + self, + all_ranks: list[int], + endpoints: EndpointConfig | Transport | dict[int, EndpointConfig] | dict[int, Transport], ) -> dict[int, Connection]: - if transports == Transport.Nvls: - return self.communicator.connct_nvls_collective(all_ranks, transports) + if type(endpoints) is Transport: + endpoints = 
EndpointConfig(endpoints) + if endpoints.transport == Transport.Nvls: + return self.communicator.connct_nvls_collective(all_ranks, endpoints) connections = {} for rank in all_ranks: - if type(transports) is dict: - transport = transports[rank] + if type(endpoints) is dict: + endpoint = endpoints[rank] else: - transport = transports - connections[rank] = self.communicator.connect_on_setup(rank, 0, transport) + endpoint = endpoints + connections[rank] = self.communicator.connect_on_setup(rank, 0, endpoint) self.communicator.setup() connections = {rank: connections[rank].get() for rank in connections} return connections diff --git a/python/mscclpp/core_py.cpp b/python/mscclpp/core_py.cpp index 996cd3d99..5fd4bd317 100644 --- a/python/mscclpp/core_py.cpp +++ b/python/mscclpp/core_py.cpp @@ -152,6 +152,7 @@ void register_core(nb::module_& m) { nb::class_(m, "EndpointConfig") .def(nb::init<>()) .def(nb::init_implicit(), nb::arg("transport")) + .def(nb::init(), nb::arg("transport"), nb::arg("nvlsBufferSize")) .def_rw("transport", &EndpointConfig::transport) .def_rw("ib_max_cq_size", &EndpointConfig::ibMaxCqSize) .def_rw("ib_max_cq_poll_num", &EndpointConfig::ibMaxCqPollNum) diff --git a/python/test/test_mscclpp.py b/python/test/test_mscclpp.py index 62c2619ad..4b3cb6ebf 100644 --- a/python/test/test_mscclpp.py +++ b/python/test/test_mscclpp.py @@ -12,6 +12,7 @@ import pytest from mscclpp import ( + EndpointConfig, Fifo, Host2DeviceSemaphore, Host2HostSemaphore, @@ -278,6 +279,24 @@ def target_signal(sems, conns): group.barrier() +@parametrize_mpi_groups(8) +@pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") +def test_nvls_connection(mpi_group: MpiGroup): + if all_ranks_on_the_same_node(mpi_group) is False: + pytest.skip("cannot use nvls for cross node") + group = mscclpp_comm.CommGroup(mpi_group.comm) + all_ranks = list(range(group.nranks)) + endpoint = EndpointConfig(Transport.Nvls, 2**22) + nvls_connection = group.make_connection(all_ranks, endpoint) + mem_handle1 = nvls_connection.allocate_bind_memory(2**21) + mem_handle2 = nvls_connection.allocate_bind_memory(2**21) + with pytest.raises(Exception): + mem_handle3 = nvls_connection.allocate_bind_memory(2**21) + # the memory is freed on the destructor of mem_handle2 + mem_handle2 = None + mem_handle3 = nvls_connection.allocate_bind_memory(2**21) + + class MscclppKernel: def __init__( self, @@ -549,7 +568,7 @@ def test_simple_proxy_channel(mpi_group: MpiGroup, nelem: int, transport: str, u assert cp.array_equal(memory, memory_expected) -@parametrize_mpi_groups(8) +@parametrize_mpi_groups(4, 8) @pytest.mark.skipif(is_nvls_supported() is False, reason="NVLS is not supported") def test_nvls(mpi_group: MpiGroup): group, nvls_connection = create_group_and_connection(mpi_group, "NVLS") diff --git a/src/nvls_connection.cc b/src/nvls_connection.cc index 1ff96a427..78f3e52d5 100644 --- a/src/nvls_connection.cc +++ b/src/nvls_connection.cc @@ -142,6 +142,7 @@ size_t NvlsConnection::Impl::allocateBuffer(size_t size) { it->second -= size; } allocatedRanges_.emplace_back(offset, size); + INFO(MSCCLPP_COLL, "NVLS connection allocated %ld bytes at offset %ld", size, offset); return offset; } throw Error("This NVLS connection cannot map the requested devBuffSize", ErrorCode::InvalidUsage); @@ -152,6 +153,7 @@ void NvlsConnection::Impl::freeBuffer(size_t offset, size_t size) noexcept { allocatedRanges_.begin(), allocatedRanges_.end(), [offset, size](const std::pair& range) { return range.first == offset && range.second == size; }); 
if (it == allocatedRanges_.end()) { + WARN("NVLS connection tried to free a buffer that was not allocated"); return; } allocatedRanges_.erase(it); From 28fd377f1805017b2c5e6aaadc3b05c39e71f309 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 24 Jan 2024 09:11:44 +0000 Subject: [PATCH 63/67] move multimem instruction to source code --- include/mscclpp/gpu.hpp | 1 + include/mscclpp/nvls_device.hpp | 44 +++++++++++++++++++++++++++ python/mscclpp_benchmark/allreduce.cu | 14 ++------- python/test/nvls_test.cu | 15 ++------- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index d46a9ac6b..f560a655c 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -88,6 +88,7 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #else #include +#include #include #endif diff --git a/include/mscclpp/nvls_device.hpp b/include/mscclpp/nvls_device.hpp index 106420e58..52ade275d 100644 --- a/include/mscclpp/nvls_device.hpp +++ b/include/mscclpp/nvls_device.hpp @@ -4,13 +4,57 @@ #ifndef MSCCLPP_NVLS_DEVICE_HPP_ #define MSCCLPP_NVLS_DEVICE_HPP_ +#include +#include + +#include "device.hpp" + namespace mscclpp { +template +constexpr bool dependentFalse = false; // workaround before CWG2518/P2593R1 + /// Device-side handle for @ref Host2DeviceSemaphore. struct DeviceMulticastPointerDeviceHandle { void* devicePtr; void* mcPtr; size_t bufferSize; + +#if defined(MSCCLPP_DEVICE_COMPILE) + template + MSCCLPP_DEVICE_INLINE void multimemLoad(TVaule& val, T* ptr) { + static_assert(NElemPerThread == 4, "Only support NElemPerThread == 4"); + if constexpr (std::is_same::value) { + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } else if constexpr (std::is_same::value) { + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } else { + static_assert(dependentFalse, "Not supported type"); + } + }; + + template + MSCCLPP_DEVICE_INLINE void multimemStore(const TVaule& val, T* ptr) { + static_assert(NElemPerThread == 4, "Only support NElemPerThread == 4"); + if constexpr (std::is_same::value) { + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w) + : "memory"); + } else if constexpr (std::is_same::value) { + asm volatile("multimem.st.global.v4.f16x2 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), + "r"(val.w) + : "memory"); + } else { + static_assert(dependentFalse, "Not supported type"); + } + }; +#endif }; } // namespace mscclpp diff --git a/python/mscclpp_benchmark/allreduce.cu b/python/mscclpp_benchmark/allreduce.cu index 127521939..69aa3919f 100644 --- a/python/mscclpp_benchmark/allreduce.cu +++ b/python/mscclpp_benchmark/allreduce.cu @@ -783,16 +783,6 @@ extern "C" __global__ void __launch_bounds__(1024, 1) // ------------------------------------------- #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); extern "C" __global__ void __launch_bounds__(1024, 1) allreduce6(mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, @@ -822,8 +812,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); + nvlsPtrs.multimemLoad(val, mc_ptr + idx); + nvlsPtrs.multimemStore(val, mc_ptr + idx); } deviceSyncer.sync(gridDim.x); diff --git a/python/test/nvls_test.cu b/python/test/nvls_test.cu index 5001072ac..022b4d6ca 100644 --- a/python/test/nvls_test.cu +++ b/python/test/nvls_test.cu @@ -8,17 +8,6 @@ __device__ mscclpp::DeviceSyncer deviceSyncer; -#define MULTIMEM_ST(val, ptr) \ - asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ - "r"(val.w) \ - : "memory"); -// specific PTX for fp16 reduction. bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc -#define MULTIMEM_LD(val, ptr) \ - asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ - : "l"(ptr) \ - : "memory"); - extern "C" __global__ void __launch_bounds__(1024, 1) nvls_test(mscclpp::DeviceMulticastPointerDeviceHandle nvlsPtrs, mscclpp::SmDevice2DeviceSemaphoreDeviceHandle* semaphores, int my_rank, int nranks, int nbytes) { @@ -52,8 +41,8 @@ extern "C" __global__ void __launch_bounds__(1024, 1) for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { uint4 val; - MULTIMEM_LD(val, mc_ptr + idx); - MULTIMEM_ST(val, mc_ptr + idx); + nvlsPtrs.multimemLoad(val, mc_ptr + idx); + nvlsPtrs.multimemStore(val, mc_ptr + idx); } deviceSyncer.sync(gridDim.x); From 9bac9e884c798f081ade34eaab2a2af249c288ab Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 29 Jan 2024 10:03:51 +0000 Subject: [PATCH 64/67] restore file --- test/CMakeLists.txt | 1 + test/nvls_test.cu | 203 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 test/nvls_test.cu diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 087fdeb86..ef85cde5a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -23,6 +23,7 @@ endfunction() add_test_executable(allgather_test_cpp allgather_test_cpp.cu) add_test_executable(allgather_test_host_offloading allgather_test_host_offloading.cu) +add_test_executable(nvls_test nvls_test.cu) configure_file(run_mpi_test.sh.in run_mpi_test.sh) diff --git a/test/nvls_test.cu b/test/nvls_test.cu new file mode 100644 index 000000000..e01b4d790 --- /dev/null +++ b/test/nvls_test.cu @@ -0,0 +1,203 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define CUCHECK(cmd) \ + do { \ + auto err = cmd; \ + if (err != 0) { \ + printf("Cuda failure %d: Line %d", err, __LINE__); \ + exit(-1); \ + } \ + } while (false) + +// AR kernel snippet for sm_90 only + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 +#define MULTIMEM_ST(val, ptr) \ + asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};" ::"l"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), \ + "r"(val.w) \ + : "memory"); +// specific PTX for fp16 reduction. 
bf16 would be multimem.ld_reduce.global.add.v4.bf16x2 etc +#define MULTIMEM_LD(val, ptr) \ + asm("multimem.ld_reduce.global.add.v4.f32 {%0,%1,%2,%3}, [%4];" \ + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) \ + : "l"(ptr) \ + : "memory"); +#else +#define MULTIMEM_ST(val, ptr) +#define MULTIMEM_LD(val, ptr) +#endif + +__global__ void init_kernel(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) { + uc_ptr[idx] = myrank + idx; + } +} + +__global__ void check_correctness(float* uc_ptr, int size, int myrank, int nranks) { + for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < size; idx += blockDim.x * gridDim.x) { + float expected = (float)((nranks * (nranks - 1)) / 2 + nranks * idx); + if (abs(uc_ptr[idx] - expected) > 0.01 * expected) { + printf("error! idx %d: %f != %f\n", idx, uc_ptr[idx], expected); + } + } +} + +__global__ void testing(float* mc_ptr, int size, int myrank, int nranks) { + // for allreduce we dont even need an UC pointer. just using same mc_ptr for in-place reduction + // line is assumed to be 16B 4 ints of 8 halves + int my_st = ((int64_t)size * (int64_t)myrank) / (int64_t)nranks; + int my_en = ((int64_t)size * (int64_t)(myrank + 1)) / (int64_t)nranks; + + int my_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4; + int my_step = blockDim.x * gridDim.x * 4; + + for (int idx = my_st + my_offset; idx < my_en; idx += my_step) { + [[maybe_unused]] uint4 val; + MULTIMEM_LD(val, mc_ptr + idx); + MULTIMEM_ST(val, mc_ptr + idx); + } +} + +int main() { +#if (USE_NVLS) + int myrank, nranks; + MPI_Init(NULL, NULL); + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + + cudaSetDevice(myrank); + + size_t size = 1024 * 1024 * 512; + CUmemAllocationHandleType handleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + + CUmulticastObjectProp mcProp = {}; + mcProp.numDevices = nranks; + mcProp.size = size; + mcProp.handleTypes = handleType; + + size_t minGran, gran; + gran = 0; + minGran = 0; + CUCHECK(cuMulticastGetGranularity(&minGran, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM)); + CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED)); + + if (!myrank) printf("nvls multicast granularity: gran = %lu, minGrad = %lu\n", gran, minGran); + size_t mcSize = ((size + gran - 1) / gran) * gran; + mcProp.size = mcSize; + + CUmemGenericAllocationHandle handle; + // only one rank creates the multicast object + if (!myrank) CUCHECK(cuMulticastCreate(&handle, &mcProp)); + + int fd = 0; + if (!myrank) CUCHECK(cuMemExportToShareableHandle(&fd, handle, handleType, 0 /*flags*/)); + + // some ugly UDS business + // Borrow ipcsocket.{c,h} from nccl code + // in cuda 12.4 new fabric handle type is available so instead it would be possible to use MPI_Allgather for the + // exported handles + // moreover it would the only way to do it on GraceHopper systems, since UDS is limited to single Unix node + + pid_t currentPid = getpid(); + MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + MPI_Bcast(¤tPid, sizeof(currentPid), MPI_CHAR, 0, MPI_COMM_WORLD); + int pidFd = syscall(SYS_pidfd_open, currentPid, 0); + + // MPI_Bcast(&fd, sizeof(fd), MPI_CHAR, 0, MPI_COMM_WORLD); + // everyone else would now have same multicast object + int peerFd = 0; + peerFd = syscall(SYS_pidfd_getfd, pidFd, fd, 0); + if (myrank) CUCHECK(cuMemImportFromShareableHandle(&handle, reinterpret_cast(peerFd), handleType)); + MPI_Barrier(MPI_COMM_WORLD); + + 
close(fd); + // end of ugly UDS business + // everyone adds device(s), no syncs required, just need to ensure bindmem happens after all this is called + int mydev = myrank; + CUCHECK(cuMulticastAddDevice(handle, mydev)); + MPI_Barrier(MPI_COMM_WORLD); + + CUmemGenericAllocationHandle memhandle; + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = mydev; + prop.requestedHandleTypes = handleType; + + // allocate physical memory (data buffer) + CUCHECK(cuMemCreate(&memhandle, size, &prop, 0 /*flags*/)); + + void* uc_va; + void* mc_va; + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = mydev; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + // Map a VA to UC space + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&uc_va, size, minGran, 0U, 0)); + cudaMemset(uc_va, 0, size); + CUCHECK(cuMemMap((CUdeviceptr)uc_va, size, 0, memhandle, 0)); + // set access on UC address + CUCHECK(cuMemSetAccess((CUdeviceptr)uc_va, size, &accessDesc, 1)); + + // everyone binds memory to the multicast + CUCHECK(cuMulticastBindMem(handle, 0 /*mcOffset*/, memhandle, 0 /*memOffset*/, size, 0)); + MPI_Barrier(MPI_COMM_WORLD); + // usual VA business: map both MC and PA to two different VA addresses + + // Map a VA to MC space + CUCHECK(cuMemAddressReserve((CUdeviceptr*)&mc_va, mcSize, minGran, 0U, 0)); + CUCHECK(cuMemMap((CUdeviceptr)mc_va, mcSize, 0, handle, 0)); + // set access on MC address + CUCHECK(cuMemSetAccess((CUdeviceptr)mc_va, mcSize, &accessDesc, 1)); + + int rept = 10; + int block_size = 1024; + int nblocks = 16; + + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + init_kernel<<>>((float*)uc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + testing<<>>((float*)mc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + check_correctness<<>>((float*)uc_va, size / sizeof(float), myrank, nranks); + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + + for (size_t input_size = 1024; input_size <= size; input_size *= 2) { + // warmup + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + MPI_Barrier(MPI_COMM_WORLD); + double st = MPI_Wtime(); + for (int i = 0; i < rept; i++) { + testing<<>>((float*)mc_va, input_size / sizeof(float), myrank, nranks); + } + cudaDeviceSynchronize(); + double en = MPI_Wtime(); + double time = (en - st) / rept; + if (!myrank) + printf("input_size %ld | Time = %f us, alg_bw = %f (GBps)\n", input_size, time * 1e6, input_size / 1e9 / time); + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif // (USE_NVLS) +} From 7291380ce815e3a19a9c28286cf42c29967593f2 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Mon, 29 Jan 2024 11:28:33 +0000 Subject: [PATCH 65/67] address comments --- python/mscclpp_benchmark/allreduce_bench.py | 57 ++++++++++++--------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index c141a9e6b..47d622c2e 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -128,6 +128,21 @@ def bench_time(niter: int, func): return cp.cuda.get_elapsed_time(start, end) / niter * 1000.0 +def find_best_algo(mscclpp_algos, niter): + assert len(mscclpp_algos) > 
0 + best_time = 10000000.0 + best_algo = None + for algo in mscclpp_algos: + config, cur_time = find_best_config(algo, niter) + if cur_time < best_time: + best_time = cur_time + best_algo = algo + algo.set_params(*config) + if MPI.COMM_WORLD.rank == 0: + print(best_algo, end="", flush=True) + return best_algo + + def find_best_config(mscclpp_call, niter): best_time = 10000000.0 for config in mscclpp_call.auto_tune(): @@ -140,7 +155,7 @@ def find_best_config(mscclpp_call, niter): best_config = MPI.COMM_WORLD.bcast(best_config, root=0) if MPI.COMM_WORLD.rank == 0: print(best_config, end="", flush=True) - return best_config + return best_config, best_time def run_benchmark( @@ -152,30 +167,27 @@ def run_benchmark( proxy_service = None if MPI.COMM_WORLD.size // N_GPUS_PER_NODE == 1: + proxy_service = ProxyService() if memory.nbytes < 2**20: - mscclpp_call = MscclppAllReduce2(mscclpp_group, memory, memory_out) - elif memory.nbytes < 2**21 if is_nvls_supported() else memory.nbytes < 2**29: - mscclpp_call = MscclppAllReduce1(mscclpp_group, memory) + mscclpp_algos = [MscclppAllReduce2(mscclpp_group, memory, memory_out)] else: + mscclpp_algos = [ + MscclppAllReduce1(mscclpp_group, memory), + MscclppAllReduce3(mscclpp_group, memory, proxy_service), + ] if is_nvls_supported(): - mscclpp_call = MscclppAllReduce6(mscclpp_group, nelem, data_type) - memory = mscclpp_call.get_memory() - else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce3(mscclpp_group, memory, proxy_service) - proxy_service.start_proxy() + mscclpp_algos.append(MscclppAllReduce6(mscclpp_group, nelem, data_type)) else: if memory.nbytes < 2**22: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service) - proxy_service.start_proxy() + mscclpp_algos = [MscclppAllReduce5(mscclpp_group, memory, memory_out, N_GPUS_PER_NODE, proxy_service)] else: - proxy_service = ProxyService() - mscclpp_call = MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service) - proxy_service.start_proxy() + mscclpp_algos = [MscclppAllReduce4(mscclpp_group, memory, N_GPUS_PER_NODE, proxy_service)] - best_config = find_best_config(mscclpp_call, 20) - mscclpp_call.set_params(*best_config) + proxy_service.start_proxy() + MPI.COMM_WORLD.barrier() + mscclpp_call = find_best_algo(mscclpp_algos, 20) + if isinstance(mscclpp_call, MscclppAllReduce6): + memory = mscclpp_call.get_memory() nccl_call = NcclAllReduce(nccl_op, memory) @@ -188,13 +200,8 @@ def run_benchmark( nccl_algBw = memory_nbytes / nccl_time / 1e3 nccl_check = "PASS" if check_correctness(memory, nccl_call) else "FAIL" - if ( - isinstance(mscclpp_call, MscclppAllReduce3) - or isinstance(mscclpp_call, MscclppAllReduce5) - or isinstance(mscclpp_call, MscclppAllReduce4) - ): - MPI.COMM_WORLD.barrier() - proxy_service.stop_proxy() + MPI.COMM_WORLD.barrier() + proxy_service.stop_proxy() speed_up = nccl_time / mscclpp_time if MPI.COMM_WORLD.rank == 0: From 2acddec015a7666123cd81e44f9b8e7c3ea68659 Mon Sep 17 00:00:00 2001 From: Binyang Li Date: Wed, 31 Jan 2024 03:11:54 +0000 Subject: [PATCH 66/67] add comment --- mscclpp_vs_nccl_comparison_num_nodes_1.jpeg | Bin 0 -> 60467 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 mscclpp_vs_nccl_comparison_num_nodes_1.jpeg diff --git a/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..119e0eef4ee733480cd99cf5df49bd19b3c29890 GIT binary 
patch literal 60467 [binary data for mscclpp_vs_nccl_comparison_num_nodes_1.jpeg (mscclpp vs NCCL comparison plot, 1 node) omitted]
Date: Fri, 2 Feb 2024 05:16:10 +0000 Subject: [PATCH 67/67] remove useless file --- include/mscclpp/core.hpp | 1 + mscclpp_vs_nccl_comparison_num_nodes_1.jpeg | Bin 60467 -> 0 bytes 2 files changed, 1 insertion(+) delete mode 100644 mscclpp_vs_nccl_comparison_num_nodes_1.jpeg diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index af969108c..02c277a3e 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -493,6 +493,7 @@ struct EndpointConfig { static const int DefaultMaxCqPollNum = 1; static const int DefaultMaxSendWr = 8192; static const int DefaultMaxWrPerSend = 64; + // the recommended buffer size for NVLS, returned by cuMulticastGetGranularity static const int DefaultNvlsBufferSize = (1 << 29); Transport transport; diff --git a/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg b/mscclpp_vs_nccl_comparison_num_nodes_1.jpeg deleted file mode 100644 index 119e0eef4ee733480cd99cf5df49bd19b3c29890..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 60467 [binary JPEG data for the deleted mscclpp_vs_nccl_comparison_num_nodes_1.jpeg omitted]
zXc-fBZ?PX3?u$I7#}K%#YR@`AwEW75PX0Rx`iEgEurrP-b?#3fu1>Rpe9z!{93kN0 zPJPUcJi|w}5pm-{*O%G!Ckr$55bR-Wqcb@v>N6>UjTtP#oTlq@b9)zeu^3saS986( z$P4TQO<5E?J3>(~jGz>}O_-G2S$1q8`nLtQ_M{vN8Cd38jpajwYd4^2@k zEUvT986sLB6R1|Gvm(xhR^=Ssxt|pKPnPaqvR2wXf7k@TkoPAc4;x})>@P4Zzvn3a z%4;=}b?=*aJyS0#U6}CE5c?<48>3tG^t{W`?n(i%oN$YdnLAt1msR68SJs8zEDU_* zcK$N4WjFlf zhZO$1(WnuvM`Gw|an=T2U=ry~v6ZJyk9lJc?MB!h?}w>9KON}2>kc)`RX3}}(iC^z z;+S~VrBQSEJ&-uEIcC4ho(%F*t1+EOugAV+ce`RhTUw^qm*4|8=?D9XA8u-6?W+y8in^<|2l{6eqcB=>;iOqYkug(i2nJSn$y^E7ilF~Ysp|~Gj+J1mDpUDh4 zp?@1ERX?=YJQQ8yD~|p4ACJg-lQ4Lu*Ehd)$j|+_a5c=Yma?}*J>?-#46k(u8hi zjeq;z-QJaG4b ze+Iz6|AHF-GNeJfcb*wry_aFoxR5uDE3}<9 zikAB5Xr%)DrKqoz(LiFeCg{PBSa`TdWitkg;?pkJP@(Q&iy>QlWWJ#ct-ju+ga=9R z%ygNlgoKg#TzTF8ux@YnMdl)l6H1qHa#)I^^7f z<&V5|Z!sJ6Y2=391`N{&{OG?P8}WYvM*cSejZQY7cp7oq*-T8dKp$tq-ZXyd=6dUU zerx3pKD0Tfd4_xxhT|6$1fT^v($&(1T#&I5`O;cfk91tS6(7S>J^&xdlWhFUM^SM2 z-ifb;o%>_(JqF8>BNhHmo%j2Mm+O3h8*?YWlk@TBRD#et?3lypE0`V@{So&M7`7hw+iOzhhnO3jI=3&#jkW4IfC` zt?L;c?%EsO;cdXh zoR|N3HcGS$%=b0AJ$QNMxA=1?y9SRW*pLU%vNeJ8@vq%N8nN@c2L=41=jF!|HTxtB z_gwZ5WPr}1&Fh(kfc&X?Bg)7%@wo1$nC_gUz?cB%X#Pi3&`U4$kE@0FTO}JsdaX4#uF&l4u<9v(F)?)2=6mvwh@xmE>Xy0lWk_? zE6U{g+Gw+H!)(1dDFH|y7SoefTHgvt^N%6C0{8OD;*LkUd3ZUn zDzL3=sXBi!G~2gPhUjqH<(a^Cmw4>fnG*JAwoS)l7F6^~{0JP4Z@d4R!SV~);z(e)nG^xhU2dTBHs12L2gd_iLypD_^s zYe7@`bBhX)ivA>82LTZG|IBy-9j6*`>w$vAY)7GdFjVVfd%&ZU?g;t&}doqM}RC@nX z$|X#GgPt!Tkv}nOZAeDQ%U((-f1|bxO`_|E2CsE=El3(8%DAFU}H9nWI^Emw0oJGK+?@<<*zpT zE~i4`nd6#&QD&yL6es7s(xWbO4D##3>h(t<>WBgG$KjNBklOyvZ^{f z`s+oQJ0CD_K(pNB(XCnD6GU*J@M9dG;WaltEfxNq&f>E=R2r1J|GyBZsG$A3!5T)) zpA6Yp9~^%)7%dZaE)o7X^l4a6R!ROVyv^1J)e{GKYQh82N;T1vx>>VgVBV1@?2!uX z?Ic4X;KZE==xMrYdOa@Z)ICi?MPcnne7nB7F-K|M5Gpi>%w959h1k73)!u%)Q#&|2 zn|Hfe%nRzMo8e9(lzJ9Bfx7&|$*O`XB!a4sgm4x9fcgHuOzSy$sYVM|lwCl=HD@%{ zF-%>pDFdHJ8gl9O-4y1-7N3i8f~cL^i`7z*#~ah^uMcWX%n~v`m}_5t;FiQe8NIX| z?6OFzWW3Oc0Y0^Bu5PF=`pVolA))soyj#x>_OyvaqkS={*JS?iLJ|*RWJaQ}=Q@#I zD0SgGxD_^?_QM9l_pB@1H86bQJ)bJu{=)02?oza>V6hYaBUuftHx)xVgDXSkRQ0i! 
z8YMj`B0G)Y;eCfWWiY39#>b<)v|P#$<~Hfb-+)>?dihyh7T$9(yGhSqvY%`zoo|0N zwTErf)&x-`B+_%O5FS@&E-50rGT_0rDpEmQX@F8>DE$Bq+ z&M9^o$g%9tD%{jv^a<#^mJx;N?5evr!_LLn1WTfuPBFw6(uPo*7a6j@$G^(K0lYgpDkUo^LZfYUPyGH zbx4}45MHDzOLkc3CpTU&bw=L;BOGBpe_`YsGpqTL^59WuZB@kL=X@va(eU=eJTeZL zst_DDwu4!%aXL4t#T;e3C4H;fBrVIB*rUjD$%Nf%3N|KMMRg^6EQ#& zYE4#YQ{jPD(yOwoggND}vtK@i$CDX2d8wBIH#d$sn&pqmUyk)MOFKXG0K35|3H}%iVc43+#S(+y1s#=V=;JNNbwX;t*>E2| zF08Ke+&E?c9*J7#WV zK#?Efjo?D}fm5(TT8@6q+B?!RgMWg2lJeym$?wWQ&{m@Q^Hx3d7-I9jgkNBO>mTDY~@ z3elYtJ<3EFT+bvmBb>je(P?Eg;52;cUEQXZZ+l@gq!8}J#s)Mv% zdnw~QpVCyFdX@NeVZfFo-8vM8iG_}NdofMF;3Ja(k39wMX7tD(TjYWLk693F z*E4eSM4E_i71%J@^I^ZKtGxTLyQ(x;c!C?Vkd`!NFqqAsEf{jw_gW^|`;51@3=vc= z1e3aT$dw|a@WJtP6I#uTSNVNCEYUa`x#b*59tT+Y4 zYlJ{}olZp^rV01;gHov!ppn+$0I9U`B0v3K#_XL{Mwqq4!5r0b&paA9_R|iQ4#+$@ znQ%$jmF^v`UctPI7jNnne}amnYaH11pC4U}Wsvw?Z{=@i!9-(>Y;^aqm>EYYI~EpX z2e6CZT=Nv{aGRBmJzEHevz`eeTllvY4dke1cXUu}iu1m6cA|c?@pb0N#!KgJ%E}0z zCq3E~Ni_VH8Qq8tfC5pT>Y!4@mkSf?@cc@l6^r{5#4pV&a%>C`K)rABXA;#f>`+uf zr!9TZ*tpc)#k=DA^@;2P+;X;bueA-i7~?YOBhL}D9VkfrY~Q&H_-{sQXX+HA0%_0G zJ9L?k>3^V;uzm#yA6JyJBj19CM4)el$A^=C|I)*9IYx_Lnq_pWL|gDtaiy7y?d%MS zDRNKO0z=0#rfY!TgcZ!R(zh;1?iGwbX#z>;TU8ycR+O8dq&<}pU(v^(kO=UfNOOyBu1=5;GpWpo~$HAJESRN(i3XG^C3IDRGcquzB%0@g19m;P4Yf%BmK06B*t~S)8G)ol|`ui1^{_c}SCG>GFms>jVfEAA~ zqs*S&C|ANr)O2-43G%G1>1UBX&n7tAuT0^UCHeYV8`;7|{k!uAPNV~M{u^iN*uaUf zsdvbXjraU|iW~h zu9N6sT|yteb5aE0xp!9<-u8zSzj*}`>u>}9*_lCvx~Qh$vDo{!qGtIN>j#b}EhxPD zUm^hTW8Vh-Xv|w`;7{iVi5(~>3GkCbjCZjg{$f7&D_goCwJpGrZ=$^kL0mbT4`2xn zU`Ym2Ro@{5u>bWyX`g8En79%lOMO!<#VNi3cZCqh?Q8-5P04v(rKvzicopQ?PzJeT-kQ_J zsXYQb-xG=Hap(?z2-dH;ehS3oh zRcsEPEki^h9k(QTAv7d5)6Q{Rr_S@kzP#K*hYV8eA`XJ&hJ>_RqA?@kT6z0%KBs~x$aZVqcFEsEkgz6V?MAJ)fAQ@)0MgZ!x7 zDWEX{*MGoVc7NRN05oG^PS`wmVLlgr2>Es#s#G_ckT^PJCyclK4cq0C8%eQbvZ?vl z>*e6U#>O4IR356p&k_Rq->Ks+uB&Jg^zUfbC900(rG_ijXsUsWXZTatR;gxZ4|vOi z=4bIVW2JFuI@H-{f`an%ySAwK2t)gQkqN_t&_;9DG(tK#@h2v8j^n zFCAmjnQ<0h#v*}%{<4SFMQm*({-M%feRUmUdff{kX0^8DJh9mIS>2JxK+JwKJST-& zt={CWEN{Q# zYjU>)-GXt0f8?n;4xrko0o4XRHXhN#=W)!UXq3b7eoCw!azQ=z?v7bL0e|iBF{*M7w4DG2?zb}4ss;XVzIu37-d1GuCFAA_UWi{+w*g333PfA~ zw|X;%;#T?H5}?t4s8s2UUFqz)`kdfEditnyH`Gtvw6&#^f81lEmidd-S#H0kDMReY zAD6aQ(q?U2df$ez49tICs6S?D2+(uzUR|tS2rvzms7T1;PEK0>;tE_P!Dg-EZf{Uj z$Gq)kuce9LpRzPc2U-NATnZpnNdX7L2OJF2Ex{&ZTz|r)UguV^iYSa12?dhpr_ukFFr) z>)jGeX@UQR*0gLjD`ku7O{pJbjsW~*yXPN%lKhQA?~6AmJmTsmr5h-2obHY?)g`9^ zDebp{TwN9=>jZJJZ|m@OLSt7$g*lIUse;SK)I>+i=?*3&%mxa2Z?s5c6wx(%ydJsv z2RWGmT0~aQZ5TQ(T91=K&rFl@9$s0l1TMWj{*b2EGhyisv9vvGAL`*Fuv)VuxAu(( zB1^MsU*ASOwRt9t-5Gc=w5OO?%a}SL!B*H}TW=3$x@eE%1Xq0S5#(It6Oyo=hqG^R z>V|Q7zPkVEwIeHS8~61x?;RKUtcWcszN@C}rLEDYb84vkE-}+v)ny~J~ zqc`S~8O9jFQ`T_xpP)d<9Ww}tGi_Z}Y3VDAe!{VA(HW-)K2-TsKYlFUtOFNRRCtff z@VGjK(doX;^nHel^<9Z$;hq@W-ZyU>vedsdb8oM`oOPhR()&gwC5Ziam3F}2e`ss= z=oBm4Bt90|aQ#)&iL7ReyM47%Srj~GIQ7Ou^5G@JIE&d3GsPBtBLWsw4)~$FV0rK|p|&SpGrr=)X~-FEd1LHhlk{2wgyvOXKk zEH8=xaO3Udh|XADZ%vTTB@JXxAuyrf|bLLL7A1 z@!NK{f9b3GpAfHH`%B>e-w0poZBCC5DuVK;oV^;Op0*QO9yvSoi0D`eNQZlKfDn<6as2I)xukIuU9fVNEEBmkR3D;E9Syx+DFn#9lr1ECFt zt3z@v7vb8L+c?op4zkd1ubU{$K3muMoQP-xJ3VmW;?8c?xne2GZSBbuoXEbaS0Q{p zh~ii48_@G{_&ZOqniG63NY@*9>_H8HxT%d$x*R?QM)L7NkWCoCu%2;d#7%+ADkMht zI*p;}gxz`hC&!G*Y4<3JkNXICO}W=-VDLApy*y&DLAat}G<9l>VfZ+LaNklpY*AT7 zTj}Pa^4%z+rRz0bI~oFr$1&THJzfj0Aq;lqL-KDDC8y=xZ#u9kvNXwsH0B)`@;-}5 z-pz3vhd~>~7R+e@(p1{|{-$kO!xw>_@iU(V(^rzf9v^#={gfeHZ@0N7+lvhJ!Fz#; z>88oKmlCfA%05nJ8XuqVzGL&nz)s!}xSOw%7k??flttmK1Q5?`Mu04V)N4vcT^)Hl zMATm*9$T}ae*nVNrq%u7#Zp)h60%B+-3*=-AmY!;>ZR$fU8DT+7=9)){jLrTJ+b+G z#J1U$Jz_XtUyfiaMggzj1gb?fvzE=RSdwAt0JI?11TQdMNI!MIy@5Sap!Cu>=UX1z 
za32bt_&qH7pI-k>Ceh|Rdq!vhcgT!H*eu$~l_WY!EHc>b2%_C_>VmVbk{S0*!u_0_&@u43E{c`4k8irhe?ndRAh@C zn-ap_!g4NgNmobl^6s)0$jEu`oHwLwJlV?E)r7V{7ncWNy@0F*yN$X^pvl&8oUid9 zvDp!Ctt?<(8fy~hXC9eQM|Y*1X3NMeCRYg=LVm$3Q(FWJ+sLoG;q*J7c;s!KFnRiy z=K=)RBI;~Gx)Q{qp{4%$XJe^z<=k_Q`6A6Rbt73>A$QkowrWBhHK(!9>;oDQw^xr& z+6+T17wt<17_>Qv5d-KY`;+%9L{#oH-9~gZdcHD}*g`6~auCDPw!{UAd z5n*t=FYjv1YTQ)D*2uKh`i3i~85`Z(^Qoig@qnY#+Jy#!W!p?(>w)CMH^4&35Ir1B zY(>8IqbK?OLKb1-ZTqxbQc|*xX25&Z`wc$BjG|lpH2)P}G*vb~kd=`9Hq}5ex)*{S zSm`^sFR(rhZ7k2svX-`clT2h0tgEmq896*Y#8#C$V9(N4RW`?QvEJ`e0>VfHEU$6_I>RZIj72;0(^Ov36H*3xJ|zS5G>Kf=J-0JEO9JMzU{`wg z9FNMBNyPH_u6{snw*L5#wHaEw3$#+ZziJ}LOV#9HB*vWV+-mRhAmgKtLb4@F#52^Y zyt7;X0+(ppKORUlKvEtY&6c51Ao z869#77J=}YSi1gZ`d@KkzkTiRA%$;MCtm446uB0u7`aRthv;eff;2O6X$sqHmbSGj zk2+@y6?yblb^pb8dy@Bob8}vSCt2YVvU&rJy$+^lTz4XGyaH+VPILEbC}V@Jfj)>%^f1Q2m8K5=4Z-o zxoTd)rp0>%J`A;z88BlvJq3UL1lkber*rqQ2=Od3yl=Q#f$w&T?6HR|cxAFhsn+E+ z-&f&5J63rYL^kQscbA`cq>Y60`nC5w;{Q+HyWHMpdQ@5-wXz@-O!&Dr7=|W~83w&O zSBq!K;ctIYK&I8&iTi%i_&%`3PXRQaG%;)?^CcsWq%JY8Lg#Dl40aM6#3q&qQu`ka9wgK+b{LCeEPv9kR zNop4wQ3aq7A-Omv#?@*vCeYJ@0;=@U9=UMKFdn3c1Mx>H>|CW&#m=)tOQB&z zg?=K&se>f_lOYI^P)~{Lfrb>+r=Or-zo39#C;ETJ#Kekhm$*mA zv14J?-zXkyndmV)$Rz>wb$KXG*}$Qy@dJ>TDfO`#9xQnnR7*ZIygBzVs z#Rz%l?1^}7cjTUK+&z%Q;eBJAGb9Ps6v>iNPmkZDXd?b-952S0_U=ybytp}ktS@X+ zgC-B2w&kIpHpFq~axE20Kzsl~#r$b%0#94u(-a^w&-u#^i9gd3|4#b8Y<^_Fdw5dr z@+op>hbb+AXs+B@R^Nf)mOSaRo+%1J4TX0`=VjFrSwGy}{2cL?(iRic@j?V@C?7UP zOgJra?ZouB=9%fs*&7;2&JR^S)0~mEIG#1DHeU1;^m#pz&4JYnpAs=2E>W&0t+_B$ z{kYdPs0!Jo~^WWKns)?u%@Yhj@xuL`!egG-- zojDV7&zww!wCj$)$Q0IGlE)4iw(5FL*q-Ls1xX3{0h)ol(v0%K{E-6=B^%-GMy~t$ z_)kDOZ&vxL-`;drYMxlD8ciQy+#-j)1W84sbwx8#J*fV6?V7gA3u<<;t*0urLb)5~ zapeg_5}>H#XES0n`!&wq8m*i|;Cr=?Xo9Une`KHgc-Ow{&1=b!OmDF)cpO_EIaQYH z{RlUR5W^2m;f7uT)o#v1Cr1@vRe}YA_4pjGT(Nr;-PWn$C$ z4R0P91P6guGjiSuHYXyd@r9cuE?&IhhP{V-bdHPrPBN=QKZ)k`-Q`xlR0-#-#OG>5 zOBV_VpG@6;et#^M&6tg~l)?g4g$?d-pk++)P~#Sgx1`JU##53RGPjH8H{Be6t+<=N zqKoj!;*VrIXfeML-uoDaYoan8eC@+f%z{~RG^{$?Bs#R%$)*3m&|;Co#2WSmU?_ne zuFX}h?mVg*?%hgz*`ud5FL@gJog11Qi`cy~G@QcaLt2Q3ggFP1f>F`~08^6?C{PQG zH3`^}I4(`Rtt9s!_9Y_4vOiJUZ%be zA#%vPPDli35@WX3GUKQq#^6xdxTj1iIo}fd-N@3`&IOLuGA7}ORx4&4Eeq?HeA>_+@^_xbTjJU;*U)OP$Vm+LrUccXkpj zYN3a#gx|XNjuTJdhe3)gKS8|H$)Ax>Mh9HkV=*_zf-iL+l^Q3`euBQPAgNCSvevof zL#30ZjjIE{-4~-`DAL0TPdG&%94NKZ@4(0`2?v~Zj@vcqKez!QK)FnF~p2az)%>dUv?2yH>W_9(Ri{txfUn3o1$w_sRL) z7UAlw`UL0z*O2woeB@3d0)rqA0qoSMb><%SdWd*3FNpB#HW6Zz%U{zMBN!&cIagGG zeMIotC-Mjt>bMks5JY_M*6-Y%Z0wm_9Gsbv-f80X7uocmI8u7u6ELFj@T~0od2UJ- z_UQ{nEHgK;0@ZW6nyJK4ApAHHBJ+&+=AurmVxhO?yV8WD!?2eYl!jc?KS6{)LE{ZO*+>3%CfS86LT__( z1SNrRQ;q$5w}+O6T-q>>0OoLAT&kq;GAwCDvy zi~b|I=GV4+l$Skw<*J8<)vT+cG3&gCt&*kR%o*mH4L8jTMMax?ZGXkIQ2;dn znwy|--4~t9Rt=iAqD(Z8+DpCw!?^y*J#w)64u6VR3bRQ{7a3h?L2+*=#_A8;Tk%wf zt^>QAXJup9SAez+>4})?z##<4N5iLg^-(M`O2`F(;kGDTm-`p;@eM#eb^_$1U~5;p zbi+&w02hN}{e*iDxq|0=?*78PMfZkx*1J0uz;25r`0BbrH;3OT#qWB4bevG{$;8#ZzW`==@U+U`O zK*xD<7ke=4l|8+<-YYVM&>|_Z-+2IVD!)he0ZKT+f!+4Ele{9}Rr2+5zG2X}n=K2z zDHvUWEu;A7Kf?MbMJJO!O9=`6j~m)zZxS=Q{pGCbZVLf@1aE&iOBAbs%`XiJ-ztWK z=JzWtGq}<()3xXTR%1HOtV8eOatk$|wW>0LO?NB&{`UtqS8RYv8o>jC&+cLGa37F> zea_vcFZcc`0%qJ2bePBWo6rDK!=rX}|CjayxdwLIdv(7U-vdzZREih4OJRa9{I5&o+24PZ9jsC^aZ0jy95tf&B@#@?WiS7RX-dUu_|$f&zz3%;x} zLCegWk@5k6E3@#@p33Dwr0&A~YQQBeZn}FEnnP|v*~$}6p0nbEa}E`vA1$k3kBWcX znI0m4@6fAyMDfXSPaVNGfIlg1q$@hXK_@)0(af^6N_{Yxm2WUR7g=BYO@5F3n_W!V zRFJhZS$<34b5gig#Np9OorKsx?xEz%rf(jKRwRen9VtBK>o^JR2_c6e5IGP1sN0Mev5*9 zAe*a?!fHYl-%WkRjK}p`@bB+to_-5(D-wqvYNp3`4+6-oEWSevur!OJECQp51T>00 zgt&zexE`7OY&eBz(2C8+j~-x&FS27lR&y 
z%>T&ki=#WbsQ2{$la@f|#{WgqGQ-u}GJArs;LyvOxKQ6;Kd9Jw$!%=$d?XS#ran;e zNp2U9XrF~$$TtOb8m z6$4t2P7Hr-J;F`@bK^;_%o{`O{0$C8K<}qQ*8A~W%x3m9fW9d({wV=$`3 zfipSPS`_OSBm7odn2L5PTsSFg!-GzPC4bu-u2Xy=h(GY<>vUXZCo?CSG#S@QYVmG= zadk~4aZd(#I%L+;$Xo5*i&=(uo*+V4nzVfq%*I?#ZF7t%Z((1kOR8Qsx%HL4C>JH^ zV!<25r>x6m!S+Nps?$&zUKU5b9iO?dl~)XH`;7xQom-*djzgciRfh7}a*PHfWlcH5 zIfbkBWfFCUA)f+?%>&sTIur?`O+0NpQctK4Vtx$g6)f(tYE7>BWxY1P!Q~vvn~n)v zzPE9^x6eg^r}gf6)Kv0~gn}JzIvQ`M2Wrk#VfaKMK_6uKe6zq`b|*~Al?d)n8NKY8 zSy^7CVWQIWc?v=kp{22L9~+fcxMK^BykjeTd>5FpQ}|b)&7Z&ar<=IH8q2x9_Uq&N ze|pUM-~ZmXN?_vtpxgN7FXC)fp`FO_gdmAa8K zj@?n8)N#56B*Lb=eOhpP^$^kK*DXXwXvUk<(fPW}d@I@!SV%&YyHSxqGqk+B;>8k7 zcsgW%bN&7fG5Ho$x|d8iGEHTz+KwInnQxNsJG7yMrG%yPK{j*ZPQk&)g+&2+Vnd0w z@aZonPYMP;61xpdjXv}ZOv&sYz}T?_#?T2Jm;rC;mGA%lmHpR|qJI}iTG(lrzjoo? zJ4xIL*EDq=H<)gqb(A{mc8ClB)`CI3e}dL{fFUw-FMoo93}vUy4_I?L;12b=yg(B# zM)ZPa!;=7g0KO|d6FL6WLF!&C)AkAbLP zj&wKRZxYN(D6Lec(R(lfoKKCVwNKqeKtow7Rk3#0x;4)L^ zVXk4$PwKc$H{^$np%1j~A{7w5y=NZCW&!8PWQ*I6%RK>67Cq6 znLYx-3sd-Jo{^GC)?&n1T$sgMV*=h(>dE2+%sl~hK6qn~M~&CnOp*?IZ; z)U+v|f2!${Y0IYE2zqQg1`VxBHgfQw7<|ytj@5s_8`LQ}Jsx6XoIugX$jg&~p=MSa z=-A7%f55fTWvZBR-*F`hz((q6aj>+sU(IQ_w4Xpz);EDsM_{>+|D7I-`dfe3Ygfu5 zqoqbEhjcw#=V?BwQ4>7OWhkhOOQ!Txqr@3qR$Vv!yr&uOG`gpyHcm4qVE9^#1!sYt z^D}(r#hDK&DI%pLXLp)#q=B)&V1808&)g!$+9CwFTuW;8OJ0=s-p|>77MrtaEeF#I zmRI|OI|C}Ch%A{@p^wx#BG|Lvi_RrR(~`C4nPj@r3@ZBnzn_BB+aJFo5d&une`Dyf z0v$c6tb4U!5ylkLBv?_kcPmL?IVvGG)O|v_8ojdH^3^@nnlieXSsP%&-IutqDY(k7 zixZCvF!!)w?U>O~)#P7Nc-gFGBx$=7VjFcSUPzA))UfyS+Afq)sSQ8wV>=-Rz{$6X9H;!YvM%<*_gBA6|=QAo@FdiWtH|L`G^FbP8%m%EF#xmj89 zSsuy0l4iOB>1<<&<}Q9fhA}yQc~qmW-Dn$AMlH0#YMWK1=45?xGiH4wDkS|?%38Uq zew8j>D)?@RJtd~zMM9ZFJ@lc{#7M-BYef!3yyJ%u9&7{oxNf+Nh>Zdx(ll~m8xaUI z;X}{4fFA92a%JX8#qkr{FB`2z!LO2iUk9*F%KjdSVr3uPpscF4+8)1niFw>H&iu1o z+Xi+6|E1C=1P_6aT`yf_!m;j(hI-s~WG@%>!G{Ea^OdCjT7p!gV*>V8Du)wY0y%X= zFO$pCxnayiVTmMvD~ae*)N4P79Px`-vSQZ}yLl!HV?=H6mcdi6luYeXRgH;L@^C}rSZvz z{P8|TvrKwM#)jhvh5jwJGEWsMF%EVKv$-(##r$V32E}=~Z%k9?4a|)0Pf;sXaNxtu zESX~LKD03sM=G2FOT0>ph>%Z50YON$uW;1&ag*KV6D)I~>D(kC{Qggd+QG@b?b|o) z?d2z|`?4iDjesr?m85^C<^T2TXe~|^Kf+BX4vBNEj4E_yx|8rH@{+$21$ejpAdx2Q z!Io>$UIY4+PvjZ1jMrk^h-to!t2}p$Lh6cB+qK>jiOMF_1!3w!rZ#2>2g+(b@FA42 zkXjLqJXI-5Z^FwW32%`mBpqcQVWY2c$2qiHXM9pK^60(qspN%YeNj>vm~nlXi+l(T z`+WhEJvQ1}g;H2)NxMl+8nlj*^cH{Wdo1@&3guyDW?r>lrovzdgzdY`*1h(8T6enH0f-$@O~4wtF%Jru7T zD!cQMy?ni1go^(=+7!^TRwwD|YMP-c=ICmTS68dc%#e#G?=IN$My9R92A?G`E_9UQE%&Gc#;wh6u60RBH)EJ)L4kf1Yh+Yxv|U56CT>Hm z6O-b8 za)wWGw2yhfDkCbX&NvmHxF(Vw#w`-9Yl$i4#SC^@*~>Pj$&`&K`Cn8(d7O!WwMjF5 zbGoOt>_v~KoO(5?M@Y#Ud!8?;CjSIkI}2#?ZI%umPLe$xHn+PFWqhX@+V}x|>FF@u zf>O&!KCrxvKqy7i&&Vhx>W2`n5{sayr~*SX=$05?R(S8dyxQ<7@z7VqQI59nlN_H^ zI!0YAV0PoxNa)XhG~;b>keIT^SW8?Td%fD3G90gL2F`JA2j++W1PM*o2OJKC9Vj;J zMNU;K`J|b;GqI05XEX1)q-tgW!v*saazy*L?D)8O$oSE4>CcL`#kNgcd3enE2fz}d zvV@N{vWPK3PhH-Ax*FZvqQfF?eJmf%Kp>TO;yyc9jY6qOfmVUHX#WEr=>Ia3yXud7 zQV*}@RVWcX$+C8RZY1H#<`Os&5kMCR4-utgN$*Rmv(s!^hkMtj0MIT#GR zD_!3F_<>oXF(>Tv8KNi#rK35nL=1(jd&h+4)| zRqJbZW^mRj{0KT4EI_PP23z4akEOdKP;F+tX5Bk~1z}DTM6Q@6TEHv!DY(vcIZrZ3 zDw|qk4LaY8Y^#x;$622ap(HfUIIQ52DOE`VJWU5A(IH&w&4g+QZZB{-8Bq){WiTmo$M-n3et~$Rwn9 z-=VW+w8Oma`6%I)ddmHwaL)rbrsrQs$=z?U8Ut2m0#+Y3GeP-(F&Z5(T9vPOTYc$F zGNhM~t=}ETx4qasY4PgVOi*KvW?p0Jn$2+q&10F+D5}kD|9Rdb&`ypvR$B~Yob9PB$=F97$?&;2-8MI*ayJUOy^!~? 
zX(`u$CGW>lB(O*l6tVK$OuQ4gf{wmRxVoEJrlH#OdN6GW0!}S2^VMI1_Mr_WYO9C2 zY3=Go$*GHE@zm;}Sx{Otd_26HYMVG%4^9up#?j;*N?Izfmh@|993Lw?lpy93+Z}JE zv~fBjY(tpFjntZx-eU~$$%|GSj)1fGNxgsVqTgc!>eW7K(RB}LspngL>s-r`Bp{(2 za<0(`Oq87cj-4?lxT%rgGk*zgio(!O9Tzgcy;8d z9NXq4Tyy1aS`7uH9&9Q5touyk#el0t{eUF{5p$(;veQ)hTI^m^zKFW=y^9K#yn^Y< z%CVCHK~X;(oofgS1}Sy>?Rsj?=jIaW~)Q-uQX1b_a9NaybIPuUxe zT9YyPi8X@W>*>+bo02NjHcT z4d>OZ1{LrR=fEK4AF!xB+i|0+x5+woMJ@n!hTqn}$_$24v$c3AS{V-Gl=<~?(471+ z{Nd#5KHYY`o6nxHF9!2i%PJOn2G^-H`89XsC7TNK&=dnZ1I{b5P(j?Rra$F+CRuUN zN+00*kP3+VFTce7>TV93LR^|bou3S8>Z#k}Rlcx39YiGzok^zo*$(rD1ePn5_fVKU zT<=sC?G`!^{4sLQ_s1h+SC@1ME}a(lP_!3?X-JpdB-7gmP`k0kzD4xi^vh~yaMw8h z^rST^hz<8pMBJ_YXM6eM#dA0pQQv#BfLuvg>suO7Zk9xiT)uGmHIQe`eP2_L^AA{u zJUTejWkUr{uB-mk_xoA4_=kW^SV94g%p{1BCzRvr0rIkR?smuJU|I z;EScb4VF;UoHmbPio_*&v|~ZUq>Eub%Nw<|7NhP0vtrWlo-ZLtO& z*Q9|Q?m}vND(CU%G>U%eb1JAYHL z+Ok;#-*}24=yUJ3EuDE$&At_4X9F7j-f>;ynpBJ zo9}Uf6u0pE33`{ab4g^b!3VNBCLr+3Fyh|Snik{#W=lNvaz7kL%yvK9PNVi+uoG90 zsb#YRchIJ?`^UC^7?^D@ZFAC6<0D!f*F`f$W4!r%0FPRxp2~IthBL96tB94(ryggS zKwy5scj^y7%+`lIDy{J3(~|K^p1xSNGYqE{U|*(Otib#D?#!&mXfPTafNYp=-aQ&} zbq{@|>HysEtXCfe*_?cU`qiT5|B+KpuauWvwTisy3)6IW=^&AR$%^l+x6E7w zTd}svSg7LCz0sVpr@GKhaFG!mrL}m-_mb_)WvAWQGV~4Q>K?BW*D*8sX-Vv6C4xD< z0#R6M$qx6ajVIXPI?mg{J|$^Fl2~$gw23)6yLNi5bit~oTns1ihL7C%uorv39*1l) z)^F=r%)nyZUH}8iYZkD5n#P=THU=fi!h3}(-PF}d@89;dd3J|$m9Gx`^;BZ6+rHM? zFl);AggweZC_&87H+`}6eNI9K?JnM7B;J}_@2bQdUYe)ILeusG7dth`T#3}Us#pLv zs&_-R7lG(f4dh>L|4GS#Aw$vDWB&w(3=G++#d(H=#hQ>ls_BT>V0O~~Qb z(aTT-=o}EGM?%GR^`2?olT)p7S5w$reZwzW7s`!&Fi`CutRlV859rkePSK^yl0Oaj zuO=Qq0O|2v*YnP8VNJhz`17XYRx1GH-8FlYC_P#&h$C5(|CA zul|vf_&rueVM&iJwT_x_>}YM|+*yP8x-!F?Z$#NPCTd1UhqPZExmCd*eu6%EWo-&D zf8wQwy7{CnuP`Q^drlP|&-DXXb8k@Dw*3xne(ZJo(th1B>z^PEwH2gJIkGoGAOwQ_ zWE;8ccrvDbGPZF9#aWxc)HKM+nwY@UV<-l7A8Kao;?2$C@;{m0_C+3)pY998f)YM8!{%x zV2*HP*04Xf>op!%yZo&hg6O$DoGiKyhmEcJwawFp(C2|mc}0s*%;e$vM# zeV7EPnp-;_N1?)$b(jm<&Z|GkI|{h>7)Jvjl0*ZWBE^N}k;A%vUwW+u9`0{VHkLcL zxAK|fC(diA?q~)x?!6u8?l}IWf_>f`iSl7y31@BNMTuBnvxH|O zZ$B)jEn=mO+13zpQa+3ev`v&LEh#T7lQ0#f^1MN)yr_;QD~4+WKcG??`w?x`b9+ss zjz9jXdV*zxEX&x5f3ZXqc&p%(a~jaR80Cv%9p3*>AfkWuH~%gh@te%yZ+*^m|o z39ER7G~(UyR>?6>tI|(>#qje;xp_CzhSRHLNd0e<;%Nt&EP;MQ!;uD^I0VjqI&{94+t#FtkcJMD~-@fccx73mDH2b)cWK&hwy4 zP+N@W87YAW0m*DyY9~WnP=SDdP^*WLg55FCx3jr}i-;l6kA6~HJT87ss0JQDg4molp z0G^qu^l6W-<%sRYHW{Co9r+1NmjL_7z!1-7C>7$<*2;0fMab)v+7=|I(Ypmc0(IKU zN*ocSj{t!>g$HO2uY_y&bXcHAuKPQ74L!++%F?!u235Y3n_goa_wA@0i7j z9!MXhaT0?-{lHqINQ!DzVqPp3OJNNcj*26sNP&zd=L}?zbau1Wa6U)i{HQoV9Xcx^ z!(In$vDrJ>)3iKcw9{gy*kw@SC~6W}Y!-A|`U#?oC`Q;$09@lPzN?V)tZzN5lVel$ zQjA=aqay=$v15h8B#9$wXc*5=5ci7X8iByx(c%u_v`a42UKWB~DSES>NPkpc1F}t9 zo?x&~Zl_J<>4Z+TVj=9l*}xnu2`6q*s;+oFZ~?mv|I+SSs(Pe|=uL$+OW*zpGHi00 zp)3+zez6fmcK|w(uEm~L3E4AkvMG{e;*(aiZk;3{&`mtW2S@HIlt<71f5p9ZRF&)6 zFFa|aK|qicCW3T#BQcrMol3`Kf^*7vyfUi)2p@3qf( z-}8-g4rBNS_weDl@B6tre%BrP+>!&zuKP72$;N*Eg$6=54n+Xy5o&&ZVqC4tRuNc- zo0Elj9*(tM6h8PwV1NBtdCL5;n!|~Vr!kiZgyA=kgZ#NQkcWPd8Od=q)`H?ie3qcv z2=(#R{Bb?{-A85O&D}`Y+53^(AMzPeKX2>YZ`^wTGpOX)M-5-kl~^K+*teqh^lpgf zdC+~+Fi;kmHQ{HY5qn(7u$%iE=q!JFg=2p%(>7P2nx;;B!p?*r z=->?kZ@2QV@(1+{`V>Kb6syzkk@ZURXV<#!}DQ_<mdUzErz|fEb~H*U#47}_s<_1Q+T7Qkz3;Gc_<7 zvmTgYI$S!_e^9&D429!Es(WW$2ALd}1WEkj@-iWHO%n$dTiOVy1s$>FPlC)f?7PF+ zqIb4+Ine8Kz8oEn0+xZa+|oGxZQk3BmXv?ujV2qT+o*?0WzsC=~zPPc`bvOnKv zG~Thu+!Gr4sc-3@XbpVaLr)7tqn>YyKPMTrxmGGqW*F2*k7ft|c+9p;*Wi>>?*#y8 zJ-1%;KKSxp`zx{KeHmuA%lOOI<(GW0L+2!F)Z?X$d()k&5q>9&rl;DB)0=%G-*lwI ztFoS}77JqKfHVrr%rJXbwFP0O%v0_qj2t>ZW}7)m`aksOBL5#=$20vGNAq7S-R=Kz zFKxL-SEN{2go~Jm-lCDdfs8Zkb$w;+nEd8X3q!i^;a@!jF^;fyx3lnI?QRXKGqyU( 
z+GG0Z@@grkf(&FX(Up&0?<=TwxeZys9^+MbKWa{(gNaZm? zhsWGc%s<4HxCwz{gwOZ}cfMd8vhsOTMD7*VQSiR=tbA(v({Umpy^ZoUSIYS=^EPUAXq6&{>&kp(O$0 z%OZ}}_chwd%gIk#?M>y zf+i%1v1s9}rK_MV7^MX*bj+ zxZRg+H@VRoP&~h>ujClxWY&7F(p3CKDgtQWaB382#%gaoPLIiPb$2f2F@_2ZVhhk% zD0hhZj%T>L^0mr7lZ84fY89)y=kE_(kp9r0=|t-uH-bJsut}G+x)eOpX38#NAsu=gFQ+4bbK6MaT-u@dyi78bvF-2Yj~5wbI2Ks#LuposAhnG1Lj}ZVgE1I z{*Tyx+5T7?EjEr9Y(Qu8kb^rf>0GMlvSq#JtmgcStOH49?lzYrBRDbASHBMLT)&b0 zW*`U*{cvK)?LSeF;Bg>j7{>lKllE==JM+i{GTq<62}mrp1x?!LH_9}F8+7v-4^vPq{Jgxp(B@j*=CUboKw)?;q_6R{`~n@*dfP2M z#z_{A-<66>?9U(5et$a;0IC&y2{g!NiUFrXP4>IBZ}I{FcY)O(zgCt0SdIPX$Myh( zGa)Hp7qGAfUDpI*=H?z=SlK|S0%|M=Em2v}zAv-Nl?rd{&$B;Xv}a27?8)@uxd}Zv zJN#ok*LnzHZ500cW@ew(HE8o+FD&Q&;Sl}3FW>Sl5panoO*?=fHM@Vf_~*44mGBfw z4k9RCbu5ovw1Hd`+B{mNgnNG*BTA)s;^ZW*jG~hCCA86#oI<5hfC+T7(GB*H2oAm{ zMgkg#A8f`0VVjhr`v>fR-d?0R&e);(u~iO~~gbg1wblwhW)WI7;X1Gb)If7OU4=VY1)Mmb@^ zSl8M9Z(jS~KE*#Sc30^B$+lZ1M665rt7=!?214oaj+UkqY zHanWd+kM{Hss|uWbHoI zMDRkq;S9~fQHgkY5Snxl&>ag%V4DAuz<=?=SDaa1So(iS6n`y6Xij#@{r<-26R&+_ zz8#jWe`*NQOCr}4b&}69;%rM)mEWixI`|;PiX^A63df)W_ddiszqIp;|?k z5^g#5qRG^Y?cxKn(eei7Dq14Z7JtQh{HilRrO4!gh3}S1!Db9YwfsGzue15%18#lB ze`0|Eyn%mFA%Atg|3o(bbXD&FnZfr7>K3N=P`D=7O0H>PAQxHk^i-0yQG9(0Ffjk+ zMfFd|5b!tS+T5IoxjC2#wV(;TT-QCd>g~|WKlH`_P>vLC0z5lU>$y5*Dm@U8Dc7W` z7p_IwJAnXE^^*WPcCy06tZ&)AuPH;Q(1!4O#gitg29*v+Y``_)h8?xpd13wQRhc&mxXNEY&nMVpeX=LlL!vaJ z!u&2(N|ec#dA_h2W$QG%T<{Wlw|*<(?d^ zOo<^$L_{|?;}kOyLL~k*CX0n#Tz#gWcBBd-N4tf=Y*T>C89|2X3`L)IRr9J3ZI+)iGRsYp ztvBvTk|kVSs>QEQ?007YZfUJ-;7qML#M?2nD~iNZ8e^Eah(S(S0BiqD$*r@C6#a)v z{2X6=wkv`=mZCz-;uR*}*3`w^yCp{NL8t)*2Zg@0SyO}s-Mm@@7@toUr{akUWgw>$ z=7{I&;=>z!6@-wz__(JtkKy7-EfYffriOT#ltjKYQLHG?qS(pbtuFg5O0+o9WYP;H;xoiygspj`e zxv}Y^%@zTw{ApYCulp*bwQ)Ya+aQdC4e?6g^cUM?nvGS+ehI2D%1aqmP+aJlkNp6W z=m-f|@DwG$7pj2%f(LwsWy?k7IJpRnuvA!LS`cQMd2WU@Cu&F>fVdi&voOa8_lhs zVwRS;l9S}Hip7SU8II}}pjJ$zgmyjqS%zntdR8>Yn0z3ZdE?eHL_a?D;-Q!{qIAO+J z5Ph?xNIu_Y+MQf33EPI9lx~Kd(8&)kNc_ATMrUz+p1ydIPBk3liJ7S$O3(xIe2Hf% z2Tpq?hM`g*iHl3qs}^m!Cv$>2MS2~j!{YEzzou5bhUryvc*mzd(@p29_& z3oq)MXFgmG!p7Z&&!v)%F_t29YV66aL^Fbl=*J3y7+i^K;jEsa{`|s1FY&apSpZcf zK;@EvQy|iHPZ!(Sgu9ttnOeklQS_D~#F7?y>x$T%s5fgX4btzL2N#>qO&-U+fO76uW?br_Nk}Fuy(+0H@ z*G7CKW{zSaLm5l*$BNKf4>n2NV%Af2`ifzQm;b@n7nVN@_4F2&f)gDgha zQE7CMUL?os4_a4tn|Tu}9+7!W$!!EBn$;4B;2T?`AmdZpPU{HU=b^~+wQihpTu{|j zB^?S95E#u%u!N$}t{}IWrySK9S&qCMTbp3j`^rtKUj0l*PF*YU-6cY{ zeDe?+yw-s@J6E_E`oP=!{jl5c`yLh47&*tFC>_6S>tkzuO>NR6Z~!*X;FH0=eJD>u zBRE5FD+2T_J7P-|UT%ElC;4bf-^YD!Hz0Xq!FOskYD*(wn+r$WR+Uzr@Ui zw8kn@h#QsBs^qL+H=J5xJ#;anz17}80sjg^?&S~T1~n@d_NjWE_o^lGOuf{A4Fi;!gU@`N&5YtZ(cwshq@kad5Bz z0H?S!!HsX|ZyHObK3Y$wr0uX4!3YzxL)=BYKw=}ShIZm2pU3$a;P3->yZovfv+_jL znp6T^yB}0en9}IEke(~l=9iHCge)5yMKtKo6DJ%ijkHsLNoX`L>x>$xU?F+Z zbOsRkL>*dU+GF47*2HSNz$Ly2zTwx!oInY`7w8#zq5`VF?h4An`Te`Pt@|e@DziQY z;;}_@dC|p^(I%z-l=Dzp^B-iK9Z`tpP)P~z&=q8gOsZY=!Ml&9aP4lsC$E~EJ5G%a z<7VRhHQqgOT2)Ld?C&`g5yLmGI>Aj?)UC~urzQfa+A#iwhzXHio3BT5Z7 zsi`?>eaB#KV@5ZxSr113iNfsMAh8!eYQbfb*#?$>5V(7u~|!&lB-aqNFgP2c)?{8pgI*yc4u z_Zh$=T;gsuNEwd(*X|V(N?yJ|sf29nJ12Q!niKxx`!>|7#>IT(K|eRUZj#7 zk4-3aK2prmljGpcLVeO)ck73k#f1bm7=q%048?-Bz{zuK;)hyHns zAywhIL$?ZuhqsuXDCCUff@ETMAYwJu#eQCG=YBXric55g;2<2Y?KRIlW7Icvod{7` znlN6G$P>U|;Mbkv6GWQv&zrw`6bGdanhr2*Er==S^;N#ceY4AQ!=z4%_zk2~daVraZYsSq}2r!lW9g?PS0PRD`A|(O6GWQ zH$}B-%t4(4-B?74p+-)2fNEJ3bRpgLy-}1v7{E2WZg8B@g+~DNLPff zrP+TePfr0%Li;!QH@@!*YqdZ9p#X)`bQP20>OWiMK<#s`cj?ub2#C+Pzq@~%EA`Qa zbH<7<;i&;DB9&tvx%`E>YT zn_G|)AxYJj+}*T)PjS3|o!1()Ec(K~Bf25gJfV5+l;7JbNMGd)Vdpg`zc{ThG z1T2oZ3pjzpPRb8N8@8t7@KQUeh3h8m@jeuez^F^v83NEgIpg+p_DA=%AH^d)_-1q- zWLGB>6q&3L2moP>0-DXAhLg49ww2n2`!}c~R&_>X5sjxs; 
zxdU%x8GYvu{h?L3d2PbWf%$S;jm{3K@5?mIHNM~Qoc%E|N{})8c4WE95Sa9#T*1-O zg=e5w8rz`arlgPX+?{0M=#31cx>_aHion;!+{zeF%XHEhBiUV359BCtg;?vZ7_7CY zt`H6NCnEOGh2~0~C#(uFVkFUmuR_EU&!gtPx8`Le7aH_Cf@0*8)`$&F@-$Ij1q}Hi9x=HDCNVcAhc3u=|T&aQqIW1lRG18m=U%5%<)`Ro>R6wCOa2d=OA>}4^ z6oOZ9GSww-t&via@~m=op}u~?oel&P>5aRuK`00aEl3DF1urkj^5_^iZKq<-L9rMr zJ>eD(y4!e-dA*i8SUJ|FaK3awd#372)Z4|oZE3eKfq{nNoY6rpf5s>fLQwDGK^(e1v1fG_SN}|a+JL9I;HVAO%sy+|d zh#w{aus&5xprXff?chtm7wgH(kbhO2AQU z=P2sL9T%j`-0frQivt`UqhY-Ua`r=GcQJGUAut{=dJ_xqzN+gg2=uI>n2nE!hJ;LT zTT5|ytKsCPBvOJXMLT`xB$iuRRW+Y$t20)$clRp;*`ea-khXm>FDEe=eb=>d4qJC< zO^&BSkxlo-{D50omrC@>))iz9Q;b_^FAlV$A$@Wgwn(HRzWfHIT;0G^=ZmF021eD6 z;dJO58iv}3J560)@ub1vM{&E;n8|8saf%OsGB(UMjKV|og0R)*=9BcL>J+}E1UTRO zh+UC%{}DmIM1OVXkiG@S^vP6|*VJ!`%^(AT6T{h)@3OT*?_mI0u ze{++w1NpaSC9T>6!$*oK3=?*YkK8_++$|K?s7J|LYSYu_OO+Z4F7196sG6kz=5UrA zv!q+c--tzIjNYyO;0LB%TxX?tmMXe?*ewcM$oXcnpLq2};ftxuZ?-jE<>YUf`pP+H z`}@fFwv*~dXXm;d2%X{!CC#1oCCR?O1!p=B0hR6~8tM~eywLz|;ZCsjaa7H#%vT^v zgfhS=N+PNgc4sBz91e%3_4Sby-xKSLH&&d_X|-qwY@llr_)0do(}*#({ITvg zecu}TR6S+KK3UE&`fi?=j83v5trE`yG2QKNOVHO4=g&V#-q%mX6Pq6ZX8@ zp*f`^Vba}$@IcxJR&D)) zRG@Y6F}IjCGOpFjnr~g$!iFPaog|@4Q>00HoZJdp`xqMI1%<^1XB^bq+|NrbgWSjn z%u<%5_Fwm{jCtfhCfM^TCu=REc2i)@@;JSOBNYDB+?%+*vuW>TUrd&jZ^TT5)VD-xq-(G7 zAh-in5|cjyxboY5k&l^~IttsUCu&uZv|QQbrXl0`Us{s~8UQ2wRl+_%YAPT+2=CtB zjS~OpVMVz~eZvd;BrB{OQ9LEf-J~RA&g0sti-v)zO9dPO?R_$q5(`~7K?_Q6Q-At# zA7~zId?cS&quu2px8BQnUgbXT1Ct-mE{2?5`|#F}w`kDD%KhS%NdWCI_A&N_dqE=XHyaTJg(3OfIIHoM2bbDN8@!OFW}1 z+sz~8WWAE%dBFDi+1#+wsWoGZpgfs+t7| zYpVXqJ0j?zL@PwRfFcx6vO>Q$;o#b$cD%iEh)7{n@Fv&)wS1w(fIoMwA9i zNPm%7iosHgAFI3Fr>>YVA(Nn>wAJvkYe4EIydLJ}_Ew)xu@$zA<%r^gzI+(k(AeUd z3YqvC!cQx!LFtgj!eTGtF}%NWo`p@#Im#!-W8^?$GkyIbAYT0x+eUy_u@$T^&%EN? z$khi9#ML_<2|hTi9c20Gs_XEWrKffL$mNXl;BB87HTNx)S=>@!zuu6@#mvTMLeaMBYC$6WeCn7uttC$mXt%_V6qFM|>$wE#7#$8~8 zB*8dmLyqD?$D4Vs?(mA8NAAluHPJbUv89#X%^pt?iC{rNAU5ESPomLHs;_sF(!ZAK z_%y-OE>6!#p-UbnTtiAeO!%JU$*1A3R#Dmwfewrg2S!h!M+amsgCU_ymmbbwyMT@w zywb*Y2}Mx%@nsn834f-auVYEs_qxRnLj$M-#%wv9odE782nK%uQ6gPBc}-^*2a};J zoJ?Ggn?(o)5N2nAtFX8bUMapH;;44b8UmmQHm@AodolDh)x?=E$9N|x*%U54D&@dW zRm!dDyHcuEHL`e-g?-Nk06E_>2&2q&`|{2CD0|}UrF1nxbj+89v9WKGFxqxW=Vgf(U4?j zMftHiHGlbooKwVXyYYv!EwTHqAEHAgEm8Kpf@eb@_0igJUNLqm@j1vwC{C%I`_e!# z;vr*tNgC1&i1`j^xK_K%CKL=Bjvb~KH(qU~{;41C^#mvm(Ks`iA3lYaISE69}NrV!bp+qvNvpbY+zKJ2h#l7&wFu# zT|hCiz)a{4e$^c>wuy#8U$GCKa1MBKA1+RC)jD~|dZcoX(@u@7{Om*Jk@ z<#1^YS+ls*XsM$>4m}NkcKF??j{w=klT2b4x^3s4zDeA==Q&Oh3WRnM=NWtoN7i<`zlG5?99Rj>~?UZRX3NnOGn0mW9p`_Zz7Ab$c2f z3OXOmv%%JcBwCVfT4c!Sl4#s|W+B*vbUi^OTx|u@ zBU+lD(6K5e}lS|uLaYm zI*Y_a?3kh2*nDsEX@S*fD%U&3^E!%I3&J_yK%f}&bBMn6@Il{;9Eu5_dEG!y(Jj8b zRf<#*m4`;#b5B1~U4%{r9zLP`Y+d7Y^TH(bH3b4m6wbF0qGkCii*#3VPm-i|kL+wn zcxiw!Nns^Qu7W4o38P5NcJfyTK1QCWa;nLu-m-E^dUsef?=%8h~ALFrCPPe}1ggO{+H!5kRgD#MqT!l+iLWVw`Nz4WkWy@TUruKM`F)Z_c^ zb0I*pVhs?GS}q{>u5 zWB@Eb2n0g%7yca-zz{#$j&BdwRD#H+sV6H*t^2!?eg~;+i09pQRFvQrtMEyCufn+j zi@|RoswLgM$GQpy#VzcHEb|7&d#paWMf%QTRDvY3Wmic*#c`?Q3DL<5& zy@l8x9uC#AVDy>+aqz5f7-Cr-ibir1yF|OB;T>m-I}Ituax=@g%mlMYdJVTND{1HD z_e^vFSe-<2d697uAXa6_dLuYNsje1C2pM=oJ+eX|ER+f)a>~k}%cuE(PstK|V#i}GpePdHFJ6#9VEy)ZOnrWmMXISO!K8#V5q=TUB<2Jkw-3$ro@ez&DJN{X?X`LxU~iU#fh|K+#q98fP3R^ zFBPO$9|VPI*30-m)@e^N%Hk~rQKV8SgKmqx{tj>Tzw5I#j!^04Z3VM;0pLv53i>Re zktXmx%7}eVB*qXibXX3Mk=$#q^gv*b!$oz9R!a)OkG%$V8EXxXEx`6TQa>dbQ7Y(~ z=dqfOv+sa^o7DzB6!Uw;|Hu z;sdJV#zUadSeDWv7KMJQuDEU}Bz>KBTxeI`)-5O>cKtGZYmC*(FSK>? 
z>z*2$Y`wDX(fFtsx=^wNp9BRjI;LH}IMg&G-6nW_!dmms^VC$KIbGG6R55A7`>3`< z9}ar9#0_dmlCgd`&R92VJU!&WQ+|4e2V{b`EFrO-r1keamG^yBEfL;&OM7aQPc84R zn;H%;1+F`wRx!jJYHhRxR}^YthR1ndLbD7CLMb4$O|1>dgQUHVn$fz>oA51^Lw}wL zlx@eX7+y);5D;=Nczt3*+fBR4_ftbnL&Orn*Cj&*V)L|wfCYrgy2dvot)ru?ahIFO zy@F>B4ipf2kTNk&2FBA;Ew-00g7n7N&*sy@?X#`ZqRf%G;xmBqBG`5AHgF(Q zoeXB9(Nn7!NS`zqDF^jbI!0(uZwasJge0l7l&#$4xNL5Kjn(4A^LoYCG7=P(uiXkw zRVis~NtZ~b*K1z1X~ui`W;q!pDC5&+tGYk)&J@>9iVLY4 zgcks0A=!#+ZfG>?=d%Uo36++?x>9t)rX&t1D%kQ+b8RRT_A%4}CzzaEu$O)+p-cGI zn5NPothT8oNuID9ZbSVOY+ryfgY1X5g`AqpA785Q89bhD1uJn4enEEE4L;~KLpTYa z97t08u2B%Wm&$sBn`CBT?)9g-FhrwOU5wyuTR0wQQ0}!%)X%V}X|2`fbonk)^}7OX ztH*Eh)kWsPn$YcUH~<4QunxFol(d(|A}L-^1v%vQ=XUZ(TU*Ag8T6fk-zm=F3mMxVgoN{!4^$`!30m+2Gj3h~IbRH#Qq=~~Ptb{W5p>T%!KGIB=T zK-EBD0$KfW7whESOuMnOpuAZ<{npY`R%wP7OJ+?p8cz+>L$P@j(Qjye*mJVLQFVLW ziwv(kpxC-Yyw$k@VPAKvx$yeP*dLtJGm*Z}wL+v%47&;!S-`ROT%_F6VxFJJrO;-?Yb} zi(ksL^&P<3=iY*e*ZKTm`bU_0A!f8m7xZ^HYyk+7P6lTvtN#-)_%IM7&-y{4U zY8Phv^PCsfJv%Gt>E|;VI$7j6!BmsIyXqo z0l_kpT!MetC%BLq-bdHWJUDTNO=bEUNTunE+lf_1T)LMHn%!Goi zFj;UC8eOp13o}m|Q`jleTiDc&1+5AreWGea^*g_25n(YBpM{r}eXI|7v#**ZOCu>* zq(+E@S)7IF1@aa}1-~OBVIZyFl+1f~@#L4?#U(`z%$a{ygJrj?D*PlfmJ{kRNL!Cg zQ7&8KA^uF=T3;7t6r>#EVXtq-tA46lv#J^N$qGfxPrN`Ku*=}W-qW6$YvQtHI(|_pyuxg+>g_64 z3}F+_tCMdiX~#=-<>enrMhfoY9eAgH@L9Ve(W^L$kBtPy5lXfhB6EID+~zkmvX=4{ zo`>Wa&dIWBSGRv4mH9ygStp}4BMHi#wU!_-c}ZaGVUrCZc}}$GQ{ha_lcehT0)f82 z;T&ZqR*57D0%3sgCIAx>_}NBd1Lj_f3@HzEuScw%&OO-d3l+-4Ga*dQnvQkUm|1XI%5c><_@q45uVA++_cDoJKPA;jdY+)QO_m~=3Djn~t$4l_FGjm|{xLxJ(1ZvL z^*T*+LQN9Q3O21%?}h%+F<KgZJbVLsai@@@BKxIo}4 zPxQRkI$x))ICW5`O4jbt>oFK^QCD(PkO&Gls;Oeftnwc2|*42icZ0dDTz>Ly!<7C_{`1d3}i3KKzufB@DB)jZ|(TzGcF)J!5I)ygzY4VSW8Spt9sOB zr{ZXfV7`m{D}W8*|M!5hGMWi21wE%+#i3XiCgA1U-ST9Al{Q!ppK(ZK9n~k7I&B@@ zpQlB}aScX6rP9TydPB)ibQq*Um;*j80(~G0oHc1C>k+)3mM^iJ=)Ev^yY{_(W6J&! z?o9hsP?y?mWg`s-B`WHN;qt}GNk(R&hukhO6>};DBjO5dZ0=To0y`4ru6N2QA_qd^ z6V_f-`9Y@egTa1qjK!zM_lHSqGLwtV2|tniIYGoo?1|YsDO5EEJ8qo%&xcx*!1Ik& zQJEV47^^yfvi#*}{NMgZPzWYMqA#d(bQdR?kXYq~ML*qzvnVvGiRaQf-Y0U}^Rx4i z)B5a-AJbEA0NT_o%PjBjND`E98djqtp zqcPVe&bo;&>4P04aEv4b+YHRqqYN}`{m;<*UZjwfzw%&=W{8kw-rmKk6qCXz1pc^F zp%zva2pvRKWc|16Cgc8uj^(t& zE-Jzwvx}-fe3{J%7zfafIx|HIeNvlBiltA_>9)iKqkDOrmJS zJOwvh){2K3&Uc(VhpfsFKv1fFI%X;|W#O@O=)X%AxGoPev)G*LE?&C(y!uvQRXaYS zC)oE=XawUs26)5Y+gIQPFi(@dsNfSzcLPWunPhcmJX(4#NfsL`iI2$i?aOa4C2lmw9ErVQmk6UeO z%%szIg&x7h9t<>Mq