Custom AllReduce #1467

Merged
merged 54 commits on Aug 15, 2024
Commits
7e02ef0
feat: update deps/flashinfer
chenzhuofu Jun 29, 2024
be64b3b
feat: update flashinfer
chenzhuofu Jun 29, 2024
7a4543b
fix: now can get correct result, but has performance problem
chenzhuofu Jun 29, 2024
e1c8145
fix: update_custom_mask performance
chenzhuofu Jul 1, 2024
483b71b
chore: minor
chenzhuofu Jul 1, 2024
0a117fb
chore: add perf code
chenzhuofu Jul 1, 2024
ac343ee
Merge branch 'specscheduler' into specscheduler-new-attention
zikun-li Jul 3, 2024
3e9dc00
feat: add attention metadata
chenzhuofu Jul 4, 2024
a498e6f
feat: add AttentionMetaData
chenzhuofu Jul 4, 2024
2903996
feat: tree_verify_attn use global attentionmetadata
chenzhuofu Jul 4, 2024
fed9e5c
feat: move attentionmetasize to global computing
chenzhuofu Jul 4, 2024
7c64f36
chore: minor
chenzhuofu Jul 4, 2024
37d3e3d
chore: remove unused
chenzhuofu Jul 5, 2024
9940ff5
feat: add spec_inc_attn backup
chenzhuofu Jul 5, 2024
5262af7
feat: SSM use flashinfer kernel
chenzhuofu Jul 5, 2024
dfe4bec
fix: SSM don't use cudaGraph
chenzhuofu Jul 5, 2024
e08f06e
chore: remove redundant code
chenzhuofu Jul 5, 2024
ed544e9
chore: comment out minor
chenzhuofu Jul 7, 2024
214aed6
feat: attention adapt to cudaGraph
chenzhuofu Jul 8, 2024
020129b
fix: split handler_collections for prompt/decode phases
chenzhuofu Jul 8, 2024
0150501
chore: tree verify cannot use cudaGraph
chenzhuofu Jul 8, 2024
d4d66af
feat: move all flashinfer-related states to global (tree search atten…
chenzhuofu Jul 9, 2024
b4cc53b
fix: use identical attention_meta instance across all FFHandlers
chenzhuofu Jul 9, 2024
79ea1d9
feat: enable cudaGraph in tree search mode
chenzhuofu Jul 9, 2024
8d52e4f
chore: minor
chenzhuofu Jul 9, 2024
2afb66c
feat: tree search & verify use separate attention_meta
chenzhuofu Jul 10, 2024
74e166f
fix: attention_metadata should be distinct for each worker
chenzhuofu Jul 10, 2024
252a5c4
feat: tree verify attention use metadata
chenzhuofu Jul 10, 2024
9fda3b6
feat: support llm cudaGraph
chenzhuofu Jul 10, 2024
6f88134
chore: minor
chenzhuofu Jul 10, 2024
dca40b8
chore: temporarily only enable SSM cudaGraph due to a performance issue
chenzhuofu Jul 10, 2024
0e5ec41
chore: minor
chenzhuofu Jul 10, 2024
dd4d6d0
fix: llm cudaGraph, should ensure the kernel parameter be consistent
chenzhuofu Jul 28, 2024
18fee1c
feat: reduce cudaGraph number
chenzhuofu Jul 28, 2024
b17c5cb
feat: reduce cudaGraph instances number
chenzhuofu Jul 28, 2024
428875c
feat: add tensorRT-LLM custom_allreduce
chenzhuofu Jul 29, 2024
840da50
feat: add tensorrt_llm custom_allreduce kernel into executable
chenzhuofu Jul 31, 2024
5f16bd4
doc: add a README for acknowledgement
chenzhuofu Jul 31, 2024
3d50053
feat: add device info in FFHandle
chenzhuofu Aug 1, 2024
273dfc7
feat: enable both cudaGraph
chenzhuofu Aug 2, 2024
d201b2e
feat: temporarily add the ipc mem
chenzhuofu Aug 2, 2024
1ba38a8
feat: enable only ssm cudaGraph
chenzhuofu Aug 4, 2024
7b235ef
feat: minor reconstruct
chenzhuofu Aug 4, 2024
f3c9629
feat: implementation of CommunicationBuffer
chenzhuofu Aug 4, 2024
462e0b7
feat: implement custom_allreduce
chenzhuofu Aug 5, 2024
567e165
feat: allocate memory from legion, not cudaMalloc
chenzhuofu Aug 9, 2024
a2fb367
chore: some debug output
chenzhuofu Aug 11, 2024
4134013
feat: switch to use peer memory, rather than IPC memory
chenzhuofu Aug 13, 2024
c094981
chore: remove debug output
chenzhuofu Aug 13, 2024
95ca71c
fix: minor concurrent bug
chenzhuofu Aug 13, 2024
55a7942
Merge branch 'specscheduler' of github.com:flexflow/FlexFlow into spe…
chenzhuofu Aug 14, 2024
b9760c0
style: format code
chenzhuofu Aug 14, 2024
102dd38
chore: remove unused backup code
chenzhuofu Aug 14, 2024
0219b78
chore: more measurements
chenzhuofu Aug 15, 2024
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -301,6 +301,12 @@ if(NOT BUILD_LEGION_ONLY)
LIST_DIRECTORIES False
${FLEXFLOW_ROOT}/src/*.cu)

# tensorrt_llm custom allreduce
if(FF_USE_NCCL)
list(APPEND FLEXFLOW_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm)
list(APPEND FLEXFLOW_GPU_SRC ${CMAKE_CURRENT_SOURCE_DIR}/deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu)
endif()

add_compile_definitions(FF_USE_CUDA)

if(BUILD_SHARED_LIBS)
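(Editorial aside, not part of the diff.) The CMake snippet above only compiles the TensorRT-LLM-derived kernel when `FF_USE_NCCL` is set. A plausible, hypothetical sketch of how a runtime might choose between that kernel and a plain `ncclAllReduce` is shown below; `custom_all_reduce()` and `CUSTOM_AR_MAX_BYTES` are illustrative names, not APIs introduced by this PR, and only `ncclAllReduce` is a real NCCL call.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstddef>

// Illustrative cutoff: custom all-reduce kernels of this kind are typically
// used for small, intra-node messages; the exact threshold is an assumption.
constexpr size_t CUSTOM_AR_MAX_BYTES = 8 * 1024 * 1024;

// Hypothetical wrapper over the kernels in
// deps/tensorrt_llm/tensorrt_llm/custom_allreduce_kernels.cu (not the PR's
// actual interface).
void custom_all_reduce(float const *send, float *recv, size_t count,
                       cudaStream_t stream);

void all_reduce_sum(float const *send, float *recv, size_t count,
                    ncclComm_t comm, cudaStream_t stream) {
  if (count * sizeof(float) <= CUSTOM_AR_MAX_BYTES) {
    custom_all_reduce(send, recv, count, stream); // custom kernel path
  } else {
    ncclAllReduce(send, recv, count, ncclFloat, ncclSum, comm, stream);
  }
}
```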
5 changes: 5 additions & 0 deletions deps/tensorrt_llm/README.md
@@ -0,0 +1,5 @@
## Custom AllReduce Implementation

This is an adapted version of the custom AllReduce plugin from NVIDIA's [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) repository.

To replace the NCCL AllReduce call, we also add CUDA IPC support for the custom AllReduce path. Our IPC and AllReduce implementation is based on [mlc-ai/relax](https://github.com/mlc-ai/relax).
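(Editorial aside, not part of the README diff.) A minimal sketch of the CUDA IPC buffer exchange the README alludes to: each rank exports a handle to its communication buffer and maps its peers' buffers into its own address space. The `exchange_handles()` transport and the use of `cudaMalloc` are assumptions for illustration; only the `cudaIpc*` calls are real CUDA runtime APIs.

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Hypothetical transport: all-gather each rank's IPC handle (e.g. over MPI or
// a runtime-provided channel); not part of this PR.
std::vector<cudaIpcMemHandle_t> exchange_handles(cudaIpcMemHandle_t mine,
                                                 int my_rank, int world_size);

// Sketch: allocate a local communication buffer, export it via CUDA IPC, and
// map every peer's buffer into this process's address space.
void *setup_comm_buffer(size_t bytes, int my_rank, int world_size,
                        std::vector<void *> &peer_ptrs) {
  void *local_buf = nullptr;
  cudaMalloc(&local_buf, bytes);

  cudaIpcMemHandle_t my_handle;
  cudaIpcGetMemHandle(&my_handle, local_buf);

  std::vector<cudaIpcMemHandle_t> handles =
      exchange_handles(my_handle, my_rank, world_size);

  peer_ptrs.assign(world_size, nullptr);
  for (int r = 0; r < world_size; ++r) {
    if (r == my_rank) {
      peer_ptrs[r] = local_buf; // our own buffer needs no IPC mapping
    } else {
      cudaIpcOpenMemHandle(&peer_ptrs[r], handles[r],
                           cudaIpcMemLazyEnablePeerAccess);
    }
  }
  return local_buf;
}
```

Per the commit history above, later commits in this PR allocate the buffer from Legion rather than `cudaMalloc` and switch from IPC memory to direct peer memory access, so this sketch only mirrors the IPC variant the README describes.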