Merge branch 'peft' into peft_xinhao
goliaro authored Apr 8, 2024
2 parents dd1366f + 0ed889a commit 17257cd
Showing 64 changed files with 4,520 additions and 3,497 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-build-skip.yml
@@ -28,7 +28,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
steps:
- run: 'echo "No docker-build required"'
12 changes: 6 additions & 6 deletions .github/workflows/docker-build.yml
@@ -103,27 +103,27 @@ jobs:
runs-on: ubuntu-20.04
strategy:
matrix:
cuda_version: ["11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8", "12.0"]
cuda_version: ["11.1", "11.6", "11.7", "11.8", "12.0", "12.1", "12.2"]
fail-fast: false
env:
FF_GPU_BACKEND: "cuda"
cuda_version: ${{ matrix.cuda_version }}
steps:
- name: Checkout Git Repository
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
uses: actions/checkout@v3
with:
submodules: recursive

- name: Free additional space on runner
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: .github/workflows/helpers/free_space_on_runner.sh

- name: Build Docker container
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
env:
deploy_needed: ${{ ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' }}
build_needed: ${{ matrix.cuda_version == '11.8' }}
build_needed: ${{ matrix.cuda_version == '12.0' }}
run: |
# On push to inference, build for all compatible architectures, so that we can publish
# a pre-built general-purpose image. In all other cases, only build for one architecture
@@ -137,7 +137,7 @@ jobs:
fi
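The body of this run block is collapsed in the diff view. A minimal bash sketch of the strategy described by the comment above might look like the following; the `FF_CUDA_ARCH` values and the `build.sh` invocation are assumptions for illustration, not lines taken from the workflow:

```
# Hedged sketch of the collapsed run block. deploy_needed and build_needed
# are the env flags defined for this step above.
if [[ "${deploy_needed}" == "true" ]]; then
  # Push to the inference branch: build for all compatible architectures,
  # so the published image is general-purpose.
  export FF_CUDA_ARCH=all
  ./docker/build.sh flexflow
elif [[ "${build_needed}" == "true" ]]; then
  # All other triggers: build for a single architecture to keep CI fast.
  export FF_CUDA_ARCH=86
  ./docker/build.sh flexflow
fi
```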
- name: Check availability of flexflow modules in Python
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '11.8' }}
if: ${{ ( ( github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' ) && github.ref_name == 'inference' ) || matrix.cuda_version == '12.0' }}
run: docker run --entrypoint /bin/bash flexflow-${FF_GPU_BACKEND}-${cuda_version}:latest -c "export LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH; sudo ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1; python -c 'import flexflow.core; import flexflow.serve as ff; exit()'"

- name: Publish Docker environment image (on push to inference)
21 changes: 2 additions & 19 deletions .github/workflows/gpu-ci.yml
@@ -1,25 +1,8 @@
name: "gpu-ci"
on:
pull_request:
paths:
- "cmake/**"
- "config/**"
- "deps/**"
- "python/**"
- "setup.py"
- "include/**"
- "inference/**"
- "src/**"
- "tests/inference/**"
- "conda/flexflow.yml"
- ".github/workflows/gpu-ci.yml"
- "tests/cpp_gpu_tests.sh"
- "tests/inference_tests.sh"
- "tests/training_tests.sh"
- "tests/python_interface_test.sh"
push:
branches:
- "master"
- "inference"
paths:
- "cmake/**"
- "config/**"
@@ -194,7 +177,7 @@ jobs:
- name: Save inference output as an artifact
if: always()
run: |
cd inference
tar -zcvf output.tar.gz ./output
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -558,6 +558,7 @@ if(NOT BUILD_LEGION_ONLY)
if(FF_BUILD_ALL_INFERENCE_EXAMPLES OR FF_BUILD_ALL_EXAMPLES)
add_subdirectory(inference/spec_infer)
add_subdirectory(inference/incr_decoding)
add_subdirectory(inference/peft)
endif()


2 changes: 1 addition & 1 deletion README.md
@@ -35,7 +35,7 @@ If you run into any issue during the install, or if you would like to use the C+
docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-12.0:latest
```

To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.2`, `cuda-11.3`, `cuda-11.4`, `cuda-11.5`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`). More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
To download a Docker container for a backend other than CUDA v12.0, you can replace the `cuda-12.0` suffix with any of the following backends: `cuda-11.1`, `cuda-11.6`, `cuda-11.7`, `cuda-11.8`, `cuda-12.0`, `cuda-12.1`, `cuda-12.2`, and `hip_rocm-5.3`, `hip_rocm-5.4`, `hip_rocm-5.5`, `hip_rocm-5.6`. More info on the Docker images, with instructions to build a new image from source, or run with additional configurations, can be found [here](./docker/README.md).
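For example, the same run command shown above works with any of these suffixes; to use the CUDA 11.8 image instead:

```
docker run --gpus all -it --rm --shm-size=8g ghcr.io/flexflow/flexflow-cuda-11.8:latest
```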

### Build from source

15 changes: 13 additions & 2 deletions cmake/cuda.cmake
@@ -13,8 +13,19 @@ if(CUDA_FOUND)
# set cuda runtime and driver lib
# override cublas and curand because the FindCUDA module may not find the correct libs
set(CUDADRV_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda${LIBEXT})
set(CUDA_CUBLAS_LIBRARIES ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas${LIBEXT})
set(CUDA_curand_LIBRARY ${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcurand${LIBEXT})
if(CUBLAS_PATH)
set(CUBLAS_ROOT ${CUBLAS_PATH})
else()
set(CUBLAS_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
endif()
set(CUDA_CUBLAS_LIBRARIES ${CUBLAS_ROOT}/lib64/libcublas${LIBEXT})
if(CURAND_PATH)
set(CURAND_ROOT ${CURAND_PATH})
else()
set(CURAND_ROOT ${CUDA_TOOLKIT_ROOT_DIR})
endif()
set(CUDA_curand_LIBRARY ${CURAND_ROOT}/lib64/libcurand${LIBEXT})

list(APPEND FLEXFLOW_EXT_LIBRARIES
${CUDADRV_LIBRARIES}
${CUDA_CUBLAS_LIBRARIES}
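The effect of this change is that cuBLAS and cuRAND installations living outside the CUDA toolkit root can now be passed explicitly at configure time. A sketch, with placeholder paths:

```
# Configure FlexFlow against cuBLAS/cuRAND located outside the CUDA
# toolkit root; the /opt paths below are illustrative placeholders.
cmake .. -DCUBLAS_PATH=/opt/cublas -DCURAND_PATH=/opt/curand
```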
12 changes: 11 additions & 1 deletion config/config.inc
@@ -62,6 +62,16 @@ if [ -n "$CUDA_DIR" ]; then
SET_CUDA_LIB_PATH="CUDA_PATH=${CUDA_PATH}"
fi

# set cublas dir
if [ -n "$CUBLAS_DIR" ]; then
SET_CUBLAS="-DCUBLAS_PATH=${CUBLAS_DIR}"
fi

# set curand dir
if [ -n "$CURAND_DIR" ]; then
SET_CURAND="-DCURAND_PATH=${CURAND_DIR}"
fi

# set cudnn dir
if [ -n "$CUDNN_DIR" ]; then
SET_CUDNN="-DCUDNN_PATH=${CUDNN_DIR}"
@@ -231,7 +241,7 @@ fi
fi
fi

CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"
CMAKE_FLAGS="-DCUDA_USE_STATIC_CUDA_RUNTIME=OFF -DLegion_HIJACK_CUDART=OFF ${SET_CC} ${SET_CXX} ${SET_INSTALL_DIR} ${SET_INFERENCE_TESTS} ${SET_LIBTORCH_PATH} ${SET_BUILD} ${SET_CUDA_ARCH} ${SET_CUDA} ${SET_CUBLAS} ${SET_CURAND} ${SET_CUDNN} ${SET_HIP_ARCH} ${SET_PYTHON} ${SET_BUILD_LEGION_ONLY} ${SET_NCCL} ${SET_NCCL_DIR} ${SET_LEGION_NETWORKS} ${SET_UCX} ${SET_EXAMPLES} ${SET_INFERENCE_EXAMPLES} ${SET_USE_PREBUILT_LEGION} ${SET_USE_PREBUILT_NCCL} ${SET_USE_ALL_PREBUILT_LIBRARIES} ${SET_BUILD_UNIT_TESTS} ${SET_AVX2} ${SET_MAX_DIM} ${SET_LEGION_MAX_RETURN_SIZE} ${SET_ROCM_PATH} ${SET_FF_GPU_BACKEND}"

function run_cmake() {
SRC_LOCATION=${SRC_LOCATION:=`dirname $0`/../}
14 changes: 10 additions & 4 deletions config/config.linux
@@ -36,12 +36,18 @@ FF_CUDA_ARCH=${FF_CUDA_ARCH:-"autodetect"}
# or all available architectures. TODO: support autodetect
FF_HIP_ARCH=${FF_HIP_ARCH:-"all"}

# set CUDNN dir in case cmake cannot autodetect a path
CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}

# set CUDA dir in case cmake cannot autodetect a path
CUDA_DIR=${CUDA_DIR:-"/usr/local/cuda"}

# set CUBLAS dir in case it is not stored in the CUDA DIR
CUBLAS_DIR=${CUBLAS_DIR:-"/usr/local/cuda"}

# set CURAND dir in case it is not stored in the CUDA DIR
CURAND_DIR=${CURAND_DIR:-"/usr/local/cuda"}

# set CUDNN dir in case cmake cannot autodetect a path
CUDNN_DIR=${CUDNN_DIR:-"/usr/local/cuda"}

# if not use PREBUILD_NCCL, you can set NCCL_DIR to use external nccl lib,
# otherwise, we will build nccl from source
NCCL_DIR=${NCCL_DIR:-"/usr/local/cuda"}
@@ -102,7 +108,7 @@ fi

function get_build_configs() {
# Create a string with the values of the variables set in this script
BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDNN_DIR=${CUDNN_DIR} CUDA_DIR=${CUDA_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
BUILD_CONFIGS="FF_CUDA_ARCH=${FF_CUDA_ARCH} FF_HIP_ARCH=${FF_HIP_ARCH} CUDA_DIR=${CUDA_DIR} CUDNN_DIR=${CUDNN_DIR} CUBLAS_DIR=${CUBLAS_DIR} CURAND_DIR=${CURAND_DIR} NCCL_DIR=${NCCL_DIR} FF_USE_PYTHON=${FF_USE_PYTHON} BUILD_LEGION_ONLY=${BUILD_LEGION_ONLY} FF_GASNET_CONDUIT=${FF_GASNET_CONDUIT} UCX_DIR=${UCX_DIR} FF_LEGION_NETWORKS=${FF_LEGION_NETWORKS} FF_BUILD_ALL_EXAMPLES=${FF_BUILD_ALL_EXAMPLES} FF_BUILD_ALL_INFERENCE_EXAMPLES=${FF_BUILD_ALL_INFERENCE_EXAMPLES} FF_BUILD_UNIT_TESTS=${FF_BUILD_UNIT_TESTS} FF_USE_PREBUILT_NCCL=${FF_USE_PREBUILT_NCCL} FF_USE_PREBUILT_LEGION=${FF_USE_PREBUILT_LEGION} FF_USE_ALL_PREBUILT_LIBRARIES=${FF_USE_ALL_PREBUILT_LIBRARIES} FF_USE_AVX2=${FF_USE_AVX2} FF_MAX_DIM=${FF_MAX_DIM} ROCM_PATH=${ROCM_PATH} FF_GPU_BACKEND=${FF_GPU_BACKEND} INSTALL_DIR=${INSTALL_DIR}"
}

if [[ -n "$1" && ( "$1" == "CMAKE_FLAGS" || "$1" == "CUDA_PATH" ) ]]; then
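Combined with the `config.inc` change above, these new variables flow from the shell environment into the CMake flags, so a build can pick up cuBLAS/cuRAND from a non-default location without invoking CMake directly. For example, with placeholder paths:

```
# config.inc translates these env vars into -DCUBLAS_PATH / -DCURAND_PATH.
CUBLAS_DIR=/opt/cublas CURAND_DIR=/opt/curand ./config/config.linux
```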
6 changes: 3 additions & 3 deletions docker/README.md
@@ -7,7 +7,7 @@ You can build and run the FlexFlow Docker images on any machine, but if you want
## Downloading a pre-built package
The fastest way to run FlexFlow is to use one of the pre-built containers, which we update for each commit to the `inference` branch (the `inference` branch is currently ahead of the `master` branch). The available containers are the following, and can be found [at this link](https://github.com/orgs/flexflow/packages?repo_name=FlexFlow):

* `flexflow`: the pre-built version of FlexFlow. We currently publish four version targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5 and 5.6 ), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0). The CUDA images are named `flexflow-<GPU backend>-<GPU software version>`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0) or
* `flexflow`: the pre-built version of FlexFlow. We currently publish four versions targeting AMD GPUs (ROCm versions: 5.3, 5.4, 5.5, and 5.6), and several versions for CUDA GPUs (CUDA versions: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, and 12.2). The images are named `flexflow-<GPU backend>-<GPU software version>`, e.g. [flexflow-hip_rocm-5.6](https://github.com/flexflow/FlexFlow/pkgs/container/flexflow-hip_rocm-5.6) or [flexflow-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-cuda-12.0).
* `flexflow-environment`: this is the base layer for `flexflow`. These packages are used in CI or for internal use, and contain all the dependencies needed to build/run FlexFlow. You may find them useful if you want to build FlexFlow yourself. We also publish four versions of `flexflow-environment` for AMD GPUs and, for NVIDIA GPUs, one for each CUDA version in the list above. The naming convention is similar, too. For example, the `flexflow-environment` image for CUDA 12.0 is tagged [flexflow-environment-cuda-12.0](https://github.com/orgs/flexflow/packages/container/package/flexflow-environment-cuda-12.0).

The easiest way to download any of the Docker containers above is to call:
@@ -19,7 +19,7 @@ The easiest way to download any of the Docker containers above is to call:
where `CONTAINER_NAME` is `flexflow` (or `flexflow-environment`). By default, the script will assume an NVIDIA backend and attempt to detect the CUDA version on your machine, in order to download the relevant container. If your machine has AMD GPUs, or no GPUs, or if you want to specify the CUDA/ROCm version to download, set the environment variables below:

* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be downloaded.
* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored
* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1 and 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored
* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.
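For instance, assuming the elided snippet above invokes `docker/pull.sh` with the container name as its argument, an explicit download of the CUDA 11.8 image might look like:

```
# Download the pre-built flexflow image for CUDA 11.8.
FF_GPU_BACKEND=cuda cuda_version=11.8 ./docker/pull.sh flexflow
```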


@@ -44,7 +44,7 @@ If you only want to build the `flexflow-environment` image (the base layers of t
After having either built or downloaded a Docker container by following the instructions above, you can run it with the following command (the image name argument of the run script can be omitted). Once again, you can set the optional `FF_GPU_BACKEND`, `cuda_version`, and `hip_version` environment variables to run the Docker image with the desired GPU backend and CUDA/HIP version:

* `FF_GPU_BACKEND` (supported options: `cuda`, `hip_rocm`) to specify the GPU backend of the Docker container to be run.
* `cuda_version` (supported options: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7, 11.8 and 12.0) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored
* `cuda_version` (supported options: 11.1, 11.6, 11.7, 11.8, 12.0, 12.1, 12.2) to specify the CUDA version, when using a `cuda` backend. If `FF_GPU_BACKEND` is set to `hip_rocm`, the `cuda_version` env will be ignored
* `hip_version` (supported options: 5.3, 5.4, 5.5, 5.6) to specify the ROCm version, when using a HIP backend. If `FF_GPU_BACKEND` is set to `cuda`, the `hip_version` env will be ignored.

Leaving these variables unset will assume a CUDA backend, and instruct the script to autodetect the CUDA version installed on the current machine and run the Docker container with it, if available.
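As with the pull script, an explicit invocation might look like the following, assuming `docker/run.sh` takes the image name as its argument:

```
# Run the flexflow image built for ROCm 5.6.
FF_GPU_BACKEND=hip_rocm hip_version=5.6 ./docker/run.sh flexflow
```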
12 changes: 6 additions & 6 deletions docker/build.sh
@@ -50,20 +50,20 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
# Check that CUDA version is supported, and modify cuda version to include default subsubversion
if [[ "$cuda_version" == @(11.1|11.3|11.7|12.0|12.1) ]]; then
cuda_version_input=${cuda_version}.1
elif [[ "$cuda_version" == @(11.2|11.5|11.6) ]]; then
elif [[ "$cuda_version" == @(11.2|11.5|11.6|12.2) ]]; then
cuda_version_input=${cuda_version}.2
elif [[ "$cuda_version" == @(11.4) ]]; then
cuda_version_input=${cuda_version}.3
elif [[ "$cuda_version" == @(11.8|12.2) ]]; then
elif [[ "$cuda_version" == @(11.8) ]]; then
cuda_version_input=${cuda_version}.0
else
echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Use CUDA 12.0 for all versions greater or equal to 12.0 for now
if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.0
cuda_version_input=${cuda_version}.1
# Use CUDA 12.2 for all versions greater or equal to 12.2 for now (the Docker machine with CUDNN is not yet available)
if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.2
cuda_version_input=${cuda_version}.2
fi
echo "Building $image docker image with CUDA $cuda_version"
ff_environment_base_image="nvidia/cuda:${cuda_version_input}-cudnn8-devel-ubuntu20.04"
10 changes: 5 additions & 5 deletions docker/pull.sh
@@ -45,13 +45,13 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
fi
fi
# Check that CUDA version is supported
if [[ "$cuda_version" != @(11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
if [[ "$cuda_version" != @(11.1|11.6|11.7|11.8|12.0|12.1|12.2) ]]; then
echo "cuda_version is not available for download, please choose among {11.1|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Use CUDA 12.0 for all versions greater or equal to 12.0 for now
if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.0
# Use CUDA 12.2 for all versions greater or equal to 12.2 for now
if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.2
fi
# Set cuda version suffix to docker image name
echo "Downloading $image docker image with CUDA $cuda_version"
6 changes: 3 additions & 3 deletions docker/run.sh
@@ -62,9 +62,9 @@ if [[ "${FF_GPU_BACKEND}" == "cuda" || "${FF_GPU_BACKEND}" == "hip_cuda" ]]; the
echo "cuda_version is not supported, please choose among {11.1|11.2|11.3|11.4|11.5|11.6|11.7|11.8|12.0|12.1|12.2}"
exit 1
fi
# Use CUDA 12.0 for all versions greater or equal to 12.0 for now
if [[ "$cuda_version" == @(12.1|12.2|12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.0
# Use CUDA 12.2 for all versions greater or equal to 12.2 for now
if [[ "$cuda_version" == @(12.3|12.4|12.5|12.6|12.7|12.8|12.9) ]]; then
cuda_version=12.2
fi
# Set cuda version suffix to docker image name
echo "Running $image docker image with CUDA $cuda_version"
1 change: 1 addition & 0 deletions include/flexflow/batch_config.h
@@ -51,6 +51,7 @@ class BatchConfig {
static int max_requests_per_batch();
static int max_tokens_per_batch();
static int max_verify_tokens_per_batch();
static int max_spec_tree_token_num();
static int max_sequence_length();
friend std::ostream &operator<<(std::ostream &os, BatchConfig const &bc);
void print() const;
2 changes: 1 addition & 1 deletion include/flexflow/config.h
@@ -155,7 +155,7 @@ class FFConfig {
Legion::Runtime *lg_hlr;
Legion::IndexSpaceT<1> all_gpu_task_is;
// Legion::FieldSpace field_space;
bool syntheticInput, profiling, perform_fusion;
bool benchmarking, profiling, perform_fusion;
bool inference_debugging;
size_t simulator_work_space_size;
size_t search_budget;
8 changes: 6 additions & 2 deletions include/flexflow/ffconst.h
@@ -78,6 +78,11 @@ enum InferenceMode {
TREE_VERIFY_MODE = 2003,
};

enum RequestType {
REQ_INFERENCE = 4001,
REQ_FINETUNING = 4002,
};

// This is consistent with TASO's OpType
// https://github.com/jiazhihao/TASO/blob/master/include/taso/ops.h#L75-L138
enum OperatorType {
@@ -179,8 +184,7 @@ enum OperatorType {
OP_TREE_INC_MULTIHEAD_SELF_ATTENTION,
OP_SAMPLING,
// PEFT Ops
OP_LORA_MLP_FIRST,
OP_LORA_MLP_SECOND,
OP_LORA,
// Parallel Ops
OP_REPARTITION,
OP_COMBINE,
(Diffs for the remaining changed files were not loaded.)