Update LLAMA tokenizer (#1524)
* fix tokenizer conversion

* update

* update

* update

* fix

* fix

* lint

* simplify api

* fix

* fix

* fix

* update to 12.1 (#1512)

* fix deadlock?

* remove barrier where not strictly needed

---------

Co-authored-by: zhihao <email>
goliaro authored Oct 19, 2024
1 parent dbd4cf1 commit 2bfa56c
Showing 28 changed files with 378 additions and 155 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/gpu-ci.yml
@@ -56,7 +56,7 @@ jobs:
CONDA: "3"
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Keep alive
@@ -75,7 +75,7 @@ jobs:
CONDA: "3"
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -151,7 +151,7 @@ jobs:
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
needs: gpu-ci-concierge
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
@@ -239,7 +239,7 @@ jobs:
CONDA: "3"
needs: inference-tests
container:
-image: ghcr.io/flexflow/flexflow-environment-cuda-11.8:latest
+image: ghcr.io/flexflow/flexflow-environment-cuda-12.1:latest
options: --gpus all --shm-size=8192m
steps:
- name: Install updated git version
11 changes: 6 additions & 5 deletions cmake/nccl.cmake
@@ -36,11 +36,12 @@ if(NCCL_LIBRARY AND NCCL_INCLUDE_DIR)
string(REGEX MATCH "([0-9]+)" NCCL_MAJOR ${NCCL_VERSION_DEFINES})
string(REGEX MATCH "([0-9]+)" NCCL_MINOR ${NCCL_VERSION_DEFINES2})
set(NCCL_VERSION "${NCCL_MAJOR}.${NCCL_MINOR}")
-if(NCCL_VERSION VERSION_LESS 2.23)
-set(NCCL_OLD TRUE)
-else()
-set(NCCL_OLD FALSE)
-endif()
+set(NCCL_OLD FALSE)
+# if(NCCL_VERSION VERSION_LESS 2.23)
+# set(NCCL_OLD TRUE)
+# else()
+# set(NCCL_OLD FALSE)
+# endif()
message(STATUS "Found NCCL version: ${NCCL_VERSION}")
else()
message(WARNING "NCCL header not found, unable to determine version")
24 changes: 12 additions & 12 deletions docker/flexflow-environment/Dockerfile
@@ -55,18 +55,18 @@ ENV CUDA_DIR /usr/local/cuda
ARG FF_GPU_BACKEND "cuda"

# Update NCCL if FF_GPU_BACKEND is cuda
-RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
-ubuntu_version=$(lsb_release -rs); \
-ubuntu_version=${ubuntu_version//./}; \
-wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
-DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
-rm -f cuda-keyring_1.0-1_all.deb; \
-DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
-else \
-echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
-fi'
+# RUN /bin/bash -c 'if [ "$FF_GPU_BACKEND" = "cuda" ]; then \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Updating NCCL"; \
+# ubuntu_version=$(lsb_release -rs); \
+# ubuntu_version=${ubuntu_version//./}; \
+# wget "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu${ubuntu_version}/x86_64/cuda-keyring_1.0-1_all.deb"; \
+# DEBIAN_FRONTEND=noninteractive dpkg -i cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt-get update -y --allow-change-held-packages; \
+# rm -f cuda-keyring_1.0-1_all.deb; \
+# DEBIAN_FRONTEND=noninteractive apt install -y --allow-change-held-packages libnccl2 libnccl-dev; \
+# else \
+# echo "FF_GPU_BACKEND: ${FF_GPU_BACKEND}. Skipping updating NCCL"; \
+# fi'

# Install hip dependencies if FF_GPU_BACKEND is hip_cuda or hip_rocm
# Note that amd's docs say to also install the `hip-runtime-nvidia` package. This
11 changes: 8 additions & 3 deletions include/flexflow/ops/kernels/lora_linear_kernels.h
@@ -8,7 +8,8 @@
#include "flexflow/ops/lora_linear.h"

namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
struct LoraLinearWeight {
// weights
void *w0_ptr, *w1_ptr;
Expand Down Expand Up @@ -46,7 +47,9 @@ void inference_kernel_wrapper(LoraLinearMeta *m,
BatchConfig const *bc,
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
-void peft_bwd_kernel_wrapper(LoraLinearMeta *m,
+void peft_bwd_kernel_wrapper(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
BatchConfig const *bc,
GenericTensorAccessorW const &input_grad,
GenericTensorAccessorR const &output_grad);
@@ -63,7 +66,9 @@ void inference_kernel(LoraLinearMeta *m,
int out_dim,
ffStream_t stream);
template <typename DT>
-void peft_bwd_kernel(LoraLinearMeta *m,
+void peft_bwd_kernel(Context ctx,
+Runtime *runtime,
+LoraLinearMeta *m,
BatchConfig const *bc,
DT *input_grad_ptr,
DT const *output_grad_ptr,
11 changes: 8 additions & 3 deletions include/flexflow/optimizer.h
@@ -20,7 +20,8 @@
#include "legion.h"

namespace FlexFlow {
-
+using Legion::Context;
+using Legion::Runtime;
class FFModel;
class OpMeta;

@@ -60,7 +61,9 @@ class SGDOptimizer : public Optimizer {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
-static void nccl_update_task_gpu(SGDOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+SGDOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
@@ -103,7 +106,9 @@ class AdamOptimizer : public Optimizer {
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
-static void nccl_update_task_gpu(AdamOptimizer const *op,
+static void nccl_update_task_gpu(Context ctx,
+Runtime *runtime,
+AdamOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
3 changes: 2 additions & 1 deletion include/flexflow/request_manager.h
@@ -68,7 +68,7 @@ struct Request {
BatchConfig::RequestGuid guid;
PEFTModelID peft_model_id = PEFTModelID::NO_ID;
int max_length = -1;
-int max_new_tokens = 128;
+int max_new_tokens = -1;
int initial_len;
int ssm_cache_size = 0;
int llm_cache_size = 0;
@@ -302,6 +302,7 @@ class RequestManager {
ModelType model_type;
int bos_token_id;
int eos_token_id;
+bool old_llama_tokenizer = false;
std::string output_filepath;
std::queue<Request> pending_infr_request_queue;
std::queue<Request> pending_peft_request_queue;
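
Note on the max_new_tokens change above: the per-request default drops from 128 to -1, which presumably means "not set", so a request no longer gets an implicit 128-new-token cap and callers are expected to state a generation budget explicitly, as the updated examples later in this commit do. A minimal sketch of the user-facing effect through the Python serving API (illustration only, not part of the diff; it assumes an llm object built and started as in the inference scripts below):

# Sketch: the generation budget is now stated explicitly per call.
# Assumes `llm` is a flexflow.serve LLM with start_server() already called.
result = llm.generate("Three tips for staying healthy are: ", max_length=128)
# The C++ path sets the equivalent field on the request directly, mirroring
# inference_req.max_new_tokens = 128 in inference/peft/peft.cc below.
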
2 changes: 1 addition & 1 deletion inference/peft/peft.cc
@@ -340,7 +340,7 @@ void FlexFlow::top_level_task(Task const *task,
printf("Inference prompt[%d]: %s\n", total_num_requests, text.c_str());
Request inference_req;
inference_req.prompt = text;
-inference_req.max_length = 128;
+inference_req.max_new_tokens = 128;
inference_req.peft_model_id =
(peft_model_id != nullptr) ? *peft_model_id : PEFTModelID::NO_ID;
requests.push_back(inference_req);
3 changes: 1 addition & 2 deletions inference/python/ff_peft.py
@@ -162,7 +162,7 @@ def main():
ff.Request(
ff.RequestType.REQ_INFERENCE,
prompt=prompt,
-max_sequence_length=128,
+max_new_tokens=128,
peft_model_id=llm.get_ff_peft_id(lora_inference_config),
)
for prompt in prompts
@@ -172,7 +172,6 @@
if len(configs.finetuning_dataset) > 0:
finetuning_request = ff.Request(
ff.RequestType.REQ_FINETUNING,
-max_sequence_length=128,
peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
dataset_filepath=configs.finetuning_dataset,
max_training_steps=2,
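
Restating the ff_peft.py change above as a standalone sketch: inference requests now bound generation with max_new_tokens instead of max_sequence_length, and the finetuning request drops the length argument entirely. Names such as llm, prompts, configs and the two LoRA configs are taken from the surrounding script; everything else is illustrative rather than prescriptive.

import flexflow.serve as ff

# Inference: cap the number of newly generated tokens per prompt.
inference_requests = [
    ff.Request(
        ff.RequestType.REQ_INFERENCE,
        prompt=prompt,
        max_new_tokens=128,  # was max_sequence_length=128 before this commit
        peft_model_id=llm.get_ff_peft_id(lora_inference_config),
    )
    for prompt in prompts
]

# Finetuning: driven by the dataset and training-step count, no length cap.
finetuning_request = ff.Request(
    ff.RequestType.REQ_FINETUNING,
    peft_model_id=llm.get_ff_peft_id(lora_finetuning_config),
    dataset_filepath=configs.finetuning_dataset,
    max_training_steps=2,
)
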
17 changes: 10 additions & 7 deletions inference/python/incr_decoding.py
@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -71,6 +71,7 @@ def get_configs():
"full_precision": False,
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -106,9 +107,9 @@ def main():
max_seq_length=256,
max_tokens_per_batch=64,
)

llm.start_server()

if len(configs.prompt) > 0:
prompts = [s for s in json.load(open(configs.prompt))]
if "max_length" not in configs_dict:
@@ -119,8 +120,10 @@
if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate("Three tips for staying healthy are: ", max_length=configs.max_length)

+result = llm.generate(
+"Three tips for staying healthy are: ", max_length=configs.max_length
+)

llm.stop_server()


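
The max_length plumbing above follows one pattern in both driver scripts: a max_length entry is added to the default config, and generate() only receives a max_length argument when the config dictionary actually contains one, so the library default applies otherwise. A condensed, hedged sketch of that pattern (configs_dict, configs, llm and the json import all come from the script above):

# Forward max_length only when it is present in the config; otherwise let
# llm.generate() fall back to its own default.
gen_kwargs = {}
if "max_length" in configs_dict:
    gen_kwargs["max_length"] = configs.max_length

if len(configs.prompt) > 0:
    prompts = [s for s in json.load(open(configs.prompt))]
    results = llm.generate(prompts, **gen_kwargs)
else:
    result = llm.generate("Three tips for staying healthy are: ", **gen_kwargs)
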
24 changes: 17 additions & 7 deletions inference/python/spec_infer.py
@@ -51,12 +51,12 @@ def get_configs():
"tensor_parallelism_degree": 1,
"pipeline_parallelism_degree": 2,
"offload": False,
"offload_reserve_space_size": 8 * 1024, # 8GB
"offload_reserve_space_size": 8 * 1024, # 8GB
"use_4bit_quantization": False,
"use_8bit_quantization": False,
"enable_peft": False,
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"peft_activation_reserve_space_size": 1024, # 1GB
"peft_weight_reserve_space_size": 1024, # 1GB
"profiling": False,
"benchmarking": False,
"inference_debugging": False,
@@ -81,6 +81,7 @@
],
"prompt": "",
"output_file": "",
"max_length": 128,
}
# Merge dictionaries
ff_init_configs.update(llm_configs)
@@ -144,17 +145,26 @@ def main():
max_tokens_per_batch=64,
ssms=ssms,
)

llm.start_server()

if len(configs.prompt) > 0:
prompts = [s for s in json.load(open(configs.prompt))]
-results = llm.generate(prompts)
+if "max_length" not in configs_dict:
+results = llm.generate(prompts)
+else:
+results = llm.generate(prompts, max_length=configs.max_length)
else:
result = llm.generate("Three tips for staying healthy are: ")

if "max_length" not in configs_dict:
result = llm.generate("Three tips for staying healthy are: ")
else:
result = llm.generate(
"Three tips for staying healthy are: ", max_length=configs.max_length
)

llm.stop_server()


if __name__ == "__main__":
print("flexflow inference example (speculative inference)")
main()