Fix trt llm (#308)
IlyasMoutawwakil authored Dec 12, 2024
1 parent 4c23a0d commit f12f642
Showing 7 changed files with 31 additions and 26 deletions.
5 changes: 2 additions & 3 deletions .github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -44,7 +44,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,tesnsorrt-llm]
pip install -e .[testing]
- name: Run tests
run: |
@@ -57,7 +57,6 @@ jobs:
}}
name: Run examples
run: |
rm -rf /root/.cache/huggingface
pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +83,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing,tesnsorrt-llm]
pip install -e .[testing]
- name: Run tests (sequential)
run: |
4 changes: 2 additions & 2 deletions examples/cuda_tgi_llama.yaml
@@ -16,12 +16,12 @@ backend:
device: cuda
device_ids: 0
cuda_graphs: 0 # remove for better perf but bigger memory footprint
no_weights: false # investigate later
no_weights: true
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
2 changes: 1 addition & 1 deletion examples/cuda_trt_llama.yaml
@@ -23,7 +23,7 @@ backend:

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
2 changes: 1 addition & 1 deletion examples/cuda_vllm_llama.yaml
@@ -23,7 +23,7 @@ backend:

scenario:
input_shapes:
batch_size: 4
batch_size: 1
sequence_length: 64

generate_kwargs:
8 changes: 6 additions & 2 deletions optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -55,7 +55,6 @@ def download_pretrained_model(self) -> None:
def prepare_generation_config(self) -> None:
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None

model_cache_folder = f"models/{self.config.model}".replace("/", "--")
model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}"
snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}"
@@ -91,7 +90,6 @@ def create_no_weights_model(self) -> None:
self.logger.info("\t+ Modifying generation config for fixed length generation")
self.generation_config.eos_token_id = None
self.generation_config.pad_token_id = None
self.logger.info("\t+ Saving new pretrained generation config")
self.generation_config.save_pretrained(save_directory=self.no_weights_model)

def load_trtllm_with_no_weights(self) -> None:
@@ -128,6 +126,12 @@ def trtllm_kwargs(self):
if self.config.gpus_per_node is not None:
kwargs["gpus_per_node"] = self.config.gpus_per_node

if self.config.max_input_len is not None:
kwargs["max_input_len"] = self.config.max_input_len

if self.config.max_output_len is not None:
kwargs["max_output_len"] = self.config.max_output_len

if self.config.max_batch_size is not None:
kwargs["max_batch_size"] = self.config.max_batch_size

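Note on the trtllm_kwargs change above: engine options are only forwarded when they are explicitly set in the backend config, so anything left unset falls back to the engine's own defaults. A minimal, self-contained sketch of that pattern (the helper below is illustrative, not the actual optimum-benchmark code):

from typing import Any, Dict, Optional


def build_trtllm_kwargs(
    max_input_len: Optional[int] = None,
    max_output_len: Optional[int] = None,
    max_batch_size: Optional[int] = None,
) -> Dict[str, Any]:
    # collect only the engine options that were explicitly provided
    kwargs: Dict[str, Any] = {}
    if max_input_len is not None:
        kwargs["max_input_len"] = max_input_len
    if max_output_len is not None:
        kwargs["max_output_len"] = max_output_len
    if max_batch_size is not None:
        kwargs["max_batch_size"] = max_batch_size
    return kwargs


print(build_trtllm_kwargs(max_input_len=1024))  # {'max_input_len': 1024}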
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/tensorrt_llm/config.py
@@ -22,6 +22,8 @@ class TRTLLMConfig(BackendConfig):
use_fp8: Optional[bool] = None
world_size: Optional[int] = None
gpus_per_node: Optional[int] = None
max_input_len: Optional[int] = None
max_output_len: Optional[int] = None
max_batch_size: Optional[int] = None
max_new_tokens: Optional[int] = None
max_prompt_length: Optional[int] = None
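Because the two new fields added above default to None, configs written before this change stay valid and simply leave the new limits unset. A hedged sketch of that behavior (ToyTRTLLMConfig is an illustrative stand-in for TRTLLMConfig, reduced to the fields discussed here):

from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyTRTLLMConfig:
    max_input_len: Optional[int] = None   # new field, optional
    max_output_len: Optional[int] = None  # new field, optional
    max_batch_size: Optional[int] = None  # pre-existing field


# an older config that never mentions the new fields still loads
legacy = ToyTRTLLMConfig(max_batch_size=1)
print(legacy.max_input_len, legacy.max_output_len)  # None None -> kwargs omitted downstream

# a newer config can cap input and output lengths explicitly
capped = ToyTRTLLMConfig(max_input_len=64, max_output_len=128, max_batch_size=1)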
34 changes: 17 additions & 17 deletions optimum_benchmark/task_utils.py
@@ -242,13 +242,13 @@ def infer_task_from_model_name_or_path(
elif library_name == "timm":
inferred_task_name = "image-classification"

elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]
elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
target_class_name = transformers_config["architectures"][0]

for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for _, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name:
for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
for _, model_class_name in model_mapping.items():
if target_class_name == model_class_name:
inferred_task_name = task_name
break
if inferred_task_name is not None:
if inferred_task_name is None:
raise KeyError(f"Could not find the proper task name for target class name {target_class_name}.")

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
target_class_name = transformers_config["architectures"][0]
elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]

for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
for _, model_class_name in model_mapping.items():
if target_class_name == model_class_name:
for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for _, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
inferred_task_name = task_name
break
if inferred_task_name is not None:
@@ -293,13 +293,17 @@ def infer_model_type_from_model_name_or_path(
timm_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = timm_config["architecture"]

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = transformers_config["model_type"]

elif library_name == "diffusers":
diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
target_class_name = diffusers_config["_class_name"]

for _, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
for pipeline_type, pipeline_class_name in pipeline_mapping.items():
if target_class_name == pipeline_class_name:
if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
inferred_model_type = pipeline_type
break
if inferred_model_type is not None:
if inferred_model_type is None:
raise KeyError(f"Could not find the proper model type for target class name {target_class_name}.")

elif library_name == "transformers":
transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
inferred_model_type = transformers_config["model_type"]

return inferred_model_type
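The diffusers branches above now accept a substring match (pipeline_class_name in target_class_name) in addition to an exact match, so pipeline variants whose class names embed a known pipeline name still resolve to a task or pipeline type. A small sketch of that lookup (the mapping below is a toy stand-in for TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES):

from typing import Dict, Optional

# toy stand-in for TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES
TOY_PIPELINE_MAPPING: Dict[str, Dict[str, str]] = {
    "text-to-image": {"stable-diffusion": "StableDiffusionPipeline"},
}


def infer_task(target_class_name: str) -> Optional[str]:
    for task_name, pipeline_mapping in TOY_PIPELINE_MAPPING.items():
        for _, pipeline_class_name in pipeline_mapping.items():
            # exact match, or a known class name embedded in a variant's class name
            if target_class_name == pipeline_class_name or pipeline_class_name in target_class_name:
                return task_name
    return None


print(infer_task("StableDiffusionPipeline"))      # text-to-image (exact match)
print(infer_task("FlaxStableDiffusionPipeline"))  # text-to-image (substring match)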
