diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
index 5a199d5a..bc7fde4e 100644
--- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
+++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -44,7 +44,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e .[testing,tesnsorrt-llm]
+          pip install -e .[testing]
 
       - name: Run tests
         run: |
@@ -57,7 +57,6 @@ jobs:
         }}
         name: Run examples
         run: |
-          rm -rf /root/.cache/huggingface
           pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
 
   cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +83,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          pip install -e .[testing,tesnsorrt-llm]
+          pip install -e .[testing]
 
       - name: Run tests (sequential)
         run: |
diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index ac5bcdc3..a32060b1 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,12 +16,12 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: false # investigate later
+  no_weights: true
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 
 scenario:
   input_shapes:
-    batch_size: 4
+    batch_size: 1
     sequence_length: 64
 
   generate_kwargs:
diff --git a/examples/cuda_trt_llama.yaml b/examples/cuda_trt_llama.yaml
index 501280e8..eec83232 100644
--- a/examples/cuda_trt_llama.yaml
+++ b/examples/cuda_trt_llama.yaml
@@ -23,7 +23,7 @@ backend:
 
 scenario:
   input_shapes:
-    batch_size: 4
+    batch_size: 1
     sequence_length: 64
 
   generate_kwargs:
diff --git a/examples/cuda_vllm_llama.yaml b/examples/cuda_vllm_llama.yaml
index 4a624cc1..4f3aadd6 100644
--- a/examples/cuda_vllm_llama.yaml
+++ b/examples/cuda_vllm_llama.yaml
@@ -23,7 +23,7 @@ backend:
 
 scenario:
   input_shapes:
-    batch_size: 4
+    batch_size: 1
     sequence_length: 64
 
   generate_kwargs:
diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py
index 60b82675..ee982bb1 100644
--- a/optimum_benchmark/backends/tensorrt_llm/backend.py
+++ b/optimum_benchmark/backends/tensorrt_llm/backend.py
@@ -55,7 +55,6 @@ def download_pretrained_model(self) -> None:
     def prepare_generation_config(self) -> None:
         self.generation_config.eos_token_id = None
         self.generation_config.pad_token_id = None
-
         model_cache_folder = f"models/{self.config.model}".replace("/", "--")
         model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}"
         snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}"
@@ -91,7 +90,6 @@ def create_no_weights_model(self) -> None:
         self.logger.info("\t+ Modifying generation config for fixed length generation")
         self.generation_config.eos_token_id = None
         self.generation_config.pad_token_id = None
-        self.logger.info("\t+ Saving new pretrained generation config")
         self.generation_config.save_pretrained(save_directory=self.no_weights_model)
 
     def load_trtllm_with_no_weights(self) -> None:
@@ -128,6 +126,12 @@ def trtllm_kwargs(self):
         if self.config.gpus_per_node is not None:
             kwargs["gpus_per_node"] = self.config.gpus_per_node
 
+        if self.config.max_input_len is not None:
+            kwargs["max_input_len"] = self.config.max_input_len
+
+        if self.config.max_output_len is not None:
+            kwargs["max_output_len"] = self.config.max_output_len
+
         if self.config.max_batch_size is not None:
             kwargs["max_batch_size"] = self.config.max_batch_size
 
diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py
index 2497d5d4..a191ae78 100644
--- a/optimum_benchmark/backends/tensorrt_llm/config.py
+++ b/optimum_benchmark/backends/tensorrt_llm/config.py
@@ -22,6 +22,8 @@ class TRTLLMConfig(BackendConfig):
     use_fp8: Optional[bool] = None
     world_size: Optional[int] = None
     gpus_per_node: Optional[int] = None
+    max_input_len: Optional[int] = None
+    max_output_len: Optional[int] = None
     max_batch_size: Optional[int] = None
     max_new_tokens: Optional[int] = None
     max_prompt_length: Optional[int] = None
diff --git a/optimum_benchmark/task_utils.py b/optimum_benchmark/task_utils.py
index 51be2ee5..7c066d14 100644
--- a/optimum_benchmark/task_utils.py
+++ b/optimum_benchmark/task_utils.py
@@ -242,13 +242,13 @@ def infer_task_from_model_name_or_path(
     elif library_name == "timm":
         inferred_task_name = "image-classification"
 
-    elif library_name == "diffusers":
-        diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
-        target_class_name = diffusers_config["_class_name"]
+    elif library_name == "transformers":
+        transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
+        target_class_name = transformers_config["architectures"][0]
 
-        for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
-            for _, pipeline_class_name in pipeline_mapping.items():
-                if target_class_name == pipeline_class_name:
+        for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
+            for _, model_class_name in model_mapping.items():
+                if target_class_name == model_class_name:
                     inferred_task_name = task_name
                     break
             if inferred_task_name is not None:
@@ -257,13 +257,13 @@
         if inferred_task_name is None:
             raise KeyError(f"Could not find the proper task name for target class name {target_class_name}.")
 
-    elif library_name == "transformers":
-        transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
-        target_class_name = transformers_config["architectures"][0]
+    elif library_name == "diffusers":
+        diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
+        target_class_name = diffusers_config["_class_name"]
 
-        for task_name, model_mapping in TASKS_TO_MODEL_TYPES_TO_MODEL_CLASS_NAMES.items():
-            for _, model_class_name in model_mapping.items():
-                if target_class_name == model_class_name:
+        for task_name, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
+            for _, pipeline_class_name in pipeline_mapping.items():
+                if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
                     inferred_task_name = task_name
                     break
             if inferred_task_name is not None:
@@ -293,13 +293,17 @@ def infer_model_type_from_model_name_or_path(
         timm_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
         inferred_model_type = timm_config["architecture"]
 
+    elif library_name == "transformers":
+        transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
+        inferred_model_type = transformers_config["model_type"]
+
     elif library_name == "diffusers":
         diffusers_config = get_repo_config(model_name_or_path, "model_index.json", token=token, revision=revision)
         target_class_name = diffusers_config["_class_name"]
 
         for _, pipeline_mapping in TASKS_TO_PIPELINE_TYPES_TO_PIPELINE_CLASS_NAMES.items():
             for pipeline_type, pipeline_class_name in pipeline_mapping.items():
-                if target_class_name == pipeline_class_name:
+                if target_class_name == pipeline_class_name or (pipeline_class_name in target_class_name):
                     inferred_model_type = pipeline_type
                     break
             if inferred_model_type is not None:
@@ -308,8 +312,4 @@
         if inferred_model_type is None:
             raise KeyError(f"Could not find the proper model type for target class name {target_class_name}.")
 
-    elif library_name == "transformers":
-        transformers_config = get_repo_config(model_name_or_path, "config.json", token=token, revision=revision)
-        inferred_model_type = transformers_config["model_type"]
-
     return inferred_model_type
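Note (not part of the patch): a minimal sketch of how the new max_input_len / max_output_len options could be used, assuming they are set under the backend section of an example config such as examples/cuda_trt_llama.yaml, alongside the other TRTLLMConfig fields; the values and the model name below are illustrative only.

backend:
  device: cuda
  device_ids: 0
  model: TinyLlama/TinyLlama-1.1B-Chat-v1.0  # illustrative model, taken from the TGI example above
  max_input_len: 64    # new field from this patch; assumed to belong under backend:
  max_output_len: 32   # new field from this patch; value is illustrative

scenario:
  input_shapes:
    batch_size: 1
    sequence_length: 64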