From 4c23a0dd5935e7600e61c861bcc6679e75f1084b Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:32:53 +0100 Subject: [PATCH] Optional backend kwargs (#307) --- .github/workflows/test_api_rocm.yaml | 1 + .../workflows/test_cli_cuda_tensorrt_llm.yaml | 6 +- .github/workflows/test_cli_rocm_pytorch.yaml | 2 + examples/cpu_ipex_bert.yaml | 4 +- examples/cpu_ipex_llama.yaml | 4 +- examples/cpu_onnxruntime_timm.yaml | 20 --- examples/cpu_openvino_8bit_bert.yaml | 5 +- examples/cpu_openvino_diffusion.yaml | 5 + examples/cuda_tgi_llama.yaml | 1 + examples/cuda_trt_llama.yaml | 1 + examples/cuda_vllm_llama.yaml | 3 +- examples/mps_pytorch_bert.yaml | 12 +- optimum_benchmark/backends/base.py | 7 - optimum_benchmark/backends/ipex/backend.py | 39 ++--- optimum_benchmark/backends/ipex/config.py | 8 +- optimum_benchmark/backends/ipex/utils.py | 2 +- .../backends/llama_cpp/backend.py | 17 +- .../backends/llama_cpp/config.py | 2 + .../backends/onnxruntime/backend.py | 80 ++++----- .../backends/onnxruntime/config.py | 16 +- .../backends/onnxruntime/utils.py | 17 +- .../backends/openvino/backend.py | 150 ++++------------- optimum_benchmark/backends/openvino/config.py | 31 ++-- optimum_benchmark/backends/openvino/utils.py | 12 +- optimum_benchmark/backends/py_txi/config.py | 2 +- .../backends/tensorrt_llm/backend.py | 158 +++++++++++++----- .../backends/tensorrt_llm/config.py | 40 +++-- .../backends/tensorrt_llm/utils.py | 2 +- .../backends/torch_ort/backend.py | 14 +- .../backends/torch_ort/config.py | 5 +- optimum_benchmark/backends/vllm/backend.py | 31 ++-- optimum_benchmark/backends/vllm/config.py | 8 - .../scenarios/inference/scenario.py | 5 +- setup.py | 2 +- tests/configs/_diffusers_.yaml | 11 +- tests/configs/_export_.yaml | 2 + tests/configs/_inc_quant_.yaml | 3 - ...al_compressor_inc_quant_text_decoders.yaml | 12 -- ...al_compressor_inc_quant_text_encoders.yaml | 12 -- .../cpu_inference_onnxruntime_diffusers.yaml | 1 + .../cpu_inference_onnxruntime_ort_quant.yaml | 1 + ...u_inference_onnxruntime_text_decoders.yaml | 1 + ...u_inference_onnxruntime_text_encoders.yaml | 1 + ...ce_onnxruntime_text_encoders_decoders.yaml | 1 + .../cpu_inference_onnxruntime_timm.yaml | 1 + ...a_inference_onnxruntime_text_decoders.yaml | 1 + ...a_inference_onnxruntime_text_encoders.yaml | 1 + 47 files changed, 350 insertions(+), 410 deletions(-) delete mode 100644 examples/cpu_onnxruntime_timm.yaml create mode 100644 tests/configs/_export_.yaml delete mode 100644 tests/configs/_inc_quant_.yaml delete mode 100644 tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml delete mode 100644 tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml index f6f20aa4..5c795cbd 100644 --- a/.github/workflows/test_api_rocm.yaml +++ b/.github/workflows/test_api_rocm.yaml @@ -33,6 +33,7 @@ jobs: with: machine_type: single-gpu install_extras: testing,timm,diffusers,codecarbon + test_file: test_api.py pytest_keywords: api and cuda secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml index c75aac92..5a199d5a 100644 --- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml @@ -44,7 +44,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing] + pip install -e 
.[testing,tensorrt-llm] - name: Run tests run: | @@ -57,7 +57,7 @@ }} name: Run examples run: | - huggingface-cli delete-cache + rm -rf /root/.cache/huggingface pytest tests/test_examples.py -x -s -k "cli and cuda and trt" cli_cuda_tensorrt_llm_multi_gpu_tests: @@ -84,7 +84,7 @@ - name: Install dependencies run: | - pip install -e .[testing] + pip install -e .[testing,tensorrt-llm] - name: Run tests (sequential) run: | diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index 3057b726..47008b46 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -35,6 +35,7 @@ jobs: with: machine_type: single-gpu install_extras: testing,diffusers,timm,peft,autoawq,auto-gptq + test_file: test_cli.py pytest_keywords: cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb run_cli_rocm_pytorch_multi_gpu_tests: @@ -52,4 +53,5 @@ jobs: with: machine_type: multi-gpu install_extras: testing,diffusers,timm,peft + test_file: test_cli.py pytest_keywords: cli and cuda and pytorch and (dp or ddp or device_map) diff --git a/examples/cpu_ipex_bert.yaml b/examples/cpu_ipex_bert.yaml index 0e7ed37b..1974e9c3 100644 --- a/examples/cpu_ipex_bert.yaml +++ b/examples/cpu_ipex_bert.yaml @@ -17,8 +17,8 @@ launcher: backend: device: cpu export: true - no_weights: false # because on multi-node machines, intializing weights could harm performance - torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance + torch_dtype: float32 # use bfloat16 on compatible Intel CPUs model: google-bert/bert-base-uncased scenario: diff --git a/examples/cpu_ipex_llama.yaml b/examples/cpu_ipex_llama.yaml index 898ed0df..50e23c55 100644 --- a/examples/cpu_ipex_llama.yaml +++ b/examples/cpu_ipex_llama.yaml @@ -17,8 +17,8 @@ launcher: backend: device: cpu export: true - no_weights: false # because on multi-node machines, intializing weights could harm performance - torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance + torch_dtype: float32 # use bfloat16 on compatible Intel CPUs model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: diff --git a/examples/cpu_onnxruntime_timm.yaml b/examples/cpu_onnxruntime_timm.yaml deleted file mode 100644 index 963f44f0..00000000 --- a/examples/cpu_onnxruntime_timm.yaml +++ /dev/null @@ -1,20 +0,0 @@ -defaults: - - benchmark - - backend: onnxruntime - - scenario: inference - - launcher: process - - _base_ - - _self_ - -name: onnxruntime_timm - -backend: - device: cpu - export: true - model: timm/tiny_vit_21m_224.in1k - -scenario: - memory: true - latency: true - input_shapes: - batch_size: 2 diff --git a/examples/cpu_openvino_8bit_bert.yaml b/examples/cpu_openvino_8bit_bert.yaml index 73ef474d..a3c33327 100644 --- a/examples/cpu_openvino_8bit_bert.yaml +++ b/examples/cpu_openvino_8bit_bert.yaml @@ -12,8 +12,11 @@ backend: device: cpu reshape: true no_weights: true - load_in_8bit: false # enable 8bit on compatible Intel CPU machines + load_in_8bit: true model: google-bert/bert-base-uncased + reshape_kwargs: + batch_size: 1 + sequence_length: 128 scenario: memory: true diff --git a/examples/cpu_openvino_diffusion.yaml b/examples/cpu_openvino_diffusion.yaml index 30d21935..37b4ee71 100644 --- a/examples/cpu_openvino_diffusion.yaml +++ 
b/examples/cpu_openvino_diffusion.yaml @@ -11,9 +11,14 @@ name: openvino_diffusion backend: device: cpu export: true + task: text-to-image model: stabilityai/stable-diffusion-2-1 half: false # enable half-precision on compatible Intel CPU machines scenario: input_shapes: batch_size: 1 + sequence_length: 16 + + call_kwargs: + num_inference_steps: 4 diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index 297403c8..ac5bcdc3 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -16,6 +16,7 @@ backend: device: cuda device_ids: 0 cuda_graphs: 0 # remove for better perf but bigger memory footprint + no_weights: false # investigate later model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: diff --git a/examples/cuda_trt_llama.yaml b/examples/cuda_trt_llama.yaml index c483fc2f..501280e8 100644 --- a/examples/cuda_trt_llama.yaml +++ b/examples/cuda_trt_llama.yaml @@ -15,6 +15,7 @@ launcher: backend: device: cuda device_ids: 0 + no_weights: true max_batch_size: 4 max_new_tokens: 32 max_prompt_length: 64 diff --git a/examples/cuda_vllm_llama.yaml b/examples/cuda_vllm_llama.yaml index 5ec4b5a8..4a624cc1 100644 --- a/examples/cuda_vllm_llama.yaml +++ b/examples/cuda_vllm_llama.yaml @@ -15,7 +15,8 @@ launcher: backend: device: cuda device_ids: 0 - serving_mode: online # server-like + no_weights: true + serving_mode: online model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 engine_args: enforce_eager: true # remove for better perf but bigger memory footprint diff --git a/examples/mps_pytorch_bert.yaml b/examples/mps_pytorch_bert.yaml index 27368eb1..f805abed 100644 --- a/examples/mps_pytorch_bert.yaml +++ b/examples/mps_pytorch_bert.yaml @@ -8,14 +8,14 @@ defaults: name: mps_pytorch_bert +backend: + device: mps + no_weights: true + model: bert-base-uncased + scenario: - latency: true memory: true + latency: true input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: mps - no_weights: true - model: bert-base-uncased diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 78d0bef7..f79ea1a2 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -106,13 +106,6 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving no weights model's config") self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: - """ - This method is used to prepare and register the input shapes before using them by the model. - It can be used to pad the inputs to the correct shape, or compile it to the correct format. - """ - return input_shapes - def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ This method is used to prepare and register the inputs before passing them to the model. 
diff --git a/optimum_benchmark/backends/ipex/backend.py b/optimum_benchmark/backends/ipex/backend.py index 7e4983a9..a637ced0 100644 --- a/optimum_benchmark/backends/ipex/backend.py +++ b/optimum_benchmark/backends/ipex/backend.py @@ -9,7 +9,7 @@ from ..base import Backend from ..transformers_utils import fast_weights_init from .config import IPEXConfig -from .utils import TASKS_TO_IPEXMODEL +from .utils import TASKS_TO_IPEXMODELS if is_accelerate_available(): from accelerate import Accelerator @@ -24,8 +24,8 @@ class IPEXBackend(Backend[IPEXConfig]): def __init__(self, config: IPEXConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_IPEXMODEL: - self.ipexmodel_class = get_class(TASKS_TO_IPEXMODEL[self.config.task]) + if self.config.task in TASKS_TO_IPEXMODELS: + self.ipexmodel_class = get_class(TASKS_TO_IPEXMODELS[self.config.task]) self.logger.info(f"\t+ Using IPEXModel class {self.ipexmodel_class.__name__}") else: raise NotImplementedError(f"IPEXBackend does not support task {self.config.task}") @@ -38,48 +38,35 @@ def load(self) -> None: self.logger.info("\t+ Creating no weights IPEXModel") self.create_no_weights_model() self.logger.info("\t+ Loading no weights IPEXModel") - self._load_ipexmodel_with_no_weights() + self.load_ipexmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained IPEXModel") - self._load_ipexmodel_from_pretrained() + self.load_ipexmodel_from_pretrained() self.tmpdir.cleanup() - def _load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def _load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self._load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def _load_ipexmodel_from_pretrained(self) -> None: + def load_ipexmodel_from_pretrained(self) -> None: self.pretrained_model = self.ipexmodel_class.from_pretrained( self.config.model, - export=self.config.export, **self.config.model_kwargs, - **self.automodel_kwargs, + **self.ipexmodel_kwargs, ) - def _load_ipexmodel_with_no_weights(self) -> None: + def load_ipexmodel_with_no_weights(self) -> None: with fast_weights_init(): original_model, self.config.model = self.config.model, self.no_weights_model original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights IPEXModel") - self._load_ipexmodel_from_pretrained() + self.load_ipexmodel_from_pretrained() self.config.export = original_export self.config.model = original_model @property - def automodel_kwargs(self) -> Dict[str, Any]: + def ipexmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} + if self.config.export: + kwargs["export"] = self.config.export + if self.config.torch_dtype is not None: kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype) diff --git a/optimum_benchmark/backends/ipex/config.py b/optimum_benchmark/backends/ipex/config.py index 5ee4aad1..4fb553da 100644 --- a/optimum_benchmark/backends/ipex/config.py +++ b/optimum_benchmark/backends/ipex/config.py @@ -13,17 +13,17 @@ class IPEXConfig(BackendConfig): version: Optional[str] = ipex_version() _target_: str = "optimum_benchmark.backends.ipex.backend.IPEXBackend" - # load options no_weights: bool = False - torch_dtype: Optional[str] = None - # export options - export: bool = True + # ipexmodel kwargs 
+ export: Optional[bool] = None + torch_dtype: Optional[str] = None def __post_init__(self): super().__post_init__() self.device = self.device.lower() + if self.device not in ["cpu", "gpu"]: raise ValueError(f"IPEXBackend only supports CPU devices, got {self.device}") diff --git a/optimum_benchmark/backends/ipex/utils.py b/optimum_benchmark/backends/ipex/utils.py index dd68428e..7b4a83f8 100644 --- a/optimum_benchmark/backends/ipex/utils.py +++ b/optimum_benchmark/backends/ipex/utils.py @@ -1,4 +1,4 @@ -TASKS_TO_IPEXMODEL = { +TASKS_TO_IPEXMODELS = { "fill-mask": "optimum.intel.IPEXModelForMaskedLM", "text-generation": "optimum.intel.IPEXModelForCausalLM", "feature-extraction": "optimum.intel.IPEXModel", diff --git a/optimum_benchmark/backends/llama_cpp/backend.py b/optimum_benchmark/backends/llama_cpp/backend.py index c9d6bbf8..ef888ddd 100644 --- a/optimum_benchmark/backends/llama_cpp/backend.py +++ b/optimum_benchmark/backends/llama_cpp/backend.py @@ -28,8 +28,7 @@ def load_model_from_pretrained(self) -> None: """ self.pretrained_model = Llama.from_pretrained( - repo_id=self.config.model, - filename=self.config.filename, + self.config.model, **self.llama_cpp_kwargs, ) @@ -37,6 +36,7 @@ def load_model_from_pretrained(self) -> None: def llama_cpp_kwargs(self) -> Dict[str, Any]: return { "embedding": self.config.task == "feature-extraction", + "filename": self.config.filename, "verbose": False, "echo": False, } @@ -44,21 +44,22 @@ def llama_cpp_kwargs(self) -> Dict[str, Any]: def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task == "text-generation": if inputs["input_ids"].shape[0] != 1: - raise ValueError("Batch size must be 1 for LlamaCpp text generation") + raise ValueError("Batch size must be 1 for Text Generation with llama-cpp-python") return {"tokens": inputs["input_ids"].squeeze(0).tolist()} - elif self.config.task == "feature-extraction": return {"input": [self.pretrained_model.detokenize(x).decode("utf-8") for x in inputs["input_ids"]]} - - raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") + else: + raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: self.pretrained_model.embed(**inputs) def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]: - next(self.pretrained_model.generate(**inputs)) + generator = self.pretrained_model.generate(**inputs, reset=True) + for _ in range(kwargs["max_new_tokens"]): + next(generator) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]: - generator = self.pretrained_model.generate(**inputs) + generator = self.pretrained_model.generate(**inputs, reset=True) for _ in range(kwargs["max_new_tokens"]): next(generator) diff --git a/optimum_benchmark/backends/llama_cpp/config.py b/optimum_benchmark/backends/llama_cpp/config.py index d2902860..183a86f8 100644 --- a/optimum_benchmark/backends/llama_cpp/config.py +++ b/optimum_benchmark/backends/llama_cpp/config.py @@ -12,6 +12,8 @@ class LlamaCppConfig(BackendConfig): _target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend" no_weights: bool = False + + # llamamodel kwargs filename: Optional[str] = None def __post_init__(self): diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 2fffcc36..ce7386b3 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py 
@@ -23,13 +23,12 @@ from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend from ..transformers_utils import fast_weights_init from .config import ORTConfig from .utils import ( - TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES, TASKS_TO_ORTMODELS, + TASKS_TO_ORTPIPELINES, format_calibration_config, format_quantization_config, ) @@ -47,28 +46,15 @@ class ORTBackend(Backend[ORTConfig]): def __init__(self, config: ORTConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_ORTMODELS: + if self.config.library != "diffusers" and self.config.task in TASKS_TO_ORTMODELS: self.ort_model_loader = get_class(TASKS_TO_ORTMODELS[self.config.task]) - self.logger.info(f"Using ORT Model class {self.ort_model_loader.__name__}") - elif self.config.task in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES: - if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task]: - self.ort_model_loader = get_class( - TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task][self.config.model_type] - ) - self.logger.info(f"Using ORT Pipeline class {self.ort_model_loader.__name__}") - else: - raise NotImplementedError( - f"ORTBackend does not support model {self.config.model_type} for task {self.config.task}" - ) + self.logger.info(f"Using ORTModel class {self.ort_model_loader.__name__}") + elif self.config.library == "diffusers" and self.config.task in TASKS_TO_ORTPIPELINES: + self.ort_model_loader = get_class(TASKS_TO_ORTPIPELINES[self.config.task]) + self.logger.info(f"Using ORTDiffusionPipeline class {self.ort_model_loader.__name__}") else: raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") - self.session_options = SessionOptions() - if self.config.session_options: - self.logger.info("\t+ Processing session options") - for key, value in self.config.session_options.items(): - setattr(self.session_options, key, value) - def validate_execution_provider(self) -> None: if not self.pretrained_model.providers[0] == self.config.provider: raise ValueError( @@ -117,22 +103,18 @@ def load(self) -> None: def load_ortmodel_from_pretrained(self) -> None: self.pretrained_model = self.ort_model_loader.from_pretrained( self.config.model, - export=self.config.export, - session_options=self.session_options, - provider_options=self.config.provider_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, **self.config.model_kwargs, **self.ortmodel_kwargs, ) def load_ortmodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model + original_export, self.config.export = self.config.export, True + self.logger.info("\t+ Loading no weights ORTModel") self.load_ortmodel_from_pretrained() - - self.config.model = original_model + self.config.export = original_export + self.config.model = original_model @property def is_optimized(self) -> bool: @@ -146,18 +128,36 @@ def is_quantized(self) -> bool: def is_calibrated(self) -> bool: return (self.config.auto_calibration is not None) or self.config.calibration - @property - def is_dp_distributed(self) -> bool: - return is_torch_distributed_available() and torch.distributed.is_initialized() - @property def ortmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} - if self.config.task in TEXT_GENERATION_TASKS: 
+ if self.config.export is not None: + kwargs["export"] = self.config.export + + if self.config.provider is not None: + kwargs["provider"] = self.config.provider + + if self.config.use_cache is not None: kwargs["use_cache"] = self.config.use_cache + + if self.config.use_merged is not None: kwargs["use_merged"] = self.config.use_merged + if self.config.torch_dtype is not None: + kwargs["torch_dtype"] = self.config.torch_dtype + + if self.config.use_io_binding is not None: + kwargs["use_io_binding"] = self.config.use_io_binding + + if self.config.session_options: + kwargs["session_options"] = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(kwargs["session_options"], key, value) + + if self.config.provider_options: + kwargs["provider_options"] = self.config.provider_options + return kwargs @property @@ -223,7 +223,7 @@ def quantize_onnx_files(self) -> None: if self.is_calibrated: self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} + dataset_shapes = {"dataset_size": 2, "sequence_length": 2, "num_choices": 2} calibration_dataset = DatasetGenerator( task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() @@ -275,8 +275,10 @@ def quantize_onnx_files(self) -> None: preprocessor=None, file_suffix="", ) + if self.pretrained_processor is not None: self.pretrained_processor.save_pretrained(self.quantized_model) + if self.pretrained_config is not None: self.pretrained_config.save_pretrained(self.quantized_model) @@ -289,14 +291,14 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: inputs = process_inputs - for key in list(inputs.keys()): - if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: - inputs.pop(key) - for key, value in inputs.items(): if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) + for key in list(inputs.keys()): + if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: + inputs.pop(key) + return inputs @torch.inference_mode() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 07101f78..4c171a3c 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -35,19 +35,15 @@ class ORTConfig(BackendConfig): # load options no_weights: bool = False - # export options - export: bool = True - use_cache: bool = True - use_merged: bool = False - torch_dtype: Optional[str] = None - - # provider options + # ortmodel kwargs + export: Optional[bool] = None provider: Optional[str] = None - provider_options: Dict[str, Any] = field(default_factory=dict) - - # inference options + use_cache: Optional[bool] = None + use_merged: Optional[bool] = None + torch_dtype: Optional[str] = None use_io_binding: Optional[bool] = None session_options: Dict[str, Any] = field(default_factory=dict) + provider_options: Dict[str, Any] = field(default_factory=dict) # null, O1, O2, O3, O4 auto_optimization: Optional[str] = None diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 6177ae8e..e8cbe1eb 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -7,19 +7,10 @@ task: 
f"optimum.onnxruntime.{task_dict['class'][0].__name__}" for task, task_dict in ORT_SUPPORTED_TASKS.items() } -TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES = { - "text-to-image": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", - "latent-consistency": "optimum.onnxruntime.ORTLatentConsistencyModelPipeline", - }, - "image-to-image": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionImg2ImgPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionImg2ImgXLPipeline", - }, - "inpainting": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionInpaintingPipeline", - }, +TASKS_TO_ORTPIPELINES = { + "inpainting": "optimum.onnxruntime.ORTPipelineForInpainting", + "text-to-image": "optimum.onnxruntime.ORTPipelineForText2Image", + "image-to-image": "optimum.onnxruntime.ORTPipelineForImage2Image", } diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index f0aa1925..11be6e13 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -1,21 +1,15 @@ -import inspect from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict import torch from hydra.utils import get_class -from openvino.runtime import properties -from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict -from optimum.intel.openvino import OVQuantizer -from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend from ..transformers_utils import fast_weights_init -from .config import OVConfig -from .utils import TASKS_TO_MODEL_TYPES_TO_OVPIPELINE, TASKS_TO_OVMODEL +from .config import OVConfig as OVBackendConfig +from .utils import TASKS_TO_OVMODELS, TASKS_TO_OVPIPELINES if is_accelerate_available(): from accelerate import Accelerator @@ -24,73 +18,37 @@ import torch.distributed -class OVBackend(Backend[OVConfig]): +class OVBackend(Backend[OVBackendConfig]): NAME: str = "openvino" - def __init__(self, config: OVConfig) -> None: + def __init__(self, config: OVBackendConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_OVMODEL: - self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) + if self.config.library != "diffusers" and self.config.task in TASKS_TO_OVMODELS: + self.ovmodel_class = get_class(TASKS_TO_OVMODELS[self.config.task]) self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") - elif self.config.task in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE: - if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task]: - self.ovmodel_class = get_class( - TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task][self.config.model_type] - ) - self.logger.info(f"\t+ Using OVPipeline class {self.ovmodel_class.__name__}") - else: - raise NotImplementedError( - f"OVBackend does not support model {self.config.model_type} for task {self.config.task}" - ) + elif self.config.library == "diffusers" and self.config.task in TASKS_TO_OVPIPELINES: + self.ovmodel_class = get_class(TASKS_TO_OVPIPELINES[self.config.task]) + self.logger.info(f"\t+ Using OVDiffusionPipeline class {self.ovmodel_class.__name__}") else: raise NotImplementedError(f"OVBackend does not support task {self.config.task}") - if 
self.config.inter_op_num_threads is not None: - self.logger.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") - self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads - def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - if self.config.quantization: - if self.config.no_weights: - self.logger.info("\t+ Creating no weights AutoModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights AutoModel") - self._load_automodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained AutoModel") - self._load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") - self.quantize_automodel() - original_model, self.config.model = self.config.model, self.quantized_model - original_export, self.config.export = self.config.export, False - self.logger.info("\t+ Loading quantized OVModel") - self._load_ovmodel_from_pretrained() - self.config.model, self.config.export = original_model, original_export - elif self.config.no_weights: + if self.config.no_weights: self.logger.info("\t+ Creating no weights OVModel") self.create_no_weights_model() self.logger.info("\t+ Loading no weights OVModel") - self._load_ovmodel_with_no_weights() + self.load_ovmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained OVModel") - self._load_ovmodel_from_pretrained() + self.load_ovmodel_from_pretrained() if self.config.reshape: - static_shapes = { - key: value - for key, value in self.model_shapes.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - if ("sequence_length" in static_shapes) and ("height" in static_shapes) and ("width" in static_shapes): - # for vision models, sequence_length is the number of channels - static_shapes["sequence_length"] = self.model_shapes.get("num_channels") - - self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) + self.logger.info("\t+ Reshaping model with static shapes") + self.pretrained_model.reshape(**self.config.reshape_kwargs) if self.config.half: self.logger.info("\t+ Converting model to half precision") @@ -102,79 +60,43 @@ def load(self) -> None: self.tmpdir.cleanup() - def _load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def _load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self._load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def _load_ovmodel_from_pretrained(self) -> None: + def load_ovmodel_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( self.config.model, - export=self.config.export, - ov_config=self.config.openvino_config, - device=self.config.device, **self.config.model_kwargs, **self.ovmodel_kwargs, ) - def _load_ovmodel_with_no_weights(self) -> None: + def load_ovmodel_with_no_weights(self) -> None: with fast_weights_init(): original_model, self.config.model = self.config.model, self.no_weights_model original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights OVModel") - self._load_ovmodel_from_pretrained() + 
self.load_ovmodel_from_pretrained() self.config.export = original_export self.config.model = original_model - def quantize_automodel(self) -> None: - self.logger.info("\t+ Attempting quantization") - self.quantized_model = f"{self.tmpdir.name}/quantized_model" - self.logger.info("\t+ Processing quantization config") - quantization_config = OVQuantizationConfig(**self.config.quantization_config) - self.logger.info("\t+ Creating quantizer") - quantizer = OVQuantizer.from_pretrained(self.pretrained_model, task=self.config.task, seed=self.config.seed) - - if self.config.calibration: - self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} - calibration_dataset = DatasetGenerator( - task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes - )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) - calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) - else: - calibration_dataset = None - - self.logger.info("\t+ Quantizing model") - quantizer.quantize( - save_directory=self.quantized_model, - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - # TODO: add support for these (maybe) - remove_unused_columns=True, - data_collator=None, - weights_only=False, - file_name=None, - batch_size=1, - ) - @property def ovmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} - if self.config.task in TEXT_GENERATION_TASKS: + if self.config.export is not None: + kwargs["export"] = self.config.export + + if self.config.use_cache is not None: kwargs["use_cache"] = self.config.use_cache + + if self.config.use_merged is not None: kwargs["use_merged"] = self.config.use_merged + if self.config.load_in_8bit is not None: + kwargs["load_in_8bit"] = self.config.load_in_8bit + + if self.config.load_in_4bit is not None: + kwargs["load_in_4bit"] = self.config.load_in_4bit + + if self.config.ov_config: + kwargs["ov_config"] = self.config.ov_config + return kwargs @property @@ -190,14 +112,6 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: inputs.pop(key) - if "input_ids" in inputs: - self.model_shapes.update(dict(zip(["batch_size", "sequence_length"], inputs["input_ids"].shape))) - - if "pixel_values" in inputs: - self.model_shapes.update( - dict(zip(["batch_size", "num_channels", "height", "width"], inputs["pixel_values"].shape)) - ) - return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index 7e6eac25..e6716b86 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ b/optimum_benchmark/backends/openvino/config.py @@ -11,28 +11,21 @@ class OVConfig(BackendConfig): version: Optional[str] = openvino_version() _target_: str = "optimum_benchmark.backends.openvino.backend.OVBackend" - # load options no_weights: bool = False - # export options - export: bool = True - use_cache: bool = True - use_merged: bool = False - - # openvino config - openvino_config: Dict[str, Any] = field(default_factory=dict) + # ovmodel kwargs + export: Optional[bool] = None + use_cache: Optional[bool] = None + use_merged: Optional[bool] = None + load_in_8bit: Optional[bool] = None + load_in_4bit: Optional[bool] = None + ov_config: Dict[str, Any] = 
field(default_factory=dict) # compilation options half: bool = False + compile: bool = False reshape: bool = False - - # quantization options - quantization: bool = False - quantization_config: Dict[str, Any] = field(default_factory=dict) - - # calibration options - calibration: bool = False - calibration_config: Dict[str, Any] = field(default_factory=dict) + reshape_kwargs: Dict[str, int] = field(default_factory=dict) def __post_init__(self): super().__post_init__() @@ -42,7 +35,7 @@ def __post_init__(self): raise ValueError(f"OVBackend only supports CPU devices, got {self.device}") if self.intra_op_num_threads is not None: - raise NotImplementedError("OVBackend does not support intra_op_num_threads") + raise NotImplementedError("OVBackend does not support intra_op_num_threads. Please use the ov_config") - if self.quantization and not self.calibration: - raise ValueError("OpenVINO quantization requires enabling calibration.") + if self.inter_op_num_threads is not None: + raise NotImplementedError("OVBackend does not support inter_op_num_threads. Please use the ov_config") diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index 35518346..e382d724 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,4 +1,4 @@ -TASKS_TO_OVMODEL = { +TASKS_TO_OVMODELS = { "fill-mask": "optimum.intel.openvino.OVModelForMaskedLM", "text-generation": "optimum.intel.openvino.OVModelForCausalLM", "text2text-generation": "optimum.intel.openvino.OVModelForSeq2SeqLM", @@ -10,10 +10,8 @@ "audio-classification": "optimum.intel.openvino.OVModelForAudioClassification", "pix2struct": "optimum.intel.openvino.OVModelForPix2Struct", } -TASKS_TO_MODEL_TYPES_TO_OVPIPELINE = { - "text-to-image": { - "lcm": "optimum.intel.openvino.OVLatentConsistencyModelPipeline", - "stable-diffusion": "optimum.intel.openvino.OVStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.intel.openvino.OVStableDiffusionXLPipeline", - }, +TASKS_TO_OVPIPELINES = { + "inpainting": "optimum.intel.openvino.OVPipelineForInpainting", + "text-to-image": "optimum.intel.openvino.OVPipelineForText2Image", + "image-to-image": "optimum.intel.openvino.OVPipelineForImage2Image", } diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 73b75b75..dae410c4 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -51,8 +51,8 @@ class PyTXIConfig(BackendConfig): num_shard: Optional[int] = None speculate: Optional[int] = None cuda_graphs: Optional[int] = None - disable_custom_kernels: Optional[bool] = None trust_remote_code: Optional[bool] = None + disable_custom_kernels: Optional[bool] = None # TEI specific pooling: Optional[str] = None diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index a05187c3..60b82675 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,12 +1,18 @@ +import os from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict +import torch +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from hydra.utils import get_class +from safetensors.torch import save_file +from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend +from ..transformers_utils import fast_weights_init from .config import TRTLLMConfig -from .utils 
import MODEL_TYPE_TO_TRTLLMMODEL +from .utils import MODEL_TYPE_TO_TRTLLMMODELS class TRTLLMBackend(Backend[TRTLLMConfig]): @@ -15,8 +21,8 @@ class TRTLLMBackend(Backend[TRTLLMConfig]): def __init__(self, config: TRTLLMConfig): super().__init__(config) - if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODEL: - self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.config.model_type]) + if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODELS: + self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODELS[self.config.model_type]) self.logger.info(f"\t+ Using TRTLLMModel class {self.trtllm_loader.__name__}") else: raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.config.model_type}") @@ -25,62 +31,134 @@ def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - self.logger.info("\t+ Loading pretrained TRTLLMModel") - self.load_trtmodel_from_pretrained() + if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() + self.logger.info("\t+ Loading no weights model") + self.load_trtllm_with_no_weights() + else: + self.logger.info("\t+ Downloading pretrained model") + self.download_pretrained_model() + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.prepare_generation_config() + self.logger.info("\t+ Loading pretrained model") + self.load_trtllm_from_pretrained() self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def load_trtmodel_from_pretrained(self) -> None: + def download_pretrained_model(self) -> None: + with torch.device("meta"): + self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) + + def prepare_generation_config(self) -> None: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + + model_cache_folder = f"models/{self.config.model}".replace("/", "--") + model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}" + snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" + snapshot_ref = open(snapshot_file, "r").read().strip() + model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" + self.generation_config.save_pretrained(save_directory=model_snapshot_path) + + def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model safetensors") + safetensor = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) + self.logger.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.logger.info("\t+ Saving no weights model pretrained processor") + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + # unlike Transformers, TRT-LLM won't accept any missing tensors so we need to materialize the model + self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( + self.no_weights_model, 
**self.config.model_kwargs, device_map="auto", _fast_init=False + ) + self.logger.info("\t+ Saving no weights model") + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + del self.pretrained_model + torch.cuda.empty_cache() + + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Modifying generation config for fixed length generation") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.logger.info("\t+ Saving new pretrained generation config") + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + + def load_trtllm_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + self.load_trtllm_from_pretrained() + self.config.model = original_model + + def load_trtllm_from_pretrained(self) -> None: self.pretrained_model = self.trtllm_loader.from_pretrained( self.config.model, - tp=self.config.tp, - pp=self.config.pp, - dtype=self.config.dtype, - use_fp8=self.config.use_fp8, - world_size=self.config.world_size, - gpus_per_node=self.config.gpus_per_node, - use_cuda_graph=self.config.use_cuda_graph, - optimization_level=self.config.optimization_level, - max_prompt_length=self.config.max_prompt_length, - max_batch_size=self.config.max_batch_size, - max_new_tokens=self.config.max_new_tokens, - max_beam_width=self.config.max_beam_width, **self.config.model_kwargs, + **self.trtllm_kwargs, ) + @property + def trtllm_kwargs(self): + kwargs = {} + + if self.config.tp is not None: + kwargs["tp"] = self.config.tp + + if self.config.pp is not None: + kwargs["pp"] = self.config.pp + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.use_fp8 is not None: + kwargs["use_fp8"] = self.config.use_fp8 + + if self.config.world_size is not None: + kwargs["world_size"] = self.config.world_size + + if self.config.gpus_per_node is not None: + kwargs["gpus_per_node"] = self.config.gpus_per_node + + if self.config.max_batch_size is not None: + kwargs["max_batch_size"] = self.config.max_batch_size + + if self.config.max_new_tokens is not None: + kwargs["max_new_tokens"] = self.config.max_new_tokens + + if self.config.max_prompt_length is not None: + kwargs["max_prompt_length"] = self.config.max_prompt_length + + if self.config.optimization_level is not None: + kwargs["optimization_level"] = self.config.optimization_level + + if self.config.use_cuda_graph is not None: + kwargs["use_cuda_graph"] = self.config.use_cuda_graph + + return kwargs + def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate( input_ids=inputs.get("input_ids"), attention_mask=inputs.get("attention_mask"), - min_length=kwargs.get("min_new_tokens", -1), - max_new_tokens=kwargs.get("max_new_tokens", -1), - repetition_penalty=kwargs.get("repetition_penalty", 1.0), - length_penalty=kwargs.get("length_penalty", 1.0), pad_token_id=kwargs.get("pad_token_id", 0), - bos_token_id=kwargs.get("bos_token_id", 1), - eos_token_id=kwargs.get("eos_token_id", 2), - temperature=kwargs.get("temperature", 1.0), - num_beams=kwargs.get("num_beams", 1), - top_p=kwargs.get("top_p", 1.0), - top_k=kwargs.get("top_k", 50), - seed=kwargs.get("seed", 42), + eos_token_id=kwargs.get("eos_token_id", 1), + **kwargs, ) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate( input_ids=inputs.get("input_ids"), attention_mask=inputs.get("attention_mask"), 
- min_length=kwargs.get("min_new_tokens", -1), - max_new_tokens=kwargs.get("max_new_tokens", -1), - repetition_penalty=kwargs.get("repetition_penalty", 1.0), - length_penalty=kwargs.get("length_penalty", 1.0), pad_token_id=kwargs.get("pad_token_id", 0), - bos_token_id=kwargs.get("bos_token_id", 1), - eos_token_id=kwargs.get("eos_token_id", 2), - temperature=kwargs.get("temperature", 1.0), - num_beams=kwargs.get("num_beams", 1), - top_p=kwargs.get("top_p", 1.0), - top_k=kwargs.get("top_k", 50), - seed=kwargs.get("seed", 42), + eos_token_id=kwargs.get("eos_token_id", 1), + **kwargs, ) diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py index d7f4b1cb..2497d5d4 100644 --- a/optimum_benchmark/backends/tensorrt_llm/config.py +++ b/optimum_benchmark/backends/tensorrt_llm/config.py @@ -4,7 +4,7 @@ from ...import_utils import tesnorrt_llm_version from ..config import BackendConfig -SUPPORTED_DTYPES = ["float16", "bfloat16", "float32"] +SUPPORTED_DTYPES = [None, "float16", "bfloat16", "float32"] @dataclass @@ -13,21 +13,20 @@ class TRTLLMConfig(BackendConfig): version: Optional[str] = tesnorrt_llm_version() _target_: str = "optimum_benchmark.backends.tensorrt_llm.backend.TRTLLMBackend" - # build config - tp: int = 1 - pp: int = 1 - use_fp8: bool = False - dtype: str = "float16" - optimization_level: int = 2 - use_cuda_graph: bool = False - - world_size: int = 1 - gpus_per_node: int = 1 - - max_prompt_length: int = 128 - max_new_tokens: int = -1 - max_batch_size: int = 1 - max_beam_width: int = 1 + no_weights: bool = False + + # trtllm kwargs + tp: Optional[int] = None + pp: Optional[int] = None + dtype: Optional[str] = None + use_fp8: Optional[bool] = None + world_size: Optional[int] = None + gpus_per_node: Optional[int] = None + max_batch_size: Optional[int] = None + max_new_tokens: Optional[int] = None + max_prompt_length: Optional[int] = None + optimization_level: Optional[int] = None + use_cuda_graph: Optional[bool] = None def __post_init__(self) -> None: super().__post_init__() @@ -38,8 +37,13 @@ def __post_init__(self) -> None: if self.dtype not in SUPPORTED_DTYPES: raise ValueError(f"dtype must be one of float16, bfloat16, float32, got {self.dtype}") - if self.gpus_per_node != self.world_size: + if self.gpus_per_node is not None and self.world_size is not None and self.gpus_per_node != self.world_size: raise ValueError(f"gpus_per_node ({self.gpus_per_node}) != world_size ({self.world_size})") - if self.world_size != self.pp * self.tp: + if ( + self.world_size is not None + and self.pp is not None + and self.tp is not None + and self.world_size != self.pp * self.tp + ): raise ValueError(f"world_size ({self.gpus_per_node}) != pp ({self.pp}) * tp ({self.tp})") diff --git a/optimum_benchmark/backends/tensorrt_llm/utils.py b/optimum_benchmark/backends/tensorrt_llm/utils.py index 4574da53..01b6ed0e 100644 --- a/optimum_benchmark/backends/tensorrt_llm/utils.py +++ b/optimum_benchmark/backends/tensorrt_llm/utils.py @@ -1 +1 @@ -MODEL_TYPE_TO_TRTLLMMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"} +MODEL_TYPE_TO_TRTLLMMODELS = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"} diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index 61401a75..7b7c1c04 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -39,19 +39,17 @@ def load(self) -> None: self.tmpdir.cleanup() def 
load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model self.load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model + self.pretrained_model.tie_weights() + self.config.model = original_model def load_automodel_from_pretrained(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( - self.config.model, **self.automodel_kwargs, **self.config.model_kwargs + self.config.model, + **self.config.model_kwargs, + **self.automodel_kwargs, ).to(self.config.device) @property diff --git a/optimum_benchmark/backends/torch_ort/config.py b/optimum_benchmark/backends/torch_ort/config.py index adc37288..17d2895d 100644 --- a/optimum_benchmark/backends/torch_ort/config.py +++ b/optimum_benchmark/backends/torch_ort/config.py @@ -14,8 +14,9 @@ class TorchORTConfig(BackendConfig): # load options no_weights: bool = False torch_dtype: Optional[str] = None - # sdpa, which has became default of many architectures, fails with torch ort - attn_implementation: Optional[str] = "eager" + attn_implementation: Optional[str] = ( + "eager" # we pin eager because sdpa became default of many architectures, which fails with torch-ort + ) # peft options peft_type: Optional[str] = None diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index eadd6c0a..7405d4dc 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -6,7 +6,10 @@ import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from safetensors.torch import save_file -from vllm import AsyncEngineArgs, AsyncLLMEngine, EngineArgs, LLMEngine, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend @@ -32,7 +35,7 @@ def load(self) -> None: self.logger.info("\t+ Creating no weights model") self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") - self.load_model_with_no_weights() + self.load_vllm_with_no_weights() else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() @@ -40,7 +43,7 @@ def load(self) -> None: self.logger.info("\t+ Preparing generation config") self.prepare_generation_config() self.logger.info("\t+ Loading pretrained model") - self.load_model_from_pretrained() + self.load_vllm_from_pretrained() self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() @@ -52,13 +55,11 @@ def download_pretrained_model(self) -> None: def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}" snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" snapshot_ref = open(snapshot_file, "r").read().strip() model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") 
self.generation_config.save_pretrained(save_directory=model_snapshot_path) def create_no_weights_model(self) -> None: @@ -92,17 +93,25 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: + def load_vllm_with_no_weights(self) -> None: original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights model") - self.load_model_from_pretrained() + self.load_vllm_from_pretrained() self.config.model = original_model - def load_model_from_pretrained(self) -> None: + def load_vllm_from_pretrained(self) -> None: if self.config.serving_mode == "offline": - self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.config.to_engine_args())) + self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.vllm_kwargs)) else: - self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config.to_engine_args())) + self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.vllm_kwargs)) + + @property + def vllm_kwargs(self): + return { + "model": self.config.model, + "tokenizer": self.config.processor, + "device": self.config.device, + **self.config.engine_args, + } def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/optimum_benchmark/backends/vllm/config.py b/optimum_benchmark/backends/vllm/config.py index 47ae475b..00157220 100644 --- a/optimum_benchmark/backends/vllm/config.py +++ b/optimum_benchmark/backends/vllm/config.py @@ -54,11 +54,3 @@ def __post_init__(self): if self.serving_mode == "online": if self.engine_args.get("disable_log_requests", None) is None: self.engine_args["disable_log_requests"] = True - - def to_engine_args(self) -> Dict[str, Any]: - return dict( - model=self.model, - tokenizer=self.processor, - device=self.device, - **self.engine_args, - ) diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 45461714..d18761b3 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -118,8 +118,6 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: device_ids=self.backend.config.device_ids, ) - self.run_model_loading_tracking() - self.logger.info(f"\t+ Generating inputs for task {self.backend.config.task}") self.inputs = InputGenerator( task=self.backend.config.task, @@ -127,6 +125,9 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: model_type=self.backend.config.model_type, input_shapes=self.config.input_shapes, )() + + self.run_model_loading_tracking() + self.logger.info(f"\t+ Preparing inputs for backend {self.backend.config.name}") self.inputs = self.backend.prepare_inputs(inputs=self.inputs) diff --git a/setup.py b/setup.py index 46a1ed60..c7f81246 100644 --- a/setup.py +++ b/setup.py @@ -65,10 +65,10 @@ "testing": ["pytest", "hydra-joblib-launcher"], # optimum backends "ipex": [f"optimum[ipex]>={MIN_OPTIMUM_VERSION}"], + "tensorrt-llm": [f"optimum[nvidia]>={MIN_OPTIMUM_VERSION}"], "openvino": [f"optimum[openvino,nncf]>={MIN_OPTIMUM_VERSION}"], "onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"], "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"], - "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], 
"torch-ort": ["torch-ort", "onnxruntime-training", f"optimum>={MIN_OPTIMUM_VERSION}"], # other backends "llama-cpp": ["llama-cpp-python"], diff --git a/tests/configs/_diffusers_.yaml b/tests/configs/_diffusers_.yaml index 607b2502..25bcc6be 100644 --- a/tests/configs/_diffusers_.yaml +++ b/tests/configs/_diffusers_.yaml @@ -1,4 +1,7 @@ -backend: - library: diffusers - task: text-to-image - model: hf-internal-testing/tiny-stable-diffusion-torch +hydra: + mode: MULTIRUN + sweeper: + params: + backend.library: diffusers + backend.task: text-to-image + backend.model: hf-internal-testing/tiny-stable-diffusion-torch diff --git a/tests/configs/_export_.yaml b/tests/configs/_export_.yaml new file mode 100644 index 00000000..50f1bb0f --- /dev/null +++ b/tests/configs/_export_.yaml @@ -0,0 +1,2 @@ +backend: + export: true diff --git a/tests/configs/_inc_quant_.yaml b/tests/configs/_inc_quant_.yaml deleted file mode 100644 index 1347abfc..00000000 --- a/tests/configs/_inc_quant_.yaml +++ /dev/null @@ -1,3 +0,0 @@ -backend: - ptq_quantization: true - calibration: true diff --git a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml b/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml deleted file mode 100644 index 7865da6e..00000000 --- a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - # order of inheritance, last one overrides previous ones - - _base_ # inherits from base config - - _cpu_ # inherits from cpu config - - _inference_ # inherits from inference config - - _inc_quant_ # inherits from incremental quantization config - - _text_decoders_ # inherits from text decoders config - - _no_weights_ # inherits from no weights config - - _self_ # hydra 1.1 compatibility - - override backend: neural-compressor - -name: cpu_inference_neural_compressor_text_decoders diff --git a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml b/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml deleted file mode 100644 index 91451cf1..00000000 --- a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - # order of inheritance, last one overrides previous ones - - _base_ # inherits from base config - - _cpu_ # inherits from cpu config - - _inference_ # inherits from inference config - - _inc_quant_ # inherits from incremental quantization config - - _text_encoders_ # inherits from text encoders config - - _no_weights_ # inherits from no weights config - - _self_ # hydra 1.1 compatibility - - override backend: neural-compressor - -name: cpu_inference_neural_compressor_text_encoders diff --git a/tests/configs/cpu_inference_onnxruntime_diffusers.yaml b/tests/configs/cpu_inference_onnxruntime_diffusers.yaml index 5b44c0f2..852f6a4f 100644 --- a/tests/configs/cpu_inference_onnxruntime_diffusers.yaml +++ b/tests/configs/cpu_inference_onnxruntime_diffusers.yaml @@ -4,6 +4,7 @@ defaults: - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config - _diffusers_ # inherits from diffusers config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml b/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml index 0f0e095c..628a8c40 100644 --- a/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml +++ b/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml @@ 
-5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _ort_quant_ # inherits from ort static quant config - _no_weights_ # inherits from no weights sweep config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml index d87c4552..48a61019 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_decoders_ # inherits from text decoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml index 5e9bdb9f..a846360b 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_encoders_ # inherits from text encoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml index a9b725d4..5c8eeb50 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_encoders_decoders_ # inherits from text encoders decoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_timm.yaml b/tests/configs/cpu_inference_onnxruntime_timm.yaml index 9859c540..5e487cf9 100644 --- a/tests/configs/cpu_inference_onnxruntime_timm.yaml +++ b/tests/configs/cpu_inference_onnxruntime_timm.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _export_ # inherits from export config - _timm_ # inherits from timm config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml b/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml index d43725e8..b751e282 100644 --- a/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml +++ b/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml @@ -6,6 +6,7 @@ defaults: - _text_decoders_ # inherits from text decoders sweep config - _device_isolation_ # inherits from device isolation config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml b/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml index 2ac7133b..ad36c13a 100644 --- a/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml +++ b/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml @@ -6,6 
+6,7 @@ defaults: - _text_encoders_ # inherits from text encoders sweep config - _device_isolation_ # inherits from device isolation config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime
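
The pattern applied across backends in this patch is the same everywhere: backend config fields become Optional and default to None, and each backend builds its loader kwargs from only the fields that were explicitly set, so unset options fall back to the underlying library's defaults. Below is a minimal, self-contained sketch of that pattern; ExampleBackendConfig, ExampleBackend, and loader_kwargs are hypothetical names for illustration, not the actual optimum-benchmark classes.

from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class ExampleBackendConfig:
    # None means "not passed": the underlying loader keeps its own default.
    export: Optional[bool] = None
    use_cache: Optional[bool] = None
    torch_dtype: Optional[str] = None
    provider_options: Dict[str, Any] = field(default_factory=dict)


class ExampleBackend:
    def __init__(self, config: ExampleBackendConfig) -> None:
        self.config = config

    @property
    def loader_kwargs(self) -> Dict[str, Any]:
        # Forward only the options that were explicitly set, mirroring the
        # ortmodel_kwargs / ovmodel_kwargs / trtllm_kwargs properties added
        # in the diff above (hypothetical re-implementation, not the real code).
        kwargs: Dict[str, Any] = {}
        if self.config.export is not None:
            kwargs["export"] = self.config.export
        if self.config.use_cache is not None:
            kwargs["use_cache"] = self.config.use_cache
        if self.config.torch_dtype is not None:
            kwargs["torch_dtype"] = self.config.torch_dtype
        if self.config.provider_options:
            kwargs["provider_options"] = self.config.provider_options
        return kwargs


if __name__ == "__main__":
    print(ExampleBackend(ExampleBackendConfig()).loader_kwargs)             # {}
    print(ExampleBackend(ExampleBackendConfig(export=True)).loader_kwargs)  # {'export': True}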