From 4c23a0dd5935e7600e61c861bcc6679e75f1084b Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 11 Dec 2024 18:32:53 +0100 Subject: [PATCH] Optional backend kwargs (#307) --- .github/workflows/test_api_rocm.yaml | 1 + .../workflows/test_cli_cuda_tensorrt_llm.yaml | 6 +- .github/workflows/test_cli_rocm_pytorch.yaml | 2 + examples/cpu_ipex_bert.yaml | 4 +- examples/cpu_ipex_llama.yaml | 4 +- examples/cpu_onnxruntime_timm.yaml | 20 --- examples/cpu_openvino_8bit_bert.yaml | 5 +- examples/cpu_openvino_diffusion.yaml | 5 + examples/cuda_tgi_llama.yaml | 1 + examples/cuda_trt_llama.yaml | 1 + examples/cuda_vllm_llama.yaml | 3 +- examples/mps_pytorch_bert.yaml | 12 +- optimum_benchmark/backends/base.py | 7 - optimum_benchmark/backends/ipex/backend.py | 39 ++--- optimum_benchmark/backends/ipex/config.py | 8 +- optimum_benchmark/backends/ipex/utils.py | 2 +- .../backends/llama_cpp/backend.py | 17 +- .../backends/llama_cpp/config.py | 2 + .../backends/onnxruntime/backend.py | 80 ++++----- .../backends/onnxruntime/config.py | 16 +- .../backends/onnxruntime/utils.py | 17 +- .../backends/openvino/backend.py | 150 ++++------------- optimum_benchmark/backends/openvino/config.py | 31 ++-- optimum_benchmark/backends/openvino/utils.py | 12 +- optimum_benchmark/backends/py_txi/config.py | 2 +- .../backends/tensorrt_llm/backend.py | 158 +++++++++++++----- .../backends/tensorrt_llm/config.py | 40 +++-- .../backends/tensorrt_llm/utils.py | 2 +- .../backends/torch_ort/backend.py | 14 +- .../backends/torch_ort/config.py | 5 +- optimum_benchmark/backends/vllm/backend.py | 31 ++-- optimum_benchmark/backends/vllm/config.py | 8 - .../scenarios/inference/scenario.py | 5 +- setup.py | 2 +- tests/configs/_diffusers_.yaml | 11 +- tests/configs/_export_.yaml | 2 + tests/configs/_inc_quant_.yaml | 3 - ...al_compressor_inc_quant_text_decoders.yaml | 12 -- ...al_compressor_inc_quant_text_encoders.yaml | 12 -- .../cpu_inference_onnxruntime_diffusers.yaml | 1 + .../cpu_inference_onnxruntime_ort_quant.yaml | 1 + ...u_inference_onnxruntime_text_decoders.yaml | 1 + ...u_inference_onnxruntime_text_encoders.yaml | 1 + ...ce_onnxruntime_text_encoders_decoders.yaml | 1 + .../cpu_inference_onnxruntime_timm.yaml | 1 + ...a_inference_onnxruntime_text_decoders.yaml | 1 + ...a_inference_onnxruntime_text_encoders.yaml | 1 + 47 files changed, 350 insertions(+), 410 deletions(-) delete mode 100644 examples/cpu_onnxruntime_timm.yaml create mode 100644 tests/configs/_export_.yaml delete mode 100644 tests/configs/_inc_quant_.yaml delete mode 100644 tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml delete mode 100644 tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml diff --git a/.github/workflows/test_api_rocm.yaml b/.github/workflows/test_api_rocm.yaml index f6f20aa4..5c795cbd 100644 --- a/.github/workflows/test_api_rocm.yaml +++ b/.github/workflows/test_api_rocm.yaml @@ -33,6 +33,7 @@ jobs: with: machine_type: single-gpu install_extras: testing,timm,diffusers,codecarbon + test_file: test_api.py pytest_keywords: api and cuda secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml index c75aac92..5a199d5a 100644 --- a/.github/workflows/test_cli_cuda_tensorrt_llm.yaml +++ b/.github/workflows/test_cli_cuda_tensorrt_llm.yaml @@ -44,7 +44,7 @@ jobs: - name: Install dependencies run: | - pip install -e .[testing] + pip install -e 
.[testing,tensorrt-llm] - name: Run tests run: | @@ -57,7 +57,7 @@ }} name: Run examples run: | - huggingface-cli delete-cache + rm -rf /root/.cache/huggingface pytest tests/test_examples.py -x -s -k "cli and cuda and trt" cli_cuda_tensorrt_llm_multi_gpu_tests: @@ -84,7 +84,7 @@ - name: Install dependencies run: | - pip install -e .[testing] + pip install -e .[testing,tensorrt-llm] - name: Run tests (sequential) run: | diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index 3057b726..47008b46 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -35,6 +35,7 @@ jobs: with: machine_type: single-gpu install_extras: testing,diffusers,timm,peft,autoawq,auto-gptq + test_file: test_cli.py pytest_keywords: cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb run_cli_rocm_pytorch_multi_gpu_tests: @@ -52,4 +53,5 @@ jobs: with: machine_type: multi-gpu install_extras: testing,diffusers,timm,peft + test_file: test_cli.py pytest_keywords: cli and cuda and pytorch and (dp or ddp or device_map) diff --git a/examples/cpu_ipex_bert.yaml b/examples/cpu_ipex_bert.yaml index 0e7ed37b..1974e9c3 100644 --- a/examples/cpu_ipex_bert.yaml +++ b/examples/cpu_ipex_bert.yaml @@ -17,8 +17,8 @@ launcher: backend: device: cpu export: true - no_weights: false # because on multi-node machines, intializing weights could harm performance - torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance + torch_dtype: float32 # use bfloat16 on compatible Intel CPUs model: google-bert/bert-base-uncased scenario: diff --git a/examples/cpu_ipex_llama.yaml b/examples/cpu_ipex_llama.yaml index 898ed0df..50e23c55 100644 --- a/examples/cpu_ipex_llama.yaml +++ b/examples/cpu_ipex_llama.yaml @@ -17,8 +17,8 @@ launcher: backend: device: cpu export: true - no_weights: false # because on multi-node machines, intializing weights could harm performance - torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs + no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance + torch_dtype: float32 # use bfloat16 on compatible Intel CPUs model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: diff --git a/examples/cpu_onnxruntime_timm.yaml b/examples/cpu_onnxruntime_timm.yaml deleted file mode 100644 index 963f44f0..00000000 --- a/examples/cpu_onnxruntime_timm.yaml +++ /dev/null @@ -1,20 +0,0 @@ -defaults: - - benchmark - - backend: onnxruntime - - scenario: inference - - launcher: process - - _base_ - - _self_ - -name: onnxruntime_timm - -backend: - device: cpu - export: true - model: timm/tiny_vit_21m_224.in1k - -scenario: - memory: true - latency: true - input_shapes: - batch_size: 2 diff --git a/examples/cpu_openvino_8bit_bert.yaml b/examples/cpu_openvino_8bit_bert.yaml index 73ef474d..a3c33327 100644 --- a/examples/cpu_openvino_8bit_bert.yaml +++ b/examples/cpu_openvino_8bit_bert.yaml @@ -12,8 +12,11 @@ backend: device: cpu reshape: true no_weights: true - load_in_8bit: false # enable 8bit on compatible Intel CPU machines + load_in_8bit: true model: google-bert/bert-base-uncased + reshape_kwargs: + batch_size: 1 + sequence_length: 128 scenario: memory: true diff --git a/examples/cpu_openvino_diffusion.yaml b/examples/cpu_openvino_diffusion.yaml index 30d21935..37b4ee71 100644 --- a/examples/cpu_openvino_diffusion.yaml +++ 
b/examples/cpu_openvino_diffusion.yaml @@ -11,9 +11,14 @@ name: openvino_diffusion backend: device: cpu export: true + task: text-to-image model: stabilityai/stable-diffusion-2-1 half: false # enable half-precision on compatible Intel CPU machines scenario: input_shapes: batch_size: 1 + sequence_length: 16 + + call_kwargs: + num_inference_steps: 4 diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml index 297403c8..ac5bcdc3 100644 --- a/examples/cuda_tgi_llama.yaml +++ b/examples/cuda_tgi_llama.yaml @@ -16,6 +16,7 @@ backend: device: cuda device_ids: 0 cuda_graphs: 0 # remove for better perf but bigger memory footprint + no_weights: false # investigate later model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 scenario: diff --git a/examples/cuda_trt_llama.yaml b/examples/cuda_trt_llama.yaml index c483fc2f..501280e8 100644 --- a/examples/cuda_trt_llama.yaml +++ b/examples/cuda_trt_llama.yaml @@ -15,6 +15,7 @@ launcher: backend: device: cuda device_ids: 0 + no_weights: true max_batch_size: 4 max_new_tokens: 32 max_prompt_length: 64 diff --git a/examples/cuda_vllm_llama.yaml b/examples/cuda_vllm_llama.yaml index 5ec4b5a8..4a624cc1 100644 --- a/examples/cuda_vllm_llama.yaml +++ b/examples/cuda_vllm_llama.yaml @@ -15,7 +15,8 @@ launcher: backend: device: cuda device_ids: 0 - serving_mode: online # server-like + no_weights: true + serving_mode: online model: TinyLlama/TinyLlama-1.1B-Chat-v1.0 engine_args: enforce_eager: true # remove for better perf but bigger memory footprint diff --git a/examples/mps_pytorch_bert.yaml b/examples/mps_pytorch_bert.yaml index 27368eb1..f805abed 100644 --- a/examples/mps_pytorch_bert.yaml +++ b/examples/mps_pytorch_bert.yaml @@ -8,14 +8,14 @@ defaults: name: mps_pytorch_bert +backend: + device: mps + no_weights: true + model: bert-base-uncased + scenario: - latency: true memory: true + latency: true input_shapes: batch_size: 1 sequence_length: 128 - -backend: - device: mps - no_weights: true - model: bert-base-uncased diff --git a/optimum_benchmark/backends/base.py b/optimum_benchmark/backends/base.py index 78d0bef7..f79ea1a2 100644 --- a/optimum_benchmark/backends/base.py +++ b/optimum_benchmark/backends/base.py @@ -106,13 +106,6 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving no weights model's config") self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) - def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]: - """ - This method is used to prepare and register the input shapes before using them by the model. - It can be used to pad the inputs to the correct shape, or compile it to the correct format. - """ - return input_shapes - def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: """ This method is used to prepare and register the inputs before passing them to the model. 
diff --git a/optimum_benchmark/backends/ipex/backend.py b/optimum_benchmark/backends/ipex/backend.py index 7e4983a9..a637ced0 100644 --- a/optimum_benchmark/backends/ipex/backend.py +++ b/optimum_benchmark/backends/ipex/backend.py @@ -9,7 +9,7 @@ from ..base import Backend from ..transformers_utils import fast_weights_init from .config import IPEXConfig -from .utils import TASKS_TO_IPEXMODEL +from .utils import TASKS_TO_IPEXMODELS if is_accelerate_available(): from accelerate import Accelerator @@ -24,8 +24,8 @@ class IPEXBackend(Backend[IPEXConfig]): def __init__(self, config: IPEXConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_IPEXMODEL: - self.ipexmodel_class = get_class(TASKS_TO_IPEXMODEL[self.config.task]) + if self.config.task in TASKS_TO_IPEXMODELS: + self.ipexmodel_class = get_class(TASKS_TO_IPEXMODELS[self.config.task]) self.logger.info(f"\t+ Using IPEXModel class {self.ipexmodel_class.__name__}") else: raise NotImplementedError(f"IPEXBackend does not support task {self.config.task}") @@ -38,48 +38,35 @@ def load(self) -> None: self.logger.info("\t+ Creating no weights IPEXModel") self.create_no_weights_model() self.logger.info("\t+ Loading no weights IPEXModel") - self._load_ipexmodel_with_no_weights() + self.load_ipexmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained IPEXModel") - self._load_ipexmodel_from_pretrained() + self.load_ipexmodel_from_pretrained() self.tmpdir.cleanup() - def _load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def _load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self._load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def _load_ipexmodel_from_pretrained(self) -> None: + def load_ipexmodel_from_pretrained(self) -> None: self.pretrained_model = self.ipexmodel_class.from_pretrained( self.config.model, - export=self.config.export, **self.config.model_kwargs, - **self.automodel_kwargs, + **self.ipexmodel_kwargs, ) - def _load_ipexmodel_with_no_weights(self) -> None: + def load_ipexmodel_with_no_weights(self) -> None: with fast_weights_init(): original_model, self.config.model = self.config.model, self.no_weights_model original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights IPEXModel") - self._load_ipexmodel_from_pretrained() + self.load_ipexmodel_from_pretrained() self.config.export = original_export self.config.model = original_model @property - def automodel_kwargs(self) -> Dict[str, Any]: + def ipexmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} + if self.config.export: + kwargs["export"] = self.config.export + if self.config.torch_dtype is not None: kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype) diff --git a/optimum_benchmark/backends/ipex/config.py b/optimum_benchmark/backends/ipex/config.py index 5ee4aad1..4fb553da 100644 --- a/optimum_benchmark/backends/ipex/config.py +++ b/optimum_benchmark/backends/ipex/config.py @@ -13,17 +13,17 @@ class IPEXConfig(BackendConfig): version: Optional[str] = ipex_version() _target_: str = "optimum_benchmark.backends.ipex.backend.IPEXBackend" - # load options no_weights: bool = False - torch_dtype: Optional[str] = None - # export options - export: bool = True + # ipexmodel kwargs 
+ export: Optional[bool] = None + torch_dtype: Optional[str] = None def __post_init__(self): super().__post_init__() self.device = self.device.lower() + if self.device not in ["cpu", "gpu"]: raise ValueError(f"IPEXBackend only supports CPU devices, got {self.device}") diff --git a/optimum_benchmark/backends/ipex/utils.py b/optimum_benchmark/backends/ipex/utils.py index dd68428e..7b4a83f8 100644 --- a/optimum_benchmark/backends/ipex/utils.py +++ b/optimum_benchmark/backends/ipex/utils.py @@ -1,4 +1,4 @@ -TASKS_TO_IPEXMODEL = { +TASKS_TO_IPEXMODELS = { "fill-mask": "optimum.intel.IPEXModelForMaskedLM", "text-generation": "optimum.intel.IPEXModelForCausalLM", "feature-extraction": "optimum.intel.IPEXModel", diff --git a/optimum_benchmark/backends/llama_cpp/backend.py b/optimum_benchmark/backends/llama_cpp/backend.py index c9d6bbf8..ef888ddd 100644 --- a/optimum_benchmark/backends/llama_cpp/backend.py +++ b/optimum_benchmark/backends/llama_cpp/backend.py @@ -28,8 +28,7 @@ def load_model_from_pretrained(self) -> None: """ self.pretrained_model = Llama.from_pretrained( - repo_id=self.config.model, - filename=self.config.filename, + self.config.model, **self.llama_cpp_kwargs, ) @@ -37,6 +36,7 @@ def load_model_from_pretrained(self) -> None: def llama_cpp_kwargs(self) -> Dict[str, Any]: return { "embedding": self.config.task == "feature-extraction", + "filename": self.config.filename, "verbose": False, "echo": False, } @@ -44,21 +44,22 @@ def llama_cpp_kwargs(self) -> Dict[str, Any]: def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task == "text-generation": if inputs["input_ids"].shape[0] != 1: - raise ValueError("Batch size must be 1 for LlamaCpp text generation") + raise ValueError("Batch size must be 1 for Text Generation with llama-cpp-python") return {"tokens": inputs["input_ids"].squeeze(0).tolist()} - elif self.config.task == "feature-extraction": return {"input": [self.pretrained_model.detokenize(x).decode("utf-8") for x in inputs["input_ids"]]} - - raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") + else: + raise ValueError(f"Task {self.config.task} not supported by {self.NAME}") def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any: self.pretrained_model.embed(**inputs) def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]: - next(self.pretrained_model.generate(**inputs)) + generator = self.pretrained_model.generate(**inputs, reset=True) + for _ in range(kwargs["max_new_tokens"]): + next(generator) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]: - generator = self.pretrained_model.generate(**inputs) + generator = self.pretrained_model.generate(**inputs, reset=True) for _ in range(kwargs["max_new_tokens"]): next(generator) diff --git a/optimum_benchmark/backends/llama_cpp/config.py b/optimum_benchmark/backends/llama_cpp/config.py index d2902860..183a86f8 100644 --- a/optimum_benchmark/backends/llama_cpp/config.py +++ b/optimum_benchmark/backends/llama_cpp/config.py @@ -12,6 +12,8 @@ class LlamaCppConfig(BackendConfig): _target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend" no_weights: bool = False + + # llamamodel kwargs filename: Optional[str] = None def __post_init__(self): diff --git a/optimum_benchmark/backends/onnxruntime/backend.py b/optimum_benchmark/backends/onnxruntime/backend.py index 2fffcc36..ce7386b3 100644 --- a/optimum_benchmark/backends/onnxruntime/backend.py +++ b/optimum_benchmark/backends/onnxruntime/backend.py 
@@ -23,13 +23,12 @@ from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend from ..transformers_utils import fast_weights_init from .config import ORTConfig from .utils import ( - TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES, TASKS_TO_ORTMODELS, + TASKS_TO_ORTPIPELINES, format_calibration_config, format_quantization_config, ) @@ -47,28 +46,15 @@ class ORTBackend(Backend[ORTConfig]): def __init__(self, config: ORTConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_ORTMODELS: + if self.config.library != "diffusers" and self.config.task in TASKS_TO_ORTMODELS: self.ort_model_loader = get_class(TASKS_TO_ORTMODELS[self.config.task]) - self.logger.info(f"Using ORT Model class {self.ort_model_loader.__name__}") - elif self.config.task in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES: - if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task]: - self.ort_model_loader = get_class( - TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES[self.config.task][self.config.model_type] - ) - self.logger.info(f"Using ORT Pipeline class {self.ort_model_loader.__name__}") - else: - raise NotImplementedError( - f"ORTBackend does not support model {self.config.model_type} for task {self.config.task}" - ) + self.logger.info(f"Using ORTModel class {self.ort_model_loader.__name__}") + elif self.config.library == "diffusers" and self.config.task in TASKS_TO_ORTPIPELINES: + self.ort_model_loader = get_class(TASKS_TO_ORTPIPELINES[self.config.task]) + self.logger.info(f"Using ORTDiffusionPipeline class {self.ort_model_loader.__name__}") else: raise NotImplementedError(f"ORTBackend does not support task {self.config.task}") - self.session_options = SessionOptions() - if self.config.session_options: - self.logger.info("\t+ Processing session options") - for key, value in self.config.session_options.items(): - setattr(self.session_options, key, value) - def validate_execution_provider(self) -> None: if not self.pretrained_model.providers[0] == self.config.provider: raise ValueError( @@ -117,22 +103,18 @@ def load(self) -> None: def load_ortmodel_from_pretrained(self) -> None: self.pretrained_model = self.ort_model_loader.from_pretrained( self.config.model, - export=self.config.export, - session_options=self.session_options, - provider_options=self.config.provider_options, - use_io_binding=self.config.use_io_binding, - provider=self.config.provider, **self.config.model_kwargs, **self.ortmodel_kwargs, ) def load_ortmodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model + original_export, self.config.export = self.config.export, True + self.logger.info("\t+ Loading no weights ORTModel") self.load_ortmodel_from_pretrained() - - self.config.model = original_model + self.config.export = original_export + self.config.model = original_model @property def is_optimized(self) -> bool: @@ -146,18 +128,36 @@ def is_quantized(self) -> bool: def is_calibrated(self) -> bool: return (self.config.auto_calibration is not None) or self.config.calibration - @property - def is_dp_distributed(self) -> bool: - return is_torch_distributed_available() and torch.distributed.is_initialized() - @property def ortmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} - if self.config.task in TEXT_GENERATION_TASKS: 
+ if self.config.export is not None: + kwargs["export"] = self.config.export + + if self.config.provider is not None: + kwargs["provider"] = self.config.provider + + if self.config.use_cache is not None: kwargs["use_cache"] = self.config.use_cache + + if self.config.use_merged is not None: kwargs["use_merged"] = self.config.use_merged + if self.config.torch_dtype is not None: + kwargs["torch_dtype"] = self.config.torch_dtype + + if self.config.use_io_binding is not None: + kwargs["use_io_binding"] = self.config.use_io_binding + + if self.config.session_options: + kwargs["session_options"] = SessionOptions() + for key, value in self.config.session_options.items(): + setattr(kwargs["session_options"], key, value) + + if self.config.provider_options: + kwargs["provider_options"] = self.config.provider_options + return kwargs @property @@ -223,7 +223,7 @@ def quantize_onnx_files(self) -> None: if self.is_calibrated: self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} + dataset_shapes = {"dataset_size": 2, "sequence_length": 2, "num_choices": 2} calibration_dataset = DatasetGenerator( task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes )() @@ -275,8 +275,10 @@ def quantize_onnx_files(self) -> None: preprocessor=None, file_suffix="", ) + if self.pretrained_processor is not None: self.pretrained_processor.save_pretrained(self.quantized_model) + if self.pretrained_config is not None: self.pretrained_config.save_pretrained(self.quantized_model) @@ -289,14 +291,14 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: with Accelerator().split_between_processes(inputs=inputs, apply_padding=False) as process_inputs: inputs = process_inputs - for key in list(inputs.keys()): - if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: - inputs.pop(key) - for key, value in inputs.items(): if isinstance(value, torch.Tensor): inputs[key] = value.to(self.config.device) + for key in list(inputs.keys()): + if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: + inputs.pop(key) + return inputs @torch.inference_mode() diff --git a/optimum_benchmark/backends/onnxruntime/config.py b/optimum_benchmark/backends/onnxruntime/config.py index 07101f78..4c171a3c 100644 --- a/optimum_benchmark/backends/onnxruntime/config.py +++ b/optimum_benchmark/backends/onnxruntime/config.py @@ -35,19 +35,15 @@ class ORTConfig(BackendConfig): # load options no_weights: bool = False - # export options - export: bool = True - use_cache: bool = True - use_merged: bool = False - torch_dtype: Optional[str] = None - - # provider options + # ortmodel kwargs + export: Optional[bool] = None provider: Optional[str] = None - provider_options: Dict[str, Any] = field(default_factory=dict) - - # inference options + use_cache: Optional[bool] = None + use_merged: Optional[bool] = None + torch_dtype: Optional[str] = None use_io_binding: Optional[bool] = None session_options: Dict[str, Any] = field(default_factory=dict) + provider_options: Dict[str, Any] = field(default_factory=dict) # null, O1, O2, O3, O4 auto_optimization: Optional[str] = None diff --git a/optimum_benchmark/backends/onnxruntime/utils.py b/optimum_benchmark/backends/onnxruntime/utils.py index 6177ae8e..e8cbe1eb 100644 --- a/optimum_benchmark/backends/onnxruntime/utils.py +++ b/optimum_benchmark/backends/onnxruntime/utils.py @@ -7,19 +7,10 @@ task: 
f"optimum.onnxruntime.{task_dict['class'][0].__name__}" for task, task_dict in ORT_SUPPORTED_TASKS.items() } -TASKS_TO_MODEL_TYPES_TO_ORTPIPELINES = { - "text-to-image": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionXLPipeline", - "latent-consistency": "optimum.onnxruntime.ORTLatentConsistencyModelPipeline", - }, - "image-to-image": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionImg2ImgPipeline", - "stable-diffusion-xl": "optimum.onnxruntime.ORTStableDiffusionImg2ImgXLPipeline", - }, - "inpainting": { - "stable-diffusion": "optimum.onnxruntime.ORTStableDiffusionInpaintingPipeline", - }, +TASKS_TO_ORTPIPELINES = { + "inpainting": "optimum.onnxruntime.ORTPipelineForInpainting", + "text-to-image": "optimum.onnxruntime.ORTPipelineForText2Image", + "image-to-image": "optimum.onnxruntime.ORTPipelineForImage2Image", } diff --git a/optimum_benchmark/backends/openvino/backend.py b/optimum_benchmark/backends/openvino/backend.py index f0aa1925..11be6e13 100644 --- a/optimum_benchmark/backends/openvino/backend.py +++ b/optimum_benchmark/backends/openvino/backend.py @@ -1,21 +1,15 @@ -import inspect from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict import torch from hydra.utils import get_class -from openvino.runtime import properties -from optimum.intel.openvino import OVConfig as OVQuantizationConfig # naming conflict -from optimum.intel.openvino import OVQuantizer -from ...generators.dataset_generator import DatasetGenerator from ...import_utils import is_accelerate_available, is_torch_distributed_available -from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend from ..transformers_utils import fast_weights_init -from .config import OVConfig -from .utils import TASKS_TO_MODEL_TYPES_TO_OVPIPELINE, TASKS_TO_OVMODEL +from .config import OVConfig as OVBackendConfig +from .utils import TASKS_TO_OVMODELS, TASKS_TO_OVPIPELINES if is_accelerate_available(): from accelerate import Accelerator @@ -24,73 +18,37 @@ import torch.distributed -class OVBackend(Backend[OVConfig]): +class OVBackend(Backend[OVBackendConfig]): NAME: str = "openvino" - def __init__(self, config: OVConfig) -> None: + def __init__(self, config: OVBackendConfig) -> None: super().__init__(config) - if self.config.task in TASKS_TO_OVMODEL: - self.ovmodel_class = get_class(TASKS_TO_OVMODEL[self.config.task]) + if self.config.library != "diffusers" and self.config.task in TASKS_TO_OVMODELS: + self.ovmodel_class = get_class(TASKS_TO_OVMODELS[self.config.task]) self.logger.info(f"\t+ Using OVModel class {self.ovmodel_class.__name__}") - elif self.config.task in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE: - if self.config.model_type in TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task]: - self.ovmodel_class = get_class( - TASKS_TO_MODEL_TYPES_TO_OVPIPELINE[self.config.task][self.config.model_type] - ) - self.logger.info(f"\t+ Using OVPipeline class {self.ovmodel_class.__name__}") - else: - raise NotImplementedError( - f"OVBackend does not support model {self.config.model_type} for task {self.config.task}" - ) + elif self.config.library == "diffusers" and self.config.task in TASKS_TO_OVPIPELINES: + self.ovmodel_class = get_class(TASKS_TO_OVPIPELINES[self.config.task]) + self.logger.info(f"\t+ Using OVDiffusionPipeline class {self.ovmodel_class.__name__}") else: raise NotImplementedError(f"OVBackend does not support task {self.config.task}") - if 
self.config.inter_op_num_threads is not None: - self.logger.info(f"\t+ Setting inter_op_num_threads to {self.config.inter_op_num_threads}") - self.config.openvino_config[properties.inference_num_threads()] = self.config.inter_op_num_threads - def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - if self.config.quantization: - if self.config.no_weights: - self.logger.info("\t+ Creating no weights AutoModel") - self.create_no_weights_model() - self.logger.info("\t+ Loading no weights AutoModel") - self._load_automodel_with_no_weights() - else: - self.logger.info("\t+ Loading pretrained AutoModel") - self._load_automodel_from_pretrained() - self.logger.info("\t+ Applying post-training quantization") - self.quantize_automodel() - original_model, self.config.model = self.config.model, self.quantized_model - original_export, self.config.export = self.config.export, False - self.logger.info("\t+ Loading quantized OVModel") - self._load_ovmodel_from_pretrained() - self.config.model, self.config.export = original_model, original_export - elif self.config.no_weights: + if self.config.no_weights: self.logger.info("\t+ Creating no weights OVModel") self.create_no_weights_model() self.logger.info("\t+ Loading no weights OVModel") - self._load_ovmodel_with_no_weights() + self.load_ovmodel_with_no_weights() else: self.logger.info("\t+ Loading pretrained OVModel") - self._load_ovmodel_from_pretrained() + self.load_ovmodel_from_pretrained() if self.config.reshape: - static_shapes = { - key: value - for key, value in self.model_shapes.items() - if key in inspect.getfullargspec(self.pretrained_model.reshape).args - } - if ("sequence_length" in static_shapes) and ("height" in static_shapes) and ("width" in static_shapes): - # for vision models, sequence_length is the number of channels - static_shapes["sequence_length"] = self.model_shapes.get("num_channels") - - self.logger.info(f"\t+ Reshaping model with static shapes: {static_shapes}") - self.pretrained_model.reshape(**static_shapes) + self.logger.info("\t+ Reshaping model with static shapes") + self.pretrained_model.reshape(**self.config.reshape_kwargs) if self.config.half: self.logger.info("\t+ Converting model to half precision") @@ -102,79 +60,43 @@ def load(self) -> None: self.tmpdir.cleanup() - def _load_automodel_from_pretrained(self) -> None: - self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) - - def _load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - - with fast_weights_init(): - self._load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model - - def _load_ovmodel_from_pretrained(self) -> None: + def load_ovmodel_from_pretrained(self) -> None: self.pretrained_model = self.ovmodel_class.from_pretrained( self.config.model, - export=self.config.export, - ov_config=self.config.openvino_config, - device=self.config.device, **self.config.model_kwargs, **self.ovmodel_kwargs, ) - def _load_ovmodel_with_no_weights(self) -> None: + def load_ovmodel_with_no_weights(self) -> None: with fast_weights_init(): original_model, self.config.model = self.config.model, self.no_weights_model original_export, self.config.export = self.config.export, True - self.logger.info("\t+ Loading no weights OVModel") - self._load_ovmodel_from_pretrained() + 
self.load_ovmodel_from_pretrained() self.config.export = original_export self.config.model = original_model - def quantize_automodel(self) -> None: - self.logger.info("\t+ Attempting quantization") - self.quantized_model = f"{self.tmpdir.name}/quantized_model" - self.logger.info("\t+ Processing quantization config") - quantization_config = OVQuantizationConfig(**self.config.quantization_config) - self.logger.info("\t+ Creating quantizer") - quantizer = OVQuantizer.from_pretrained(self.pretrained_model, task=self.config.task, seed=self.config.seed) - - if self.config.calibration: - self.logger.info("\t+ Generating calibration dataset") - dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes} - calibration_dataset = DatasetGenerator( - task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes - )() - columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._export_input_names)) - calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed) - else: - calibration_dataset = None - - self.logger.info("\t+ Quantizing model") - quantizer.quantize( - save_directory=self.quantized_model, - quantization_config=quantization_config, - calibration_dataset=calibration_dataset, - # TODO: add support for these (maybe) - remove_unused_columns=True, - data_collator=None, - weights_only=False, - file_name=None, - batch_size=1, - ) - @property def ovmodel_kwargs(self) -> Dict[str, Any]: kwargs = {} - if self.config.task in TEXT_GENERATION_TASKS: + if self.config.export is not None: + kwargs["export"] = self.config.export + + if self.config.use_cache is not None: kwargs["use_cache"] = self.config.use_cache + + if self.config.use_merged is not None: kwargs["use_merged"] = self.config.use_merged + if self.config.load_in_8bit is not None: + kwargs["load_in_8bit"] = self.config.load_in_8bit + + if self.config.load_in_4bit is not None: + kwargs["load_in_4bit"] = self.config.load_in_4bit + + if self.config.ov_config: + kwargs["ov_config"] = self.config.ov_config + return kwargs @property @@ -190,14 +112,6 @@ def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if hasattr(self.pretrained_model, "input_names") and key not in self.pretrained_model.input_names: inputs.pop(key) - if "input_ids" in inputs: - self.model_shapes.update(dict(zip(["batch_size", "sequence_length"], inputs["input_ids"].shape))) - - if "pixel_values" in inputs: - self.model_shapes.update( - dict(zip(["batch_size", "num_channels", "height", "width"], inputs["pixel_values"].shape)) - ) - return inputs def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: diff --git a/optimum_benchmark/backends/openvino/config.py b/optimum_benchmark/backends/openvino/config.py index 7e6eac25..e6716b86 100644 --- a/optimum_benchmark/backends/openvino/config.py +++ b/optimum_benchmark/backends/openvino/config.py @@ -11,28 +11,21 @@ class OVConfig(BackendConfig): version: Optional[str] = openvino_version() _target_: str = "optimum_benchmark.backends.openvino.backend.OVBackend" - # load options no_weights: bool = False - # export options - export: bool = True - use_cache: bool = True - use_merged: bool = False - - # openvino config - openvino_config: Dict[str, Any] = field(default_factory=dict) + # ovmodel kwargs + export: Optional[bool] = None + use_cache: Optional[bool] = None + use_merged: Optional[bool] = None + load_in_8bit: Optional[bool] = None + load_in_4bit: Optional[bool] = None + ov_config: Dict[str, Any] = 
field(default_factory=dict) # compilation options half: bool = False + compile: bool = False reshape: bool = False - - # quantization options - quantization: bool = False - quantization_config: Dict[str, Any] = field(default_factory=dict) - - # calibration options - calibration: bool = False - calibration_config: Dict[str, Any] = field(default_factory=dict) + reshape_kwargs: Dict[str, int] = field(default_factory=dict) def __post_init__(self): super().__post_init__() @@ -42,7 +35,7 @@ def __post_init__(self): raise ValueError(f"OVBackend only supports CPU devices, got {self.device}") if self.intra_op_num_threads is not None: - raise NotImplementedError("OVBackend does not support intra_op_num_threads") + raise NotImplementedError("OVBackend does not support intra_op_num_threads. Please use the ov_config") - if self.quantization and not self.calibration: - raise ValueError("OpenVINO quantization requires enabling calibration.") + if self.inter_op_num_threads is not None: + raise NotImplementedError("OVBackend does not support inter_op_num_threads. Please use the ov_config") diff --git a/optimum_benchmark/backends/openvino/utils.py b/optimum_benchmark/backends/openvino/utils.py index 35518346..e382d724 100644 --- a/optimum_benchmark/backends/openvino/utils.py +++ b/optimum_benchmark/backends/openvino/utils.py @@ -1,4 +1,4 @@ -TASKS_TO_OVMODEL = { +TASKS_TO_OVMODELS = { "fill-mask": "optimum.intel.openvino.OVModelForMaskedLM", "text-generation": "optimum.intel.openvino.OVModelForCausalLM", "text2text-generation": "optimum.intel.openvino.OVModelForSeq2SeqLM", @@ -10,10 +10,8 @@ "audio-classification": "optimum.intel.openvino.OVModelForAudioClassification", "pix2struct": "optimum.intel.openvino.OVModelForPix2Struct", } -TASKS_TO_MODEL_TYPES_TO_OVPIPELINE = { - "text-to-image": { - "lcm": "optimum.intel.openvino.OVLatentConsistencyModelPipeline", - "stable-diffusion": "optimum.intel.openvino.OVStableDiffusionPipeline", - "stable-diffusion-xl": "optimum.intel.openvino.OVStableDiffusionXLPipeline", - }, +TASKS_TO_OVPIPELINES = { + "inpainting": "optimum.intel.openvino.OVPipelineForInpainting", + "text-to-image": "optimum.intel.openvino.OVPipelineForText2Image", + "image-to-image": "optimum.intel.openvino.OVPipelineForImage2Image", } diff --git a/optimum_benchmark/backends/py_txi/config.py b/optimum_benchmark/backends/py_txi/config.py index 73b75b75..dae410c4 100644 --- a/optimum_benchmark/backends/py_txi/config.py +++ b/optimum_benchmark/backends/py_txi/config.py @@ -51,8 +51,8 @@ class PyTXIConfig(BackendConfig): num_shard: Optional[int] = None speculate: Optional[int] = None cuda_graphs: Optional[int] = None - disable_custom_kernels: Optional[bool] = None trust_remote_code: Optional[bool] = None + disable_custom_kernels: Optional[bool] = None # TEI specific pooling: Optional[str] = None diff --git a/optimum_benchmark/backends/tensorrt_llm/backend.py b/optimum_benchmark/backends/tensorrt_llm/backend.py index a05187c3..60b82675 100644 --- a/optimum_benchmark/backends/tensorrt_llm/backend.py +++ b/optimum_benchmark/backends/tensorrt_llm/backend.py @@ -1,12 +1,18 @@ +import os from collections import OrderedDict from tempfile import TemporaryDirectory from typing import Any, Dict +import torch +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from hydra.utils import get_class +from safetensors.torch import save_file +from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend +from ..transformers_utils import fast_weights_init from .config import TRTLLMConfig -from .utils 
import MODEL_TYPE_TO_TRTLLMMODEL +from .utils import MODEL_TYPE_TO_TRTLLMMODELS class TRTLLMBackend(Backend[TRTLLMConfig]): @@ -15,8 +21,8 @@ class TRTLLMBackend(Backend[TRTLLMConfig]): def __init__(self, config: TRTLLMConfig): super().__init__(config) - if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODEL: - self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODEL[self.config.model_type]) + if self.config.model_type in MODEL_TYPE_TO_TRTLLMMODELS: + self.trtllm_loader = get_class(MODEL_TYPE_TO_TRTLLMMODELS[self.config.model_type]) self.logger.info(f"\t+ Using TRTLLMModel class {self.trtllm_loader.__name__}") else: raise NotImplementedError(f"TRTLLMBackend does not support model_type {self.config.model_type}") @@ -25,62 +31,134 @@ def load(self) -> None: self.logger.info("\t+ Creating backend temporary directory") self.tmpdir = TemporaryDirectory() - self.logger.info("\t+ Loading pretrained TRTLLMModel") - self.load_trtmodel_from_pretrained() + if self.config.no_weights: + self.logger.info("\t+ Creating no weights model") + self.create_no_weights_model() + self.logger.info("\t+ Loading no weights model") + self.load_trtllm_with_no_weights() + else: + self.logger.info("\t+ Downloading pretrained model") + self.download_pretrained_model() + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Preparing generation config") + self.prepare_generation_config() + self.logger.info("\t+ Loading pretrained model") + self.load_trtllm_from_pretrained() self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() - def load_trtmodel_from_pretrained(self) -> None: + def download_pretrained_model(self) -> None: + with torch.device("meta"): + self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs) + + def prepare_generation_config(self) -> None: + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + + model_cache_folder = f"models/{self.config.model}".replace("/", "--") + model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}" + snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" + snapshot_ref = open(snapshot_file, "r").read().strip() + model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" + self.generation_config.save_pretrained(save_directory=model_snapshot_path) + + def create_no_weights_model(self) -> None: + self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model") + self.logger.info("\t+ Creating no weights model directory") + os.makedirs(self.no_weights_model, exist_ok=True) + self.logger.info("\t+ Creating no weights model state dict") + state_dict = torch.nn.Linear(1, 1).state_dict() + self.logger.info("\t+ Saving no weights model safetensors") + safetensor = os.path.join(self.no_weights_model, "model.safetensors") + save_file(tensors=state_dict, filename=safetensor, metadata={"format": "pt"}) + self.logger.info("\t+ Saving no weights model pretrained config") + self.pretrained_config.save_pretrained(save_directory=self.no_weights_model) + self.logger.info("\t+ Saving no weights model pretrained processor") + self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model) + # unlike Transformers, TRT-LLM won't accept any missing tensors so we need to materialize the model + self.logger.info(f"\t+ Loading no weights model from {self.no_weights_model}") + with fast_weights_init(): + self.pretrained_model = self.automodel_loader.from_pretrained( + self.no_weights_model, 
**self.config.model_kwargs, device_map="auto", _fast_init=False + ) + self.logger.info("\t+ Saving no weights model") + self.pretrained_model.save_pretrained(save_directory=self.no_weights_model) + del self.pretrained_model + torch.cuda.empty_cache() + + if self.config.task in TEXT_GENERATION_TASKS: + self.logger.info("\t+ Modifying generation config for fixed length generation") + self.generation_config.eos_token_id = None + self.generation_config.pad_token_id = None + self.logger.info("\t+ Saving new pretrained generation config") + self.generation_config.save_pretrained(save_directory=self.no_weights_model) + + def load_trtllm_with_no_weights(self) -> None: + original_model, self.config.model = self.config.model, self.no_weights_model + self.load_trtllm_from_pretrained() + self.config.model = original_model + + def load_trtllm_from_pretrained(self) -> None: self.pretrained_model = self.trtllm_loader.from_pretrained( self.config.model, - tp=self.config.tp, - pp=self.config.pp, - dtype=self.config.dtype, - use_fp8=self.config.use_fp8, - world_size=self.config.world_size, - gpus_per_node=self.config.gpus_per_node, - use_cuda_graph=self.config.use_cuda_graph, - optimization_level=self.config.optimization_level, - max_prompt_length=self.config.max_prompt_length, - max_batch_size=self.config.max_batch_size, - max_new_tokens=self.config.max_new_tokens, - max_beam_width=self.config.max_beam_width, **self.config.model_kwargs, + **self.trtllm_kwargs, ) + @property + def trtllm_kwargs(self): + kwargs = {} + + if self.config.tp is not None: + kwargs["tp"] = self.config.tp + + if self.config.pp is not None: + kwargs["pp"] = self.config.pp + + if self.config.dtype is not None: + kwargs["dtype"] = self.config.dtype + + if self.config.use_fp8 is not None: + kwargs["use_fp8"] = self.config.use_fp8 + + if self.config.world_size is not None: + kwargs["world_size"] = self.config.world_size + + if self.config.gpus_per_node is not None: + kwargs["gpus_per_node"] = self.config.gpus_per_node + + if self.config.max_batch_size is not None: + kwargs["max_batch_size"] = self.config.max_batch_size + + if self.config.max_new_tokens is not None: + kwargs["max_new_tokens"] = self.config.max_new_tokens + + if self.config.max_prompt_length is not None: + kwargs["max_prompt_length"] = self.config.max_prompt_length + + if self.config.optimization_level is not None: + kwargs["optimization_level"] = self.config.optimization_level + + if self.config.use_cuda_graph is not None: + kwargs["use_cuda_graph"] = self.config.use_cuda_graph + + return kwargs + def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate( input_ids=inputs.get("input_ids"), attention_mask=inputs.get("attention_mask"), - min_length=kwargs.get("min_new_tokens", -1), - max_new_tokens=kwargs.get("max_new_tokens", -1), - repetition_penalty=kwargs.get("repetition_penalty", 1.0), - length_penalty=kwargs.get("length_penalty", 1.0), pad_token_id=kwargs.get("pad_token_id", 0), - bos_token_id=kwargs.get("bos_token_id", 1), - eos_token_id=kwargs.get("eos_token_id", 2), - temperature=kwargs.get("temperature", 1.0), - num_beams=kwargs.get("num_beams", 1), - top_p=kwargs.get("top_p", 1.0), - top_k=kwargs.get("top_k", 50), - seed=kwargs.get("seed", 42), + eos_token_id=kwargs.get("eos_token_id", 1), + **kwargs, ) def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict: return self.pretrained_model.generate( input_ids=inputs.get("input_ids"), attention_mask=inputs.get("attention_mask"), 
- min_length=kwargs.get("min_new_tokens", -1), - max_new_tokens=kwargs.get("max_new_tokens", -1), - repetition_penalty=kwargs.get("repetition_penalty", 1.0), - length_penalty=kwargs.get("length_penalty", 1.0), pad_token_id=kwargs.get("pad_token_id", 0), - bos_token_id=kwargs.get("bos_token_id", 1), - eos_token_id=kwargs.get("eos_token_id", 2), - temperature=kwargs.get("temperature", 1.0), - num_beams=kwargs.get("num_beams", 1), - top_p=kwargs.get("top_p", 1.0), - top_k=kwargs.get("top_k", 50), - seed=kwargs.get("seed", 42), + eos_token_id=kwargs.get("eos_token_id", 1), + **kwargs, ) diff --git a/optimum_benchmark/backends/tensorrt_llm/config.py b/optimum_benchmark/backends/tensorrt_llm/config.py index d7f4b1cb..2497d5d4 100644 --- a/optimum_benchmark/backends/tensorrt_llm/config.py +++ b/optimum_benchmark/backends/tensorrt_llm/config.py @@ -4,7 +4,7 @@ from ...import_utils import tesnorrt_llm_version from ..config import BackendConfig -SUPPORTED_DTYPES = ["float16", "bfloat16", "float32"] +SUPPORTED_DTYPES = [None, "float16", "bfloat16", "float32"] @dataclass @@ -13,21 +13,20 @@ class TRTLLMConfig(BackendConfig): version: Optional[str] = tesnorrt_llm_version() _target_: str = "optimum_benchmark.backends.tensorrt_llm.backend.TRTLLMBackend" - # build config - tp: int = 1 - pp: int = 1 - use_fp8: bool = False - dtype: str = "float16" - optimization_level: int = 2 - use_cuda_graph: bool = False - - world_size: int = 1 - gpus_per_node: int = 1 - - max_prompt_length: int = 128 - max_new_tokens: int = -1 - max_batch_size: int = 1 - max_beam_width: int = 1 + no_weights: bool = False + + # trtllm kwargs + tp: Optional[int] = None + pp: Optional[int] = None + dtype: Optional[str] = None + use_fp8: Optional[bool] = None + world_size: Optional[int] = None + gpus_per_node: Optional[int] = None + max_batch_size: Optional[int] = None + max_new_tokens: Optional[int] = None + max_prompt_length: Optional[int] = None + optimization_level: Optional[int] = None + use_cuda_graph: Optional[bool] = None def __post_init__(self) -> None: super().__post_init__() @@ -38,8 +37,13 @@ def __post_init__(self) -> None: if self.dtype not in SUPPORTED_DTYPES: raise ValueError(f"dtype must be one of float16, bfloat16, float32, got {self.dtype}") - if self.gpus_per_node != self.world_size: + if self.gpus_per_node is not None and self.world_size is not None and self.gpus_per_node != self.world_size: raise ValueError(f"gpus_per_node ({self.gpus_per_node}) != world_size ({self.world_size})") - if self.world_size != self.pp * self.tp: + if ( + self.world_size is not None + and self.pp is not None + and self.tp is not None + and self.world_size != self.pp * self.tp + ): raise ValueError(f"world_size ({self.gpus_per_node}) != pp ({self.pp}) * tp ({self.tp})") diff --git a/optimum_benchmark/backends/tensorrt_llm/utils.py b/optimum_benchmark/backends/tensorrt_llm/utils.py index 4574da53..01b6ed0e 100644 --- a/optimum_benchmark/backends/tensorrt_llm/utils.py +++ b/optimum_benchmark/backends/tensorrt_llm/utils.py @@ -1 +1 @@ -MODEL_TYPE_TO_TRTLLMMODEL = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"} +MODEL_TYPE_TO_TRTLLMMODELS = {"llama": "optimum.nvidia.models.llama.LlamaForCausalLM"} diff --git a/optimum_benchmark/backends/torch_ort/backend.py b/optimum_benchmark/backends/torch_ort/backend.py index 61401a75..7b7c1c04 100644 --- a/optimum_benchmark/backends/torch_ort/backend.py +++ b/optimum_benchmark/backends/torch_ort/backend.py @@ -39,19 +39,17 @@ def load(self) -> None: self.tmpdir.cleanup() def 
load_automodel_with_no_weights(self) -> None: - original_model, self.config.model = self.config.model, self.no_weights_model - with fast_weights_init(): + original_model, self.config.model = self.config.model, self.no_weights_model self.load_automodel_from_pretrained() - - self.logger.info("\t+ Tying model weights") - self.pretrained_model.tie_weights() - - self.config.model = original_model + self.pretrained_model.tie_weights() + self.config.model = original_model def load_automodel_from_pretrained(self) -> None: self.pretrained_model = self.automodel_loader.from_pretrained( - self.config.model, **self.automodel_kwargs, **self.config.model_kwargs + self.config.model, + **self.config.model_kwargs, + **self.automodel_kwargs, ).to(self.config.device) @property diff --git a/optimum_benchmark/backends/torch_ort/config.py b/optimum_benchmark/backends/torch_ort/config.py index adc37288..17d2895d 100644 --- a/optimum_benchmark/backends/torch_ort/config.py +++ b/optimum_benchmark/backends/torch_ort/config.py @@ -14,8 +14,9 @@ class TorchORTConfig(BackendConfig): # load options no_weights: bool = False torch_dtype: Optional[str] = None - # sdpa, which has became default of many architectures, fails with torch ort - attn_implementation: Optional[str] = "eager" + attn_implementation: Optional[str] = ( + "eager" # we pin eager because sdpa became default of many architectures, which fails with torch-ort + ) # peft options peft_type: Optional[str] = None diff --git a/optimum_benchmark/backends/vllm/backend.py b/optimum_benchmark/backends/vllm/backend.py index eadd6c0a..7405d4dc 100644 --- a/optimum_benchmark/backends/vllm/backend.py +++ b/optimum_benchmark/backends/vllm/backend.py @@ -6,7 +6,10 @@ import torch from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from safetensors.torch import save_file -from vllm import AsyncEngineArgs, AsyncLLMEngine, EngineArgs, LLMEngine, SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.engine.llm_engine import LLMEngine +from vllm.sampling_params import SamplingParams from ...task_utils import TEXT_GENERATION_TASKS from ..base import Backend @@ -32,7 +35,7 @@ def load(self) -> None: self.logger.info("\t+ Creating no weights model") self.create_no_weights_model() self.logger.info("\t+ Loading no weights model") - self.load_model_with_no_weights() + self.load_vllm_with_no_weights() else: self.logger.info("\t+ Downloading pretrained model") self.download_pretrained_model() @@ -40,7 +43,7 @@ def load(self) -> None: self.logger.info("\t+ Preparing generation config") self.prepare_generation_config() self.logger.info("\t+ Loading pretrained model") - self.load_model_from_pretrained() + self.load_vllm_from_pretrained() self.logger.info("\t+ Cleaning up backend temporary directory") self.tmpdir.cleanup() @@ -52,13 +55,11 @@ def download_pretrained_model(self) -> None: def prepare_generation_config(self) -> None: self.generation_config.eos_token_id = None self.generation_config.pad_token_id = None - model_cache_folder = f"models/{self.config.model}".replace("/", "--") model_cache_path = f"{HUGGINGFACE_HUB_CACHE}/{model_cache_folder}" snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}" snapshot_ref = open(snapshot_file, "r").read().strip() model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}" - self.logger.info("\t+ Saving new pretrained generation config") 
self.generation_config.save_pretrained(save_directory=model_snapshot_path) def create_no_weights_model(self) -> None: @@ -92,17 +93,25 @@ def create_no_weights_model(self) -> None: self.logger.info("\t+ Saving new pretrained generation config") self.generation_config.save_pretrained(save_directory=self.no_weights_model) - def load_model_with_no_weights(self) -> None: + def load_vllm_with_no_weights(self) -> None: original_model, self.config.model = self.config.model, self.no_weights_model - self.logger.info("\t+ Loading no weights model") - self.load_model_from_pretrained() + self.load_vllm_from_pretrained() self.config.model = original_model - def load_model_from_pretrained(self) -> None: + def load_vllm_from_pretrained(self) -> None: if self.config.serving_mode == "offline": - self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.config.to_engine_args())) + self.pretrained_model = LLMEngine.from_engine_args(EngineArgs(**self.vllm_kwargs)) else: - self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.config.to_engine_args())) + self.pretrained_model = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**self.vllm_kwargs)) + + @property + def vllm_kwargs(self): + return { + "model": self.config.model, + "tokenizer": self.config.processor, + "device": self.config.device, + **self.config.engine_args, + } def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]: if self.config.task in TEXT_GENERATION_TASKS: diff --git a/optimum_benchmark/backends/vllm/config.py b/optimum_benchmark/backends/vllm/config.py index 47ae475b..00157220 100644 --- a/optimum_benchmark/backends/vllm/config.py +++ b/optimum_benchmark/backends/vllm/config.py @@ -54,11 +54,3 @@ def __post_init__(self): if self.serving_mode == "online": if self.engine_args.get("disable_log_requests", None) is None: self.engine_args["disable_log_requests"] = True - - def to_engine_args(self) -> Dict[str, Any]: - return dict( - model=self.model, - tokenizer=self.processor, - device=self.device, - **self.engine_args, - ) diff --git a/optimum_benchmark/scenarios/inference/scenario.py b/optimum_benchmark/scenarios/inference/scenario.py index 45461714..d18761b3 100644 --- a/optimum_benchmark/scenarios/inference/scenario.py +++ b/optimum_benchmark/scenarios/inference/scenario.py @@ -118,8 +118,6 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: device_ids=self.backend.config.device_ids, ) - self.run_model_loading_tracking() - self.logger.info(f"\t+ Generating inputs for task {self.backend.config.task}") self.inputs = InputGenerator( task=self.backend.config.task, @@ -127,6 +125,9 @@ def run(self, backend: Backend[BackendConfigT]) -> BenchmarkReport: model_type=self.backend.config.model_type, input_shapes=self.config.input_shapes, )() + + self.run_model_loading_tracking() + self.logger.info(f"\t+ Preparing inputs for backend {self.backend.config.name}") self.inputs = self.backend.prepare_inputs(inputs=self.inputs) diff --git a/setup.py b/setup.py index 46a1ed60..c7f81246 100644 --- a/setup.py +++ b/setup.py @@ -65,10 +65,10 @@ "testing": ["pytest", "hydra-joblib-launcher"], # optimum backends "ipex": [f"optimum[ipex]>={MIN_OPTIMUM_VERSION}"], + "tensorrt-llm": [f"optimum[nvidia]>={MIN_OPTIMUM_VERSION}"], "openvino": [f"optimum[openvino,nncf]>={MIN_OPTIMUM_VERSION}"], "onnxruntime": [f"optimum[onnxruntime]>={MIN_OPTIMUM_VERSION}"], "onnxruntime-gpu": [f"optimum[onnxruntime-gpu]>={MIN_OPTIMUM_VERSION}"], - "neural-compressor": [f"optimum[neural-compressor]>={MIN_OPTIMUM_VERSION}"], 
"torch-ort": ["torch-ort", "onnxruntime-training", f"optimum>={MIN_OPTIMUM_VERSION}"], # other backends "llama-cpp": ["llama-cpp-python"], diff --git a/tests/configs/_diffusers_.yaml b/tests/configs/_diffusers_.yaml index 607b2502..25bcc6be 100644 --- a/tests/configs/_diffusers_.yaml +++ b/tests/configs/_diffusers_.yaml @@ -1,4 +1,7 @@ -backend: - library: diffusers - task: text-to-image - model: hf-internal-testing/tiny-stable-diffusion-torch +hydra: + mode: MULTIRUN + sweeper: + params: + backend.library: diffusers + backend.task: text-to-image + backend.model: hf-internal-testing/tiny-stable-diffusion-torch diff --git a/tests/configs/_export_.yaml b/tests/configs/_export_.yaml new file mode 100644 index 00000000..50f1bb0f --- /dev/null +++ b/tests/configs/_export_.yaml @@ -0,0 +1,2 @@ +backend: + export: true diff --git a/tests/configs/_inc_quant_.yaml b/tests/configs/_inc_quant_.yaml deleted file mode 100644 index 1347abfc..00000000 --- a/tests/configs/_inc_quant_.yaml +++ /dev/null @@ -1,3 +0,0 @@ -backend: - ptq_quantization: true - calibration: true diff --git a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml b/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml deleted file mode 100644 index 7865da6e..00000000 --- a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_decoders.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - # order of inheritance, last one overrides previous ones - - _base_ # inherits from base config - - _cpu_ # inherits from cpu config - - _inference_ # inherits from inference config - - _inc_quant_ # inherits from incremental quantization config - - _text_decoders_ # inherits from text decoders config - - _no_weights_ # inherits from no weights config - - _self_ # hydra 1.1 compatibility - - override backend: neural-compressor - -name: cpu_inference_neural_compressor_text_decoders diff --git a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml b/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml deleted file mode 100644 index 91451cf1..00000000 --- a/tests/configs/cpu_inference_neural_compressor_inc_quant_text_encoders.yaml +++ /dev/null @@ -1,12 +0,0 @@ -defaults: - # order of inheritance, last one overrides previous ones - - _base_ # inherits from base config - - _cpu_ # inherits from cpu config - - _inference_ # inherits from inference config - - _inc_quant_ # inherits from incremental quantization config - - _text_encoders_ # inherits from text encoders config - - _no_weights_ # inherits from no weights config - - _self_ # hydra 1.1 compatibility - - override backend: neural-compressor - -name: cpu_inference_neural_compressor_text_encoders diff --git a/tests/configs/cpu_inference_onnxruntime_diffusers.yaml b/tests/configs/cpu_inference_onnxruntime_diffusers.yaml index 5b44c0f2..852f6a4f 100644 --- a/tests/configs/cpu_inference_onnxruntime_diffusers.yaml +++ b/tests/configs/cpu_inference_onnxruntime_diffusers.yaml @@ -4,6 +4,7 @@ defaults: - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config - _diffusers_ # inherits from diffusers config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml b/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml index 0f0e095c..628a8c40 100644 --- a/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml +++ b/tests/configs/cpu_inference_onnxruntime_ort_quant.yaml @@ 
-5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _ort_quant_ # inherits from ort static quant config - _no_weights_ # inherits from no weights sweep config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml index d87c4552..48a61019 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_decoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_decoders_ # inherits from text decoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml index 5e9bdb9f..a846360b 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_encoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_encoders_ # inherits from text encoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml b/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml index a9b725d4..5c8eeb50 100644 --- a/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml +++ b/tests/configs/cpu_inference_onnxruntime_text_encoders_decoders.yaml @@ -5,6 +5,7 @@ defaults: - _inference_ # inherits from inference config - _text_encoders_decoders_ # inherits from text encoders decoders config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cpu_inference_onnxruntime_timm.yaml b/tests/configs/cpu_inference_onnxruntime_timm.yaml index 9859c540..5e487cf9 100644 --- a/tests/configs/cpu_inference_onnxruntime_timm.yaml +++ b/tests/configs/cpu_inference_onnxruntime_timm.yaml @@ -3,6 +3,7 @@ defaults: - _base_ # inherits from base config - _cpu_ # inherits from cpu config - _inference_ # inherits from inference config + - _export_ # inherits from export config - _timm_ # inherits from timm config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml b/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml index d43725e8..b751e282 100644 --- a/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml +++ b/tests/configs/cuda_inference_onnxruntime_text_decoders.yaml @@ -6,6 +6,7 @@ defaults: - _text_decoders_ # inherits from text decoders sweep config - _device_isolation_ # inherits from device isolation config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime diff --git a/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml b/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml index 2ac7133b..ad36c13a 100644 --- a/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml +++ b/tests/configs/cuda_inference_onnxruntime_text_encoders.yaml @@ -6,6 
+6,7 @@ defaults: - _text_encoders_ # inherits from text encoders sweep config - _device_isolation_ # inherits from device isolation config - _no_weights_ # inherits from no weights config + - _export_ # inherits from export config - _self_ # hydra 1.1 compatibility - override backend: onnxruntime
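
The pattern applied across backends in this patch is the same everywhere: backend config fields become Optional and default to None, and each backend builds its loader kwargs from only the fields that were explicitly set, so unset options fall back to the underlying library's defaults. Below is a minimal, self-contained sketch of that pattern; ExampleBackendConfig, ExampleBackend, and loader_kwargs are hypothetical names for illustration, not the actual optimum-benchmark classes.

from dataclasses import dataclass, field
from typing import Any, Dict, Optional


@dataclass
class ExampleBackendConfig:
    # None means "not passed": the underlying loader keeps its own default.
    export: Optional[bool] = None
    use_cache: Optional[bool] = None
    torch_dtype: Optional[str] = None
    provider_options: Dict[str, Any] = field(default_factory=dict)


class ExampleBackend:
    def __init__(self, config: ExampleBackendConfig) -> None:
        self.config = config

    @property
    def loader_kwargs(self) -> Dict[str, Any]:
        # Forward only the options that were explicitly set, mirroring the
        # ortmodel_kwargs / ovmodel_kwargs / trtllm_kwargs properties added
        # in the diff above (hypothetical re-implementation, not the real code).
        kwargs: Dict[str, Any] = {}
        if self.config.export is not None:
            kwargs["export"] = self.config.export
        if self.config.use_cache is not None:
            kwargs["use_cache"] = self.config.use_cache
        if self.config.torch_dtype is not None:
            kwargs["torch_dtype"] = self.config.torch_dtype
        if self.config.provider_options:
            kwargs["provider_options"] = self.config.provider_options
        return kwargs


if __name__ == "__main__":
    print(ExampleBackend(ExampleBackendConfig()).loader_kwargs)             # {}
    print(ExampleBackend(ExampleBackendConfig(export=True)).loader_kwargs)  # {'export': True}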