Optional backend kwargs (#307)
IlyasMoutawwakil authored Dec 11, 2024
1 parent a2700a8 commit 4c23a0d
Showing 47 changed files with 350 additions and 410 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test_api_rocm.yaml
@@ -33,6 +33,7 @@ jobs:
with:
machine_type: single-gpu
install_extras: testing,timm,diffusers,codecarbon
test_file: test_api.py
pytest_keywords: api and cuda
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
6 changes: 3 additions & 3 deletions .github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -44,7 +44,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing]
pip install -e .[testing,tensorrt-llm]
- name: Run tests
run: |
@@ -57,7 +57,7 @@
}}
name: Run examples
run: |
huggingface-cli delete-cache
rm -rf /root/.cache/huggingface
pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +84,7 @@

- name: Install dependencies
run: |
pip install -e .[testing]
pip install -e .[testing,tensorrt-llm]
- name: Run tests (sequential)
run: |
2 changes: 2 additions & 0 deletions .github/workflows/test_cli_rocm_pytorch.yaml
@@ -35,6 +35,7 @@ jobs:
with:
machine_type: single-gpu
install_extras: testing,diffusers,timm,peft,autoawq,auto-gptq
test_file: test_cli.py
pytest_keywords: cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb

run_cli_rocm_pytorch_multi_gpu_tests:
@@ -52,4 +53,5 @@
with:
machine_type: multi-gpu
install_extras: testing,diffusers,timm,peft
test_file: test_cli.py
pytest_keywords: cli and cuda and pytorch and (dp or ddp or device_map)
4 changes: 2 additions & 2 deletions examples/cpu_ipex_bert.yaml
@@ -17,8 +17,8 @@ launcher:
backend:
device: cpu
export: true
no_weights: false # because on multi-node machines, initializing weights could harm performance
torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
model: google-bert/bert-base-uncased

scenario:
4 changes: 2 additions & 2 deletions examples/cpu_ipex_llama.yaml
@@ -17,8 +17,8 @@ launcher:
backend:
device: cpu
export: true
no_weights: false # because on multi-node machines, initializing weights could harm performance
torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
20 changes: 0 additions & 20 deletions examples/cpu_onnxruntime_timm.yaml

This file was deleted.

5 changes: 4 additions & 1 deletion examples/cpu_openvino_8bit_bert.yaml
@@ -12,8 +12,11 @@ backend:
device: cpu
reshape: true
no_weights: true
load_in_8bit: false # enable 8bit on compatible Intel CPU machines
load_in_8bit: true
model: google-bert/bert-base-uncased
reshape_kwargs:
batch_size: 1
sequence_length: 128

scenario:
memory: true
5 changes: 5 additions & 0 deletions examples/cpu_openvino_diffusion.yaml
@@ -11,9 +11,14 @@ name: openvino_diffusion
backend:
device: cpu
export: true
task: text-to-image
model: stabilityai/stable-diffusion-2-1
half: false # enable half-precision on compatible Intel CPU machines

scenario:
input_shapes:
batch_size: 1
sequence_length: 16

call_kwargs:
num_inference_steps: 4
1 change: 1 addition & 0 deletions examples/cuda_tgi_llama.yaml
@@ -16,6 +16,7 @@ backend:
device: cuda
device_ids: 0
cuda_graphs: 0 # remove for better perf but bigger memory footprint
no_weights: false # investigate later
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
1 change: 1 addition & 0 deletions examples/cuda_trt_llama.yaml
@@ -15,6 +15,7 @@ launcher:
backend:
device: cuda
device_ids: 0
no_weights: true
max_batch_size: 4
max_new_tokens: 32
max_prompt_length: 64
3 changes: 2 additions & 1 deletion examples/cuda_vllm_llama.yaml
@@ -15,7 +15,8 @@ launcher:
backend:
device: cuda
device_ids: 0
serving_mode: online # server-like
no_weights: true
serving_mode: online
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
engine_args:
enforce_eager: true # remove for better perf but bigger memory footprint
12 changes: 6 additions & 6 deletions examples/mps_pytorch_bert.yaml
@@ -8,14 +8,14 @@ defaults:

name: mps_pytorch_bert

backend:
device: mps
no_weights: true
model: bert-base-uncased

scenario:
latency: true
memory: true
latency: true
input_shapes:
batch_size: 1
sequence_length: 128

backend:
device: mps
no_weights: true
model: bert-base-uncased
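The example YAMLs above are Hydra configs consumed by the optimum-benchmark CLI, and the same benchmark can be driven from Python. Below is a minimal sketch mirroring mps_pytorch_bert.yaml; the export names (Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig) are assumed from the project README, not from this diff.

```python
# Sketch only: the Python-API equivalent of the mps_pytorch_bert.yaml example above.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig

if __name__ == "__main__":
    backend_config = PyTorchConfig(model="bert-base-uncased", device="mps", no_weights=True)
    scenario_config = InferenceConfig(
        memory=True,
        latency=True,
        input_shapes={"batch_size": 1, "sequence_length": 128},
    )
    launcher_config = ProcessConfig()  # process launcher assumed; the example's defaults are not shown in this diff
    benchmark_config = BenchmarkConfig(
        name="mps_pytorch_bert",
        backend=backend_config,
        scenario=scenario_config,
        launcher=launcher_config,
    )
    benchmark_report = Benchmark.launch(benchmark_config)  # returns a report with the latency/memory measurements
```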
7 changes: 0 additions & 7 deletions optimum_benchmark/backends/base.py
@@ -106,13 +106,6 @@ def create_no_weights_model(self) -> None:
self.logger.info("\t+ Saving no weights model's config")
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)

def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare and register the input shapes before they are used by the model.
It can be used to pad the inputs to the correct shape, or compile them to the correct format.
"""
return input_shapes

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare and register the inputs before passing them to the model.
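For context, prepare_inputs (kept above) is the hook that concrete backends override to adapt the scenario-generated inputs before a run. A hypothetical, standalone sketch of the kind of transformation such an override performs; the function below is illustrative and not part of this commit.

```python
# Hypothetical illustration of a prepare_inputs-style transformation; not code from this commit.
from typing import Any, Dict

import torch


def prepare_inputs(inputs: Dict[str, Any], device: str = "cpu") -> Dict[str, Any]:
    # Move scenario-generated tensors to the target device and leave everything else untouched.
    return {name: value.to(device) if isinstance(value, torch.Tensor) else value for name, value in inputs.items()}
```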
39 changes: 13 additions & 26 deletions optimum_benchmark/backends/ipex/backend.py
@@ -9,7 +9,7 @@
from ..base import Backend
from ..transformers_utils import fast_weights_init
from .config import IPEXConfig
from .utils import TASKS_TO_IPEXMODEL
from .utils import TASKS_TO_IPEXMODELS

if is_accelerate_available():
from accelerate import Accelerator
@@ -24,8 +24,8 @@ class IPEXBackend(Backend[IPEXConfig]):
def __init__(self, config: IPEXConfig) -> None:
super().__init__(config)

if self.config.task in TASKS_TO_IPEXMODEL:
self.ipexmodel_class = get_class(TASKS_TO_IPEXMODEL[self.config.task])
if self.config.task in TASKS_TO_IPEXMODELS:
self.ipexmodel_class = get_class(TASKS_TO_IPEXMODELS[self.config.task])
self.logger.info(f"\t+ Using IPEXModel class {self.ipexmodel_class.__name__}")
else:
raise NotImplementedError(f"IPEXBackend does not support task {self.config.task}")
@@ -38,48 +38,35 @@ def load(self) -> None:
self.logger.info("\t+ Creating no weights IPEXModel")
self.create_no_weights_model()
self.logger.info("\t+ Loading no weights IPEXModel")
self._load_ipexmodel_with_no_weights()
self.load_ipexmodel_with_no_weights()
else:
self.logger.info("\t+ Loading pretrained IPEXModel")
self._load_ipexmodel_from_pretrained()
self.load_ipexmodel_from_pretrained()

self.tmpdir.cleanup()

def _load_automodel_from_pretrained(self) -> None:
self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs)

def _load_automodel_with_no_weights(self) -> None:
original_model, self.config.model = self.config.model, self.no_weights_model

with fast_weights_init():
self._load_automodel_from_pretrained()

self.logger.info("\t+ Tying model weights")
self.pretrained_model.tie_weights()

self.config.model = original_model

def _load_ipexmodel_from_pretrained(self) -> None:
def load_ipexmodel_from_pretrained(self) -> None:
self.pretrained_model = self.ipexmodel_class.from_pretrained(
self.config.model,
export=self.config.export,
**self.config.model_kwargs,
**self.automodel_kwargs,
**self.ipexmodel_kwargs,
)

def _load_ipexmodel_with_no_weights(self) -> None:
def load_ipexmodel_with_no_weights(self) -> None:
with fast_weights_init():
original_model, self.config.model = self.config.model, self.no_weights_model
original_export, self.config.export = self.config.export, True
self.logger.info("\t+ Loading no weights IPEXModel")
self._load_ipexmodel_from_pretrained()
self.load_ipexmodel_from_pretrained()
self.config.export = original_export
self.config.model = original_model

@property
def automodel_kwargs(self) -> Dict[str, Any]:
def ipexmodel_kwargs(self) -> Dict[str, Any]:
kwargs = {}

if self.config.export:
kwargs["export"] = self.config.export

if self.config.torch_dtype is not None:
kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype)

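The kwargs property above only forwards a setting to from_pretrained when the corresponding config field is set, which is what makes the backend kwargs optional. A compact, self-contained illustration of that pattern follows; ExampleConfig and model_kwargs are hypothetical names, not part of the library.

```python
# Illustration of the "optional backend kwargs" pattern; ExampleConfig is hypothetical.
from dataclasses import dataclass
from typing import Any, Dict, Optional

import torch


@dataclass
class ExampleConfig:
    export: Optional[bool] = None
    torch_dtype: Optional[str] = None


def model_kwargs(config: ExampleConfig) -> Dict[str, Any]:
    kwargs: Dict[str, Any] = {}
    # Only forward kwargs the user explicitly set, so the loader's own defaults apply otherwise.
    if config.export is not None:
        kwargs["export"] = config.export
    if config.torch_dtype is not None:
        kwargs["torch_dtype"] = getattr(torch, config.torch_dtype)
    return kwargs


print(model_kwargs(ExampleConfig(torch_dtype="bfloat16")))  # {'torch_dtype': torch.bfloat16}
```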
8 changes: 4 additions & 4 deletions optimum_benchmark/backends/ipex/config.py
@@ -13,17 +13,17 @@ class IPEXConfig(BackendConfig):
version: Optional[str] = ipex_version()
_target_: str = "optimum_benchmark.backends.ipex.backend.IPEXBackend"

# load options
no_weights: bool = False
torch_dtype: Optional[str] = None

# export options
export: bool = True
# ipexmodel kwargs
export: Optional[bool] = None
torch_dtype: Optional[str] = None

def __post_init__(self):
super().__post_init__()

self.device = self.device.lower()

if self.device not in ["cpu", "gpu"]:
raise ValueError(f"IPEXBackend only supports CPU devices, got {self.device}")

2 changes: 1 addition & 1 deletion optimum_benchmark/backends/ipex/utils.py
@@ -1,4 +1,4 @@
TASKS_TO_IPEXMODEL = {
TASKS_TO_IPEXMODELS = {
"fill-mask": "optimum.intel.IPEXModelForMaskedLM",
"text-generation": "optimum.intel.IPEXModelForCausalLM",
"feature-extraction": "optimum.intel.IPEXModel",
17 changes: 9 additions & 8 deletions optimum_benchmark/backends/llama_cpp/backend.py
@@ -28,37 +28,38 @@ def load_model_from_pretrained(self) -> None:
"""

self.pretrained_model = Llama.from_pretrained(
repo_id=self.config.model,
filename=self.config.filename,
self.config.model,
**self.llama_cpp_kwargs,
)

@property
def llama_cpp_kwargs(self) -> Dict[str, Any]:
return {
"embedding": self.config.task == "feature-extraction",
"filename": self.config.filename,
"verbose": False,
"echo": False,
}

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.task == "text-generation":
if inputs["input_ids"].shape[0] != 1:
raise ValueError("Batch size must be 1 for LlamaCpp text generation")
raise ValueError("Batch size must be 1 for Text Generation with llama-cpp-python")
return {"tokens": inputs["input_ids"].squeeze(0).tolist()}

elif self.config.task == "feature-extraction":
return {"input": [self.pretrained_model.detokenize(x).decode("utf-8") for x in inputs["input_ids"]]}

raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
else:
raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")

def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any:
self.pretrained_model.embed(**inputs)

def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
next(self.pretrained_model.generate(**inputs))
generator = self.pretrained_model.generate(**inputs, reset=True)
for _ in range(kwargs["max_new_tokens"]):
next(generator)

def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
generator = self.pretrained_model.generate(**inputs)
generator = self.pretrained_model.generate(**inputs, reset=True)
for _ in range(kwargs["max_new_tokens"]):
next(generator)
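For reference, the prefill/generate loops above drive llama-cpp-python's token-level generator. A minimal standalone sketch of the same pattern follows; the repo id, filename glob, and 32-token budget are illustrative, not taken from this commit.

```python
# Standalone sketch of the llama-cpp-python generator pattern used above; model choice is illustrative.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # any GGUF repo on the Hub
    filename="*Q4_K_M.gguf",
    verbose=False,
)

tokens = llm.tokenize(b"Hello, world")
generator = llm.generate(tokens, reset=True)  # reset=True starts from a fresh context

new_tokens = [next(generator) for _ in range(32)]  # cap new tokens, like kwargs["max_new_tokens"] above
print(llm.detokenize(new_tokens).decode("utf-8", errors="ignore"))
```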
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/llama_cpp/config.py
@@ -12,6 +12,8 @@ class LlamaCppConfig(BackendConfig):
_target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend"

no_weights: bool = False

# llamamodel kwargs
filename: Optional[str] = None

def __post_init__(self):