diff --git a/examples/cuda_tgi_llama.yaml b/examples/cuda_tgi_llama.yaml
index a3d33af5..16d2f5f0 100644
--- a/examples/cuda_tgi_llama.yaml
+++ b/examples/cuda_tgi_llama.yaml
@@ -16,9 +16,8 @@ backend:
   device: cuda
   device_ids: 0
   cuda_graphs: 0 # remove for better perf but bigger memory footprint
-  no_weights: true
+  no_weights: false
   model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
 
 scenario:
   input_shapes:
diff --git a/optimum_benchmark/backends/py_txi/backend.py b/optimum_benchmark/backends/py_txi/backend.py
index 0f52bc50..00e1044d 100644
--- a/optimum_benchmark/backends/py_txi/backend.py
+++ b/optimum_benchmark/backends/py_txi/backend.py
@@ -46,16 +46,12 @@ def download_pretrained_model(self) -> None:
 
     def create_no_weights_model(self) -> None:
         self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
+        filename = os.path.join(self.no_weights_model, "model.safetensors")
         os.makedirs(self.no_weights_model, exist_ok=True)
         self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
         self.pretrained_processor.save_pretrained(save_directory=self.no_weights_model)
 
-        if self.config.task in TEXT_GENERATION_TASKS:
-            self.generation_config.eos_token_id = None
-            self.generation_config.pad_token_id = None
-            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
 
-        filename = os.path.join(self.no_weights_model, "model.safetensors")
         save_model(model=torch.nn.Linear(1, 1), filename=filename, metadata={"format": "pt"})
         with fast_weights_init():
             # unlike Transformers, TXI won't accept any missing tensors so we need to materialize the model
@@ -66,6 +62,11 @@ def create_no_weights_model(self) -> None:
         del self.pretrained_model
         torch.cuda.empty_cache()
 
+        if self.config.task in TEXT_GENERATION_TASKS:
+            self.generation_config.eos_token_id = None
+            self.generation_config.pad_token_id = None
+            self.generation_config.save_pretrained(save_directory=self.no_weights_model)
+
     def load_model_with_no_weights(self) -> None:
         self.config.volumes = {self.no_weights_model: {"bind": "/no_weights_model/", "mode": "rw"}}
         original_model, self.config.model = self.config.model, "/no_weights_model/"
diff --git a/tests/configs/cpu_inference_py_txi_gpt2.yaml b/tests/configs/cpu_inference_py_txi_gpt2.yaml
index 23c18416..1aef598e 100644
--- a/tests/configs/cpu_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cpu_inference_py_txi_gpt2.yaml
@@ -9,7 +9,3 @@ defaults:
   - override backend: py-txi
 
 name: cpu_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1
diff --git a/tests/configs/cuda_inference_py_txi_gpt2.yaml b/tests/configs/cuda_inference_py_txi_gpt2.yaml
index 75e20094..73a5c10a 100644
--- a/tests/configs/cuda_inference_py_txi_gpt2.yaml
+++ b/tests/configs/cuda_inference_py_txi_gpt2.yaml
@@ -3,13 +3,8 @@ defaults:
   - _base_ # inherits from base config
   - _cuda_ # inherits from cuda config
   - _inference_ # inherits from inference config
-  - _no_weights_ # inherits from no weights config
   - _gpt2_ # inherits from gpt2 config
   - _self_ # hydra 1.1 compatibility
   - override backend: py-txi
 
 name: cuda_inference_py_txi_gpt2
-
-backend:
-  cuda_graphs: 0
-  image: ghcr.io/huggingface/text-generation-inference:2.4.1