Optional backend kwargs (#307)
IlyasMoutawwakil authored Dec 11, 2024
1 parent a2700a8 commit 4c23a0d
Showing 47 changed files with 350 additions and 410 deletions.
1 change: 1 addition & 0 deletions .github/workflows/test_api_rocm.yaml
@@ -33,6 +33,7 @@ jobs:
with:
machine_type: single-gpu
install_extras: testing,timm,diffusers,codecarbon
test_file: test_api.py
pytest_keywords: api and cuda
secrets:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
6 changes: 3 additions & 3 deletions .github/workflows/test_cli_cuda_tensorrt_llm.yaml
@@ -44,7 +44,7 @@ jobs:

- name: Install dependencies
run: |
pip install -e .[testing]
pip install -e .[testing,tensorrt-llm]
- name: Run tests
run: |
@@ -57,7 +57,7 @@
}}
name: Run examples
run: |
huggingface-cli delete-cache
rm -rf /root/.cache/huggingface
pytest tests/test_examples.py -x -s -k "cli and cuda and trt"
cli_cuda_tensorrt_llm_multi_gpu_tests:
@@ -84,7 +84,7 @@

- name: Install dependencies
run: |
pip install -e .[testing]
pip install -e .[testing,tensorrt-llm]
- name: Run tests (sequential)
run: |
2 changes: 2 additions & 0 deletions .github/workflows/test_cli_rocm_pytorch.yaml
@@ -35,6 +35,7 @@ jobs:
with:
machine_type: single-gpu
install_extras: testing,diffusers,timm,peft,autoawq,auto-gptq
test_file: test_cli.py
pytest_keywords: cli and cuda and pytorch and not (dp or ddp or device_map or deepspeed) and not bnb

run_cli_rocm_pytorch_multi_gpu_tests:
@@ -52,4 +53,5 @@
with:
machine_type: multi-gpu
install_extras: testing,diffusers,timm,peft
test_file: test_cli.py
pytest_keywords: cli and cuda and pytorch and (dp or ddp or device_map)
4 changes: 2 additions & 2 deletions examples/cpu_ipex_bert.yaml
@@ -17,8 +17,8 @@ launcher:
backend:
device: cpu
export: true
no_weights: false # because on multi-node machines, initializing weights could harm performance
torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
model: google-bert/bert-base-uncased

scenario:
4 changes: 2 additions & 2 deletions examples/cpu_ipex_llama.yaml
@@ -17,8 +17,8 @@ launcher:
backend:
device: cpu
export: true
no_weights: false # because on multi-node machines, initializing weights could harm performance
torch_dtype: float32 # but use bfloat16 on compatible Intel CPUs
no_weights: false # on multi-node machines, initializing weights in the benchmark could harm performance
torch_dtype: float32 # use bfloat16 on compatible Intel CPUs
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
20 changes: 0 additions & 20 deletions examples/cpu_onnxruntime_timm.yaml

This file was deleted.

5 changes: 4 additions & 1 deletion examples/cpu_openvino_8bit_bert.yaml
@@ -12,8 +12,11 @@ backend:
device: cpu
reshape: true
no_weights: true
load_in_8bit: false # enable 8bit on compatible Intel CPU machines
load_in_8bit: true
model: google-bert/bert-base-uncased
reshape_kwargs:
batch_size: 1
sequence_length: 128

scenario:
memory: true
5 changes: 5 additions & 0 deletions examples/cpu_openvino_diffusion.yaml
@@ -11,9 +11,14 @@ name: openvino_diffusion
backend:
device: cpu
export: true
task: text-to-image
model: stabilityai/stable-diffusion-2-1
half: false # enable half-precision on compatible Intel CPU machines

scenario:
input_shapes:
batch_size: 1
sequence_length: 16

call_kwargs:
num_inference_steps: 4
1 change: 1 addition & 0 deletions examples/cuda_tgi_llama.yaml
@@ -16,6 +16,7 @@ backend:
device: cuda
device_ids: 0
cuda_graphs: 0 # remove for better perf but bigger memory footprint
no_weights: false # investigate later
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

scenario:
1 change: 1 addition & 0 deletions examples/cuda_trt_llama.yaml
@@ -15,6 +15,7 @@ launcher:
backend:
device: cuda
device_ids: 0
no_weights: true
max_batch_size: 4
max_new_tokens: 32
max_prompt_length: 64
3 changes: 2 additions & 1 deletion examples/cuda_vllm_llama.yaml
@@ -15,7 +15,8 @@ launcher:
backend:
device: cuda
device_ids: 0
serving_mode: online # server-like
no_weights: true
serving_mode: online
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
engine_args:
enforce_eager: true # remove for better perf but bigger memory footprint
12 changes: 6 additions & 6 deletions examples/mps_pytorch_bert.yaml
@@ -8,14 +8,14 @@ defaults:

name: mps_pytorch_bert

backend:
device: mps
no_weights: true
model: bert-base-uncased

scenario:
latency: true
memory: true
latency: true
input_shapes:
batch_size: 1
sequence_length: 128

backend:
device: mps
no_weights: true
model: bert-base-uncased
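The example YAMLs above are Hydra configs consumed by the optimum-benchmark CLI, and the same benchmark can be driven from Python. Below is a minimal sketch mirroring mps_pytorch_bert.yaml; the export names (Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig) are assumed from the project README, not from this diff.

```python
# Sketch only: the Python-API equivalent of the mps_pytorch_bert.yaml example above.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, ProcessConfig, PyTorchConfig

if __name__ == "__main__":
    backend_config = PyTorchConfig(model="bert-base-uncased", device="mps", no_weights=True)
    scenario_config = InferenceConfig(
        memory=True,
        latency=True,
        input_shapes={"batch_size": 1, "sequence_length": 128},
    )
    launcher_config = ProcessConfig()  # process launcher assumed; the example's defaults are not shown in this diff
    benchmark_config = BenchmarkConfig(
        name="mps_pytorch_bert",
        backend=backend_config,
        scenario=scenario_config,
        launcher=launcher_config,
    )
    benchmark_report = Benchmark.launch(benchmark_config)  # returns a report with the latency/memory measurements
```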
7 changes: 0 additions & 7 deletions optimum_benchmark/backends/base.py
@@ -106,13 +106,6 @@ def create_no_weights_model(self) -> None:
self.logger.info("\t+ Saving no weights model's config")
self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)

def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare and register the input shapes before they are used by the model.
It can be used to pad the inputs to the correct shape, or compile them to the correct format.
"""
return input_shapes

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
"""
This method is used to prepare and register the inputs before passing them to the model.
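For context, prepare_inputs (kept above) is the hook that concrete backends override to adapt the scenario-generated inputs before a run. A hypothetical, standalone sketch of the kind of transformation such an override performs; the function below is illustrative and not part of this commit.

```python
# Hypothetical illustration of a prepare_inputs-style transformation; not code from this commit.
from typing import Any, Dict

import torch


def prepare_inputs(inputs: Dict[str, Any], device: str = "cpu") -> Dict[str, Any]:
    # Move scenario-generated tensors to the target device and leave everything else untouched.
    return {name: value.to(device) if isinstance(value, torch.Tensor) else value for name, value in inputs.items()}
```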
39 changes: 13 additions & 26 deletions optimum_benchmark/backends/ipex/backend.py
@@ -9,7 +9,7 @@
from ..base import Backend
from ..transformers_utils import fast_weights_init
from .config import IPEXConfig
from .utils import TASKS_TO_IPEXMODEL
from .utils import TASKS_TO_IPEXMODELS

if is_accelerate_available():
from accelerate import Accelerator
@@ -24,8 +24,8 @@ class IPEXBackend(Backend[IPEXConfig]):
def __init__(self, config: IPEXConfig) -> None:
super().__init__(config)

if self.config.task in TASKS_TO_IPEXMODEL:
self.ipexmodel_class = get_class(TASKS_TO_IPEXMODEL[self.config.task])
if self.config.task in TASKS_TO_IPEXMODELS:
self.ipexmodel_class = get_class(TASKS_TO_IPEXMODELS[self.config.task])
self.logger.info(f"\t+ Using IPEXModel class {self.ipexmodel_class.__name__}")
else:
raise NotImplementedError(f"IPEXBackend does not support task {self.config.task}")
@@ -38,48 +38,35 @@ def load(self) -> None:
self.logger.info("\t+ Creating no weights IPEXModel")
self.create_no_weights_model()
self.logger.info("\t+ Loading no weights IPEXModel")
self._load_ipexmodel_with_no_weights()
self.load_ipexmodel_with_no_weights()
else:
self.logger.info("\t+ Loading pretrained IPEXModel")
self._load_ipexmodel_from_pretrained()
self.load_ipexmodel_from_pretrained()

self.tmpdir.cleanup()

def _load_automodel_from_pretrained(self) -> None:
self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs)

def _load_automodel_with_no_weights(self) -> None:
original_model, self.config.model = self.config.model, self.no_weights_model

with fast_weights_init():
self._load_automodel_from_pretrained()

self.logger.info("\t+ Tying model weights")
self.pretrained_model.tie_weights()

self.config.model = original_model

def _load_ipexmodel_from_pretrained(self) -> None:
def load_ipexmodel_from_pretrained(self) -> None:
self.pretrained_model = self.ipexmodel_class.from_pretrained(
self.config.model,
export=self.config.export,
**self.config.model_kwargs,
**self.automodel_kwargs,
**self.ipexmodel_kwargs,
)

def _load_ipexmodel_with_no_weights(self) -> None:
def load_ipexmodel_with_no_weights(self) -> None:
with fast_weights_init():
original_model, self.config.model = self.config.model, self.no_weights_model
original_export, self.config.export = self.config.export, True
self.logger.info("\t+ Loading no weights IPEXModel")
self._load_ipexmodel_from_pretrained()
self.load_ipexmodel_from_pretrained()
self.config.export = original_export
self.config.model = original_model

@property
def automodel_kwargs(self) -> Dict[str, Any]:
def ipexmodel_kwargs(self) -> Dict[str, Any]:
kwargs = {}

if self.config.export:
kwargs["export"] = self.config.export

if self.config.torch_dtype is not None:
kwargs["torch_dtype"] = getattr(torch, self.config.torch_dtype)

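The kwargs property above only forwards a setting to from_pretrained when the corresponding config field is set, which is what makes the backend kwargs optional. A compact, self-contained illustration of that pattern follows; ExampleConfig and model_kwargs are hypothetical names, not part of the library.

```python
# Illustration of the "optional backend kwargs" pattern; ExampleConfig is hypothetical.
from dataclasses import dataclass
from typing import Any, Dict, Optional

import torch


@dataclass
class ExampleConfig:
    export: Optional[bool] = None
    torch_dtype: Optional[str] = None


def model_kwargs(config: ExampleConfig) -> Dict[str, Any]:
    kwargs: Dict[str, Any] = {}
    # Only forward kwargs the user explicitly set, so the loader's own defaults apply otherwise.
    if config.export is not None:
        kwargs["export"] = config.export
    if config.torch_dtype is not None:
        kwargs["torch_dtype"] = getattr(torch, config.torch_dtype)
    return kwargs


print(model_kwargs(ExampleConfig(torch_dtype="bfloat16")))  # {'torch_dtype': torch.bfloat16}
```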
8 changes: 4 additions & 4 deletions optimum_benchmark/backends/ipex/config.py
@@ -13,17 +13,17 @@ class IPEXConfig(BackendConfig):
version: Optional[str] = ipex_version()
_target_: str = "optimum_benchmark.backends.ipex.backend.IPEXBackend"

# load options
no_weights: bool = False
torch_dtype: Optional[str] = None

# export options
export: bool = True
# ipexmodel kwargs
export: Optional[bool] = None
torch_dtype: Optional[str] = None

def __post_init__(self):
super().__post_init__()

self.device = self.device.lower()

if self.device not in ["cpu", "gpu"]:
raise ValueError(f"IPEXBackend only supports CPU devices, got {self.device}")

2 changes: 1 addition & 1 deletion optimum_benchmark/backends/ipex/utils.py
@@ -1,4 +1,4 @@
TASKS_TO_IPEXMODEL = {
TASKS_TO_IPEXMODELS = {
"fill-mask": "optimum.intel.IPEXModelForMaskedLM",
"text-generation": "optimum.intel.IPEXModelForCausalLM",
"feature-extraction": "optimum.intel.IPEXModel",
17 changes: 9 additions & 8 deletions optimum_benchmark/backends/llama_cpp/backend.py
@@ -28,37 +28,38 @@ def load_model_from_pretrained(self) -> None:
"""

self.pretrained_model = Llama.from_pretrained(
repo_id=self.config.model,
filename=self.config.filename,
self.config.model,
**self.llama_cpp_kwargs,
)

@property
def llama_cpp_kwargs(self) -> Dict[str, Any]:
return {
"embedding": self.config.task == "feature-extraction",
"filename": self.config.filename,
"verbose": False,
"echo": False,
}

def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.task == "text-generation":
if inputs["input_ids"].shape[0] != 1:
raise ValueError("Batch size must be 1 for LlamaCpp text generation")
raise ValueError("Batch size must be 1 for Text Generation with llama-cpp-python")
return {"tokens": inputs["input_ids"].squeeze(0).tolist()}

elif self.config.task == "feature-extraction":
return {"input": [self.pretrained_model.detokenize(x).decode("utf-8") for x in inputs["input_ids"]]}

raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
else:
raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")

def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any:
self.pretrained_model.embed(**inputs)

def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
next(self.pretrained_model.generate(**inputs))
generator = self.pretrained_model.generate(**inputs, reset=True)
for _ in range(kwargs["max_new_tokens"]):
next(generator)

def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
generator = self.pretrained_model.generate(**inputs)
generator = self.pretrained_model.generate(**inputs, reset=True)
for _ in range(kwargs["max_new_tokens"]):
next(generator)
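For reference, the prefill/generate loops above drive llama-cpp-python's token-level generator. A minimal standalone sketch of the same pattern follows; the repo id, filename glob, and 32-token budget are illustrative, not taken from this commit.

```python
# Standalone sketch of the llama-cpp-python generator pattern used above; model choice is illustrative.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # any GGUF repo on the Hub
    filename="*Q4_K_M.gguf",
    verbose=False,
)

tokens = llm.tokenize(b"Hello, world")
generator = llm.generate(tokens, reset=True)  # reset=True starts from a fresh context

new_tokens = [next(generator) for _ in range(32)]  # cap new tokens, like kwargs["max_new_tokens"] above
print(llm.detokenize(new_tokens).decode("utf-8", errors="ignore"))
```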
2 changes: 2 additions & 0 deletions optimum_benchmark/backends/llama_cpp/config.py
@@ -12,6 +12,8 @@ class LlamaCppConfig(BackendConfig):
_target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend"

no_weights: bool = False

# llamamodel kwargs
filename: Optional[str] = None

def __post_init__(self):