diff --git a/.github/workflows/test_cli_cpu_neural_compressor.yaml b/.github/workflows/test_cli_cpu_neural_compressor.yaml
deleted file mode 100644
index 435f4216..00000000
--- a/.github/workflows/test_cli_cpu_neural_compressor.yaml
+++ /dev/null
@@ -1,51 +0,0 @@
-name: CLI CPU Intel Neural Compressor Tests
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-      - main
-  pull_request:
-    branches:
-      - main
-    types:
-      - opened
-      - reopened
-      - synchronize
-      - labeled
-      - unlabeled
-
-concurrency:
-  cancel-in-progress: true
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
-
-jobs:
-  run_cli_cpu_neural_compressor_tests:
-    if: ${{
-      (github.event_name == 'push') ||
-      (github.event_name == 'workflow_dispatch') ||
-      contains( github.event.pull_request.labels.*.name, 'cli') ||
-      contains( github.event.pull_request.labels.*.name, 'cpu') ||
-      contains( github.event.pull_request.labels.*.name, 'neural_compressor') ||
-      contains( github.event.pull_request.labels.*.name, 'cli_cpu_neural_compressor')
-      }}
-
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Set up Python 3.10
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-
-      - name: Install requirements
-        run: |
-          pip install --upgrade pip
-          pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          pip install -e .[testing,neural-compressor,diffusers,timm]
-
-      - name: Run tests
-        run: pytest tests/test_cli.py -s -k "cli and cpu and neural_compressor"
diff --git a/optimum_benchmark/__init__.py b/optimum_benchmark/__init__.py
index 313fb22a..7be4c4c3 100644
--- a/optimum_benchmark/__init__.py
+++ b/optimum_benchmark/__init__.py
@@ -1,9 +1,7 @@
 from .backends import (
     BackendConfig,
-    INCConfig,
     IPEXConfig,
     LlamaCppConfig,
-    LLMSwarmConfig,
     ORTConfig,
     OVConfig,
     PyTorchConfig,
@@ -26,10 +24,8 @@
     "EnergyStarConfig",
     "InferenceConfig",
     "IPEXConfig",
-    "INCConfig",
     "InlineConfig",
     "LauncherConfig",
-    "LLMSwarmConfig",
     "ORTConfig",
     "OVConfig",
     "ProcessConfig",
diff --git a/optimum_benchmark/backends/__init__.py b/optimum_benchmark/backends/__init__.py
index ec146f0b..2019270a 100644
--- a/optimum_benchmark/backends/__init__.py
+++ b/optimum_benchmark/backends/__init__.py
@@ -1,8 +1,6 @@
 from .config import BackendConfig
 from .ipex.config import IPEXConfig
 from .llama_cpp.config import LlamaCppConfig
-from .llm_swarm.config import LLMSwarmConfig
-from .neural_compressor.config import INCConfig
 from .onnxruntime.config import ORTConfig
 from .openvino.config import OVConfig
 from .py_txi.config import PyTXIConfig
@@ -18,9 +16,7 @@
     "OVConfig",
     "TorchORTConfig",
     "TRTLLMConfig",
-    "INCConfig",
     "PyTXIConfig",
-    "LLMSwarmConfig",
     "BackendConfig",
     "VLLMConfig",
     "LlamaCppConfig",
diff --git a/optimum_benchmark/backends/llm_swarm/__init__.py b/optimum_benchmark/backends/llm_swarm/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/optimum_benchmark/backends/llm_swarm/backend.py b/optimum_benchmark/backends/llm_swarm/backend.py
deleted file mode 100644
index 8139e4ea..00000000
--- a/optimum_benchmark/backends/llm_swarm/backend.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import asyncio
-from typing import Any, Dict, List
-
-import torch
-from huggingface_hub import AsyncInferenceClient
-from llm_swarm import LLMSwarm
-from llm_swarm import LLMSwarmConfig as LLMSwarmCfg
-
-from ...task_utils import TEXT_GENERATION_TASKS
-from ..base import Backend
-from .config import LLMSwarmConfig
-
-
-class LLMSwarmBackend(Backend[LLMSwarmConfig]):
-    NAME: str = "llm-swarm"
-
-    def __init__(self, config: LLMSwarmConfig) -> None:
-        super().__init__(config)
-
-        if self.config.task not in TEXT_GENERATION_TASKS:
-            raise NotImplementedError(f"LLM Swarm does not support task {self.config.task}")
-
-    def load(self) -> None:
-        self.logger.info("\t+ Downloading pretrained model")
-        self.download_pretrained_model()
-        self.logger.info("\t+ Preparing generation config")
-        self.prepare_generation_config()
-        self.logger.info("\t+ Loading pretrained model")
-        self.load_model_from_pretrained()
-
-    def load_model_from_pretrained(self) -> None:
-        self.llm_swarm_config = LLMSwarmCfg(
-            gpus=self.config.gpus,
-            model=self.config.model,
-            instances=self.config.instances,
-            inference_engine=self.config.inference_engine,
-            slurm_template_path=self.config.slurm_template_path,
-            load_balancer_template_path=self.config.load_balancer_template_path,
-            per_instance_max_parallel_requests=self.config.per_instance_max_parallel_requests,
-            revision=self.config.model_kwargs.get("revision", "main"),
-            debug_endpoint=self.config.debug_endpoint,
-        )
-        self.llm_swarm = LLMSwarm(self.llm_swarm_config).__enter__()
-        self.client = AsyncInferenceClient(self.llm_swarm.endpoint)
-
-    def download_pretrained_model(self) -> None:
-        with torch.device("meta"):
-            self.auto_model_loader.from_pretrained(self.config.model, **self.config.model_kwargs)
-
-    def prepare_generation_config(self) -> None:
-        self.generation_config.eos_token_id = -100
-        self.generation_config.pad_token_id = -100
-
-        model_cache_folder = f"models/{self.config.model}".replace("/", "--")
-        model_cache_path = f"{self.config.volume}/{model_cache_folder}"
-        snapshot_file = f"{model_cache_path}/refs/{self.config.model_kwargs.get('revision', 'main')}"
-        snapshot_ref = open(snapshot_file, "r").read().strip()
-        model_snapshot_path = f"{model_cache_path}/snapshots/{snapshot_ref}"
-        self.logger.info("\t+ Saving new pretrained generation config")
-        self.generation_config.save_pretrained(save_directory=model_snapshot_path)
-
-    def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        if "inputs" in inputs:
-            inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["inputs"].tolist())}
-        elif "input_ids" in inputs:
-            inputs = {"prompt": self.pretrained_processor.batch_decode(inputs["input_ids"].tolist())}
-        else:
-            raise ValueError("inputs must contain either input_ids or inputs")
-
-        return inputs
-
-    async def single_client_call(self, prompt: str, kwargs: Dict[str, Any]) -> str:
-        return await self.client.text_generation(prompt, max_new_tokens=kwargs.get("max_new_tokens", 1))
-
-    async def batch_client_call(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]:
-        return await asyncio.gather(*(self.single_client_call(p, kwargs) for p in inputs["prompt"]))
-
-    def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]:
-        return asyncio.run(self.batch_client_call(inputs, kwargs))
-
-    def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]:
-        return asyncio.run(self.batch_client_call(inputs, kwargs))
-
-    def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> List[str]:
-        return asyncio.run(self.batch_client_call(inputs, kwargs))
diff --git a/optimum_benchmark/backends/llm_swarm/config.py b/optimum_benchmark/backends/llm_swarm/config.py
deleted file mode 100644
index 745cdd3f..00000000
--- a/optimum_benchmark/backends/llm_swarm/config.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional
-
-from ...import_utils import llm_swarm_version
-from ..config import BackendConfig
-
-
-@dataclass
-class LLMSwarmConfig(BackendConfig):
-    name: str = "llm-swarm"
-    version: Optional[str] = llm_swarm_version()
-    _target_: str = "optimum_benchmark.backends.llm_swarm.backend.LLMSwarmBackend"
-
-    # optimum benchmark specific
-    no_weights: bool = False
-
-    # llm-swarm specific
-    gpus: int = 8
-    instances: int = 1
-    inference_engine: str = "tgi"
-    volume: str = "/fsx/ilyas/.cache"
-    per_instance_max_parallel_requests: int = 500
-    slurm_template_path: str = "/fsx/ilyas/swarm-templates/tgi_h100.template.slurm"
-    load_balancer_template_path: str = "/fsx/ilyas/swarm-templates/nginx.template.conf"
-    debug_endpoint: Optional[str] = None
-
-    def __post_init__(self):
-        super().__post_init__()
-
-        # so that downloaded artifacts are stored in the same place
-        self.hub_kwargs["cache_dir"] = self.volume
diff --git a/optimum_benchmark/backends/neural_compressor/__init__.py b/optimum_benchmark/backends/neural_compressor/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/optimum_benchmark/backends/neural_compressor/backend.py b/optimum_benchmark/backends/neural_compressor/backend.py
deleted file mode 100644
index c180a5ba..00000000
--- a/optimum_benchmark/backends/neural_compressor/backend.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import os
-from collections import OrderedDict
-from tempfile import TemporaryDirectory
-from typing import Any, Dict
-
-import torch
-from hydra.utils import get_class
-from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion
-from optimum.intel.neural_compressor.quantization import INCQuantizer
-
-from ...generators.dataset_generator import DatasetGenerator
-from ..base import Backend
-from ..transformers_utils import fast_weights_init
-from .config import INCConfig
-from .utils import TASKS_TO_INCMODELS
-
-
-class INCBackend(Backend[INCConfig]):
-    NAME: str = "neural-compressor"
-
-    def __init__(self, config: INCConfig):
-        super().__init__(config)
-
-        if self.config.task in TASKS_TO_INCMODELS:
-            self.incmodel_class = get_class(TASKS_TO_INCMODELS[self.config.task])
-            self.logger.info(f"Using INCModel class {self.incmodel_class.__name__}")
-        else:
-            raise NotImplementedError(f"INCBackend does not support task {self.config.task}")
-
-    def load(self) -> None:
-        self.logger.info("\t+ Creating backend temporary directory")
-        self.tmpdir = TemporaryDirectory()
-
-        if self.config.ptq_quantization:
-            if self.config.no_weights:
-                self.logger.info("\t+ Creating no weights AutoModel")
-                self.create_no_weights_model()
-                self.logger.info("\t+ Loading no weights AutoModel")
-                self.load_automodel_with_no_weights()
-            else:
-                self.logger.info("\t+ Loading pretrained AutoModel")
-                self.load_automodel_from_pretrained()
-            self.logger.info("\t+ Applying post-training quantization")
-            self.quantize_automodel()
-            self.logger.info("\t+ Loading quantized INCModel")
-            original_model, self.config.model = self.config.model, self.quantized_model
-            self.load_incmodel_from_pretrained()
-            self.config.model = original_model
-        elif self.config.no_weights:
-            self.logger.info("\t+ Creating no weights INCModel")
-            self.create_no_weights_model()
-            self.logger.info("\t+ Loading no weights INCModel")
-            self.load_incmodel_with_no_weights()
-        else:
-            self.logger.info("\t+ Loading pretrained INCModel")
-            self.load_incmodel_from_pretrained()
-
-        self.tmpdir.cleanup()
-
-    def load_automodel_from_pretrained(self) -> None:
-        self.pretrained_model = self.automodel_loader.from_pretrained(self.config.model, **self.config.model_kwargs)
-
-    def load_automodel_with_no_weights(self) -> None:
-        original_model, self.config.model = self.config.model, self.no_weights_model
-
-        with fast_weights_init():
-            self.load_automodel_from_pretrained()
-
-        self.logger.info("\t+ Tying model weights")
-        self.pretrained_model.tie_weights()
-
-        self.config.model = original_model
-
-    def load_incmodel_from_pretrained(self) -> None:
-        self.pretrained_model = self.incmodel_class.from_pretrained(self.config.model, **self.config.model_kwargs)
-
-    def load_incmodel_with_no_weights(self) -> None:
-        original_model, self.config.model = self.config.model, self.no_weights_model
-
-        with fast_weights_init():
-            self.load_incmodel_from_pretrained()
-
-        self.logger.info("\t+ Tying model weights")
-        self.pretrained_model.model.tie_weights()
-
-        self.config.model = original_model
-
-    def create_no_weights_model(self) -> None:
-        self.no_weights_model = os.path.join(self.tmpdir.name, "no_weights_model")
-        self.logger.info("\t+ Creating no weights model directory")
-        os.makedirs(self.no_weights_model, exist_ok=True)
-        self.logger.info("\t+ Creating no weights model state dict")
-        state_dict = torch.nn.Linear(1, 1).state_dict()
-        self.logger.info("\t+ Saving no weights model pytorch_model.bin")
-        torch.save(state_dict, os.path.join(self.no_weights_model, "pytorch_model.bin"))
-        self.logger.info("\t+ Saving no weights model pretrained config")
-        self.pretrained_config.save_pretrained(save_directory=self.no_weights_model)
-
-    def quantize_automodel(self) -> None:
-        self.quantized_model = f"{self.tmpdir.name}/quantized_model"
-        self.logger.info("\t+ Processing quantization config")
-        ptq_quantization_config = self.config.ptq_quantization_config.copy()
-        ptq_quantization_config["accuracy_criterion"] = AccuracyCriterion(
-            **ptq_quantization_config["accuracy_criterion"]
-        )
-        ptq_quantization_config["tuning_criterion"] = TuningCriterion(**ptq_quantization_config["tuning_criterion"])
-        ptq_quantization_config = PostTrainingQuantConfig(**ptq_quantization_config)
-        self.logger.info("\t+ Creating quantizer")
-        quantizer = INCQuantizer.from_pretrained(
-            model=self.pretrained_model,
-            task=self.config.task,
-            seed=self.config.seed,
-            # TODO: add support for these
-            calibration_fn=None,
-            eval_fn=None,
-        )
-
-        if self.config.calibration:
-            self.logger.info("\t+ Generating calibration dataset")
-            dataset_shapes = {"dataset_size": 1, "sequence_length": 1, **self.model_shapes}
-            calibration_dataset = DatasetGenerator(
-                task=self.config.task, dataset_shapes=dataset_shapes, model_shapes=self.model_shapes
-            )()
-            columns_to_be_removed = list(set(calibration_dataset.column_names) - set(quantizer._signature_columns))
-            calibration_dataset = calibration_dataset.remove_columns(columns_to_be_removed)
-        else:
-            calibration_dataset = None
-
-        self.logger.info("\t+ Quantizing model")
-        quantizer.quantize(
-            save_directory=self.quantized_model,
-            calibration_dataset=calibration_dataset,
-            quantization_config=ptq_quantization_config,
-            # TODO: add support for these
-            remove_unused_columns=True,
-            data_collator=None,
-            file_name=None,
-            batch_size=1,
-        )
-
-    @torch.inference_mode()
-    def forward(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        return self.pretrained_model(**input, **kwargs)
-
-    @torch.inference_mode()
-    def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        return self.pretrained_model.generate(**inputs, **kwargs)
-
-    @torch.inference_mode()
-    def generate(self, input: Dict[str, Any], kwargs: Dict[str, Any]) -> OrderedDict:
-        return self.pretrained_model.generate(**input, **kwargs)
diff --git a/optimum_benchmark/backends/neural_compressor/config.py b/optimum_benchmark/backends/neural_compressor/config.py
deleted file mode 100644
index 8aea5964..00000000
--- a/optimum_benchmark/backends/neural_compressor/config.py
+++ /dev/null
@@ -1,71 +0,0 @@
-from dataclasses import dataclass, field
-from typing import Any, Dict, Optional
-
-from omegaconf import OmegaConf
-
-from ...import_utils import neural_compressor_version
-from ..config import BackendConfig
-
-# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L490
-ACCURACY_CRITERION_CONFIG = {"higher_is_better": True, "criterion": "relative", "tolerable_loss": 0.01}
-
-# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L593
-TUNING_CRITERION_CONFIG = {
-    "strategy": "basic",
-    "strategy_kwargs": None,
-    "timeout": 0,
-    "max_trials": 100,
-    "objective": "performance",
-}
-
-# https://github.com/intel/neural-compressor/blob/master/neural_compressor/config.py#L1242
-PTQ_QUANTIZATION_CONFIG = {
-    "device": "cpu",
-    "backend": "default",
-    "domain": "auto",
-    "recipes": {},
-    "quant_format": "default",
-    "inputs": [],
-    "outputs": [],
-    "approach": "static",
-    "calibration_sampling_size": [100],
-    "op_type_dict": None,
-    "op_name_dict": None,
-    "reduce_range": None,
-    "example_inputs": None,
-    "excluded_precisions": [],
-    "quant_level": "auto",
-    "accuracy_criterion": ACCURACY_CRITERION_CONFIG,
-    "tuning_criterion": TUNING_CRITERION_CONFIG,
-}
-
-
-@dataclass
-class INCConfig(BackendConfig):
-    name: str = "neural-compressor"
-    version: Optional[str] = neural_compressor_version()
-    _target_: str = "optimum_benchmark.backends.neural_compressor.backend.INCBackend"
-
-    # load options
-    no_weights: bool = False
-
-    # post-training quantization options
-    ptq_quantization: bool = False
-    ptq_quantization_config: Dict[str, Any] = field(default_factory=dict)
-
-    # calibration options
-    calibration: bool = False
-    calibration_config: Dict[str, Any] = field(default_factory=dict)
-
-    def __post_init__(self):
-        super().__post_init__()
-
-        if self.device != "cpu":
-            raise ValueError(f"INCBackend only supports CPU devices, got {self.device}")
-
-        if self.ptq_quantization:
-            self.ptq_quantization_config = OmegaConf.to_object(
-                OmegaConf.merge(PTQ_QUANTIZATION_CONFIG, self.ptq_quantization_config)
-            )
-            if self.ptq_quantization_config["approach"] == "static" and not self.calibration:
-                raise ValueError("Calibration must be enabled when using static quantization.")
diff --git a/optimum_benchmark/backends/neural_compressor/utils.py b/optimum_benchmark/backends/neural_compressor/utils.py
deleted file mode 100644
index beb99977..00000000
--- a/optimum_benchmark/backends/neural_compressor/utils.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS
-
-TASKS_TO_INCMODELS = {
-    task: f"optimum.intel.neural_compressor.{incmodel_name}" for task, incmodel_name in _HEAD_TO_AUTOMODELS.items()
-}