diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 06b8542779dc0..f6d276fe7c0c8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -112,9 +112,7 @@ class EngineArgs:
     pipeline_parallel_size: int = 1
     tensor_parallel_size: int = 1
     max_parallel_loading_workers: Optional[int] = None
-    # NOTE(kzawora): default block size for Gaudi should be 128
-    # smaller sizes still work, but very inefficiently
-    block_size: int = 16 if not current_platform.is_hpu() else 128
+    block_size: Optional[int] = None
     enable_prefix_caching: Optional[bool] = None
     disable_sliding_window: bool = False
     use_v2_block_manager: bool = True
@@ -1036,9 +1034,7 @@ def create_engine_config(self,
             self.enable_prefix_caching = False
 
         cache_config = CacheConfig(
-            # neuron needs block_size = max_model_len
-            block_size=self.block_size if self.device != "neuron" else
-            (self.max_model_len if self.max_model_len is not None else 0),
+            block_size=self.block_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
             swap_space=self.swap_space,
             cache_dtype=self.kv_cache_dtype,
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index aad8755d9fcd8..d95a2b4cd5565 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -60,6 +60,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
 
         cache_config = vllm_config.cache_config
 
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE
 
         if kv_cache_space >= 0:
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index ae1fd6d5ce068..3c5350b778345 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -137,6 +137,10 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             else:
                 parallel_config.worker_cls = "vllm.worker.worker.Worker"
 
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 2b947d280f9f8..0a44f2b74163a 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -48,6 +48,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
 
+        # NOTE(kzawora): default block size for Gaudi should be 128
+        # smaller sizes still work, but very inefficiently
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 128
+
     @classmethod
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 86113523385f6..a4bbbd27c8a89 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -33,6 +33,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
 
+        cache_config = vllm_config.cache_config
+        if cache_config:
+            # neuron needs block_size = max_model_len
+            vllm_config.cache_config.block_size = \
+                vllm_config.model_config.max_model_len
+
     @classmethod
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on Neuron.")
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index ccd94e8adb3b1..16eb8dc81efc2 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -87,6 +87,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         # check and update cache config
         ov_core = ov.Core()
         cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
             if not OpenVinoPlatform.is_openvino_cpu():
                 logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is"
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 0133f26a0b1bc..7778b565372cb 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -84,6 +84,10 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         parallel_config = vllm_config.parallel_config
         scheduler_config = vllm_config.scheduler_config
         if parallel_config.worker_cls == "auto":
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 10d874349f36b..77f5c8401424b 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -46,6 +46,11 @@ def inference_mode(cls):
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.config import CompilationLevel
+
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         compilation_config = vllm_config.compilation_config
         if compilation_config.level == CompilationLevel.NO_COMPILATION:
             # TPU does not support NO_COMPILATION
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index c20190e789d7e..78e17c2afec65 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -51,6 +51,10 @@ def inference_mode():
 
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        cache_config = vllm_config.cache_config
+        if cache_config and cache_config.block_size is None:
+            cache_config.block_size = 16
+
         # check and update model config
         model_config = vllm_config.model_config
         if model_config.dtype == torch.bfloat16:
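
A minimal standalone sketch (not vLLM code) of the resolution order this patch establishes: `EngineArgs.block_size` now defaults to `None`, and each platform's `check_and_update_config` fills in its own default only when the user left the value unset. The simplified `CacheConfig` dataclass and the `hpu_default`/`cuda_default` helpers below are illustrative stand-ins for the per-platform hooks in the diff, not real vLLM APIs.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class CacheConfig:
    # Mirrors the patched EngineArgs default: None means "let the platform decide".
    block_size: Optional[int] = None


def hpu_default(cache_config: CacheConfig) -> None:
    # Same pattern as vllm/platforms/hpu.py in this diff: fill in the
    # platform default only when the user did not pass a block size.
    if cache_config and cache_config.block_size is None:
        cache_config.block_size = 128  # Gaudi prefers 128


def cuda_default(cache_config: CacheConfig) -> None:
    # Same pattern as vllm/platforms/cuda.py in this diff.
    if cache_config and cache_config.block_size is None:
        cache_config.block_size = 16


# An explicitly requested block size is never overridden ...
explicit = CacheConfig(block_size=32)
hpu_default(explicit)
assert explicit.block_size == 32

# ... while an unset block size picks up the platform default.
unset = CacheConfig()
hpu_default(unset)
assert unset.block_size == 128
```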