diff --git a/requirements-common.txt b/requirements-common.txt index 818f72e14be96..72fb020a82c4e 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 -xgrammar +xgrammar >= 0.1.5; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 23c31fcfd7f05..3340bad38ab73 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -25,6 +26,12 @@ def maybe_backend_fallback( guided_params.backend = "xgrammar" if guided_params.backend == "xgrammar": + # xgrammar only has x86 wheels for linux, fallback to outlines + if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: + logger.warning("xgrammar is only supported on x86 CPUs. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + # xgrammar doesn't support regex or choice, fallback to outlines if guided_params.regex is not None or guided_params.choice is not None: logger.warning( diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 7cb8ac4b0a1e0..419237c252ffd 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,5 +1,5 @@ from .interface import _Backend # noqa: F401 -from .interface import Platform, PlatformEnum, UnspecifiedPlatform +from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform current_platform: Platform @@ -120,4 +120,4 @@ def cuda_is_jetson() -> bool: else: current_platform = UnspecifiedPlatform() -__all__ = ['Platform', 'PlatformEnum', 'current_platform'] +__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index eac2b413f9271..0be7df7941b8b 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,4 +1,5 @@ import enum +import platform import random from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union @@ -37,6 +38,14 @@ class PlatformEnum(enum.Enum): UNSPECIFIED = enum.auto() +class CpuArchEnum(enum.Enum): + X86 = enum.auto() + ARM = enum.auto() + POWERPC = enum.auto() + OTHER = enum.auto() + UNKNOWN = enum.auto() + + class DeviceCapability(NamedTuple): major: int minor: int @@ -184,6 +193,23 @@ def verify_quantization(cls, quant: str) -> None: f"{quant} quantization is currently not supported in " f"{cls.device_name}.") + @classmethod + def get_cpu_architecture(cls) -> CpuArchEnum: + """ + Determine the CPU architecture of the current system. + Returns CpuArchEnum indicating the architecture type. + """ + machine = platform.machine().lower() + + if machine in ("x86_64", "amd64", "i386", "i686"): + return CpuArchEnum.X86 + elif machine.startswith("arm") or machine.startswith("aarch"): + return CpuArchEnum.ARM + elif machine.startswith("ppc"): + return CpuArchEnum.POWERPC + + return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED