From 874ce6287f45fd4abbf347ac8a041e1e5ff566a5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 03:17:56 +0000 Subject: [PATCH 01/18] Fix incorrect model initialization Signed-off-by: DarkLight1337 --- vllm/model_executor/models/fuyu.py | 7 +++++-- vllm/model_executor/models/internlm2_ve.py | 8 +------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 50701793b7b83..31fc098a8bb3f 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -41,7 +41,8 @@ from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 71011 @@ -245,7 +246,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): gather_output=True, ) self.language_model = PersimmonForCausalLM( - vllm_config.with_hf_config(config.text_config)) + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 34889d691a934..f1b7c896cadfe 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -161,11 +161,5 @@ class InternLM2VEForCausalLM(InternLM2ForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.model = InternLM2VEModel(config, - cache_config, - quant_config, + self.model = InternLM2VEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) From 0789f613710a1d7e5c6390ff1aca1e63057a5377 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 06:44:39 +0000 Subject: [PATCH 02/18] Fix minicpmv Signed-off-by: DarkLight1337 --- vllm/model_executor/models/minicpmv.py | 32 +++++++------------------- 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 999739ccd98bf..fd8eda997f76f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -382,11 +382,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): instantiated. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config quant_config = vllm_config.quant_config @@ -699,12 +695,8 @@ def is_default_weight_loading(self, name: str) -> bool: class MiniCPMV2_0(MiniCPMVBaseModel): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 0) def init_llm( @@ -857,12 +849,8 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 5) def init_llm( @@ -999,12 +987,8 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 6) def init_llm( @@ -1117,7 +1101,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): + def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: From 7afe8debd91eebd46e6b2d720d920dd531deda8d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 06:45:19 +0000 Subject: [PATCH 03/18] Add HF example model registry Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 6 +- tests/models/registry.py | 189 ++++++++++++++++++++ tests/models/test_registry.py | 9 + vllm/model_executor/models/registry.py | 9 +- 4 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 tests/models/registry.py diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5d566f8308b70..c49ed9802cde8 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -166,14 +166,14 @@ def iter_params(self, model_name: str): "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), - "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), + "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "adept/persimmon-8b-chat": PPTestSettings.fast(), + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py new file mode 100644 index 0000000000000..e40fca047bacd --- /dev/null +++ b/tests/models/registry.py @@ -0,0 +1,189 @@ +from dataclasses import dataclass, field +from typing import AbstractSet, Mapping + + +@dataclass(frozen=True) +class _HfExamplesInfo: + default: str + """The default model to use for testing this architecture.""" + + extras: Mapping[str, str] = field(default_factory=dict) + """Extra models to use for testing this architecture.""" + + is_available_online: bool = True + """ + Set this to ``False`` if the name of this architecture no longer exists on + the HF repo. To maintain backwards compatibility, we have not removed them + from the main model registry, so without this flag the registry tests will + fail. + """ + + trust_remote_code: bool = False + """The ``trust_remote_code`` level required to load the model.""" + + +# yapf: disable +_TEXT_GENERATION_EXAMPLE_MODELS = { + # [Decoder-only] + "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", + trust_remote_code=True), + "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", + trust_remote_code=True), + "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", + trust_remote_code=True), + "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", + trust_remote_code=True), + "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", + trust_remote_code=True), + "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), + # ChatGLMModel supports multimodal + "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", + trust_remote_code=True), + "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), + "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct", + trust_remote_code=True), + "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"), + "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat", # noqa: E501 + trust_remote_code=True), + "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), + "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), + "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), + "GPT2LMHeadModel": _HfExamplesInfo("gpt2"), + "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"), + "GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"), + "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"), + "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), + "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), + "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b"), + "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", + trust_remote_code=True), + "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B"), + "JAISLMHeadModel": 
_HfExamplesInfo("inceptionai/jais-13b-chat"), + "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), + "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), + "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", + is_available_online=False), + "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), + "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 + "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", + trust_remote_code=True), + "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", + trust_remote_code=True), + "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), + "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501 + "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501 + "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), + "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), + "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), + "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), + "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), + "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"), + "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", + trust_remote_code=True), + "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), + "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), + "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), + "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", + trust_remote_code=True), + "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", + trust_remote_code=True), + # QWenLMHeadModel supports multimodal + "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), + "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", + is_available_online=False), + "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 + is_available_online=False), + "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), + "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), + "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", + trust_remote_code=True), + # [Encoder-decoder] + "BartModel": _HfExamplesInfo("facebook/bart-base"), + "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + "Florence2ForConditionalGeneration": _HfExamplesInfo("Xenova/tiny-random-Florence2ForConditionalGeneration"), # noqa: E501 +} + +_EMBEDDING_EXAMPLE_MODELS = { + # [Text-only] + "BertModel": _HfExamplesInfo("google-bert/bert-base"), + "Gemma2Model": _HfExamplesInfo("gBAAI/bge-multilingual-gemma2"), + "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 + # [Multimodal] + "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), # noqa: E501 + "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", + trust_remote_code=True), + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1") # noqa: E501, +} + +_MULTIMODAL_EXAMPLE_MODELS = { + # [Decoder-only] + 
"Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 + "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 + "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", + extras={"text_only": "THUDM/chatglm3-6b"}, + trust_remote_code=True), + "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", + is_available_online=False), + "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), + "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), + "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", + trust_remote_code=True), + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501 + "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", + extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 + "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 + "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 + "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + trust_remote_code=True), + "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", + trust_remote_code=True), + "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", + trust_remote_code=True), + "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 + "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True), + "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409"), # noqa: E501 + "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", + extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 + trust_remote_code=True), + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + # [Encoder-decoder] + "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 +} + +_SPECULATIVE_DECODING_EXAMPLE_MODELS = { + "EAGLEModel": _HfExamplesInfo("abhigoyal/vllm-eagle-llama-68m-random"), + "MedusaModel": _HfExamplesInfo("abhigoyal/vllm-medusa-llama-68m-random"), + "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("ibm-fms/llama-160m-accelerator"), # noqa: E501 +} + +_EXAMPLE_MODELS = { + **_TEXT_GENERATION_EXAMPLE_MODELS, + **_EMBEDDING_EXAMPLE_MODELS, + **_MULTIMODAL_EXAMPLE_MODELS, + **_SPECULATIVE_DECODING_EXAMPLE_MODELS, +} + + +class HfExampleModels: + def __init__(self, hf_models: Mapping[str, _HfExamplesInfo]) -> None: + super().__init__() + + self.hf_models = hf_models + + def get_supported_archs(self) -> AbstractSet[str]: + return self.hf_models.keys() + + def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: + return self.hf_models[model_arch] + + +HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index a2194fa15f90e..d59ae06d4e649 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -14,6 +14,7 @@ from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test +from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", 
ModelRegistry.get_supported_archs()) @@ -73,3 +74,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): "This model no longer initializes CUDA on import. " "Please test using a different one.", stacklevel=2) + + +def test_hf_registry_coverage(): + untested_archs = (HF_EXAMPLE_MODELS.get_supported_archs() - + set(ModelRegistry.get_supported_archs())) + + assert not untested_archs, ("Please add the following architectures to " + f"`tests/models/registry.py: {untested_archs}") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f172c06c4a26a..e11e1b419a2bd 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,3 +1,8 @@ +""" +Whenever you add an architecture to this page, please also update +`vllm/model_executor/models/registry.py` with example HuggingFace models +for that architecture. +""" import importlib import os import pickle @@ -58,14 +63,14 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), - "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), - "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), From 2434999fb680dde0a63b6ca12e263bc3af82b949 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 06:48:50 +0000 Subject: [PATCH 04/18] Add model initialization tests Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 10 +++++++++- tests/models/test_initialization.py | 20 ++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 tests/models/test_initialization.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fbaa427bb7270..904d69f354f3b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -320,7 +320,15 @@ steps: commands: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models/*.py --ignore=models/test_oot_registration.py + - pytest -v -s models/test_registry.py + +- label: Model Initialization Test # 1h + optional: true + source_file_dependencies: + - vllm/ + - tests/models + commands: + - pytest -v -s models/test_initialization.py - label: Decoder-only Language Models Test (Standard) # 18min #mirror_hardwares: [amd] diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py new file mode 100644 index 0000000000000..8a0373e851a28 --- /dev/null +++ b/tests/models/test_initialization.py @@ -0,0 +1,20 @@ +import pytest + +from vllm import LLM + +from .registry import HF_EXAMPLE_MODELS + + +@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) +def test_model_can_initialize(model_arch): + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + + if not model_info.is_available_online: + pytest.skip("Model is not available 
online") + + try: + LLM(model_info.default, + trust_remote_code=model_info.trust_remote_code, + load_format="dummy") + except Exception: + raise From a9644a3493a453f0f1100029a17fc6c7e79a4d69 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 06:50:29 +0000 Subject: [PATCH 05/18] Trigger ci Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 8a0373e851a28..0815f75a02b3e 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -6,7 +6,7 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) -def test_model_can_initialize(model_arch): +def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: From 9e6103ff3d7f21f4e3cf1c64ca341096d04f395b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 08:31:41 +0000 Subject: [PATCH 06/18] Fix wrong models Signed-off-by: DarkLight1337 --- tests/models/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index e40fca047bacd..685d46dc48938 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -108,8 +108,8 @@ class _HfExamplesInfo: _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] - "BertModel": _HfExamplesInfo("google-bert/bert-base"), - "Gemma2Model": _HfExamplesInfo("gBAAI/bge-multilingual-gemma2"), + "BertModel": _HfExamplesInfo("google-bert/bert-base-uncased"), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 From f69a47bb4fee34473eed39da1b7ebf55c3c59578 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:01:54 +0000 Subject: [PATCH 07/18] Avoid OOM Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 0815f75a02b3e..f143faa675806 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -8,13 +8,28 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if not model_info.is_available_online: pytest.skip("Model is not available online") + # Avoid OOM + text_config_overrides = { + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 2, + "num_experts_per_tok": 2, + "num_local_experts": 2, + } + try: - LLM(model_info.default, + LLM( + model_info.default, trust_remote_code=model_info.trust_remote_code, - load_format="dummy") + max_model_len=512, + max_num_seqs=1, + load_format="dummy", + hf_overrides={ + **text_config_overrides, + "text_config": text_config_overrides, # For multi-modal models + }) except Exception: raise From 7153190e376f9fbb7fc7e8509701af73631e74ae Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:02:46 +0000 Subject: [PATCH 08/18] Fix message Signed-off-by: DarkLight1337 --- tests/models/test_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/models/test_registry.py b/tests/models/test_registry.py index d59ae06d4e649..125c6854206d1 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -81,4 +81,4 @@ def test_hf_registry_coverage(): set(ModelRegistry.get_supported_archs())) assert not untested_archs, ("Please add the following architectures to " - f"`tests/models/registry.py: {untested_archs}") + f"`tests/models/registry.py`: {untested_archs}") From a801644fe65b7af900f8fe836b2fa23d9687d2a0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:05:52 +0000 Subject: [PATCH 09/18] format Signed-off-by: DarkLight1337 --- tests/models/test_registry.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 125c6854206d1..dbc415796ee55 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -80,5 +80,6 @@ def test_hf_registry_coverage(): untested_archs = (HF_EXAMPLE_MODELS.get_supported_archs() - set(ModelRegistry.get_supported_archs())) - assert not untested_archs, ("Please add the following architectures to " - f"`tests/models/registry.py`: {untested_archs}") + assert not untested_archs, ( + "Please add the following architectures to " + f"`tests/models/registry.py`: {untested_archs}") From 75587391ac62707d29ae28f813f2241464aa955e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:17:34 +0000 Subject: [PATCH 10/18] Avoid calling model.forward Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index f143faa675806..a703c32f0c109 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,3 +1,5 @@ +from unittest.mock import patch + import pytest from vllm import LLM @@ -20,16 +22,18 @@ def test_can_initialize(model_arch): "num_local_experts": 2, } - try: + def _initialize_kv_caches(self) -> None: + self.cache_config.num_gpu_blocks = 0 + self.cache_config.num_cpu_blocks = 0 + + with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", + _initialize_kv_caches): LLM( model_info.default, trust_remote_code=model_info.trust_remote_code, - max_model_len=512, - max_num_seqs=1, load_format="dummy", hf_overrides={ **text_config_overrides, "text_config": text_config_overrides, # For multi-modal models - }) - except Exception: - raise + }, + ) From 3873f836efc7729e0da80f0cb3ac7765fa5f2976 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:18:07 +0000 Subject: [PATCH 11/18] Add comment Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index a703c32f0c109..485cc47664afc 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -22,6 +22,7 @@ def test_can_initialize(model_arch): "num_local_experts": 2, } + # Avoid calling model.forward() def _initialize_kv_caches(self) -> None: self.cache_config.num_gpu_blocks = 0 self.cache_config.num_cpu_blocks = 0 From 3dae7a93c89282c3d0e7a8414cb0671cc1862a50 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 09:38:17 +0000 Subject: [PATCH 12/18] Enable conditional HF override Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 28 +++++++++++++--------- vllm/config.py | 36 
+++++++++++++++++++++-------- vllm/engine/arg_utils.py | 8 +++---- vllm/entrypoints/llm.py | 8 ++++--- 4 files changed, 52 insertions(+), 28 deletions(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 485cc47664afc..b8a3eec05510d 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pytest +from transformers import PretrainedConfig from vllm import LLM @@ -14,13 +15,21 @@ def test_can_initialize(model_arch): pytest.skip("Model is not available online") # Avoid OOM - text_config_overrides = { - "num_layers": 1, - "num_hidden_layers": 1, - "num_experts": 2, - "num_experts_per_tok": 2, - "num_local_experts": 2, - } + def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hasattr(hf_config, "text_config"): + text_config: PretrainedConfig = hf_config.text_config + else: + text_config = hf_config + + text_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 2, + "num_experts_per_tok": 2, + "num_local_experts": 2, + }) + + return text_config # Avoid calling model.forward() def _initialize_kv_caches(self) -> None: @@ -33,8 +42,5 @@ def _initialize_kv_caches(self) -> None: model_info.default, trust_remote_code=model_info.trust_remote_code, load_format="dummy", - hf_overrides={ - **text_config_overrides, - "text_config": text_config_overrides, # For multi-modal models - }, + hf_overrides=hf_overrides, ) diff --git a/vllm/config.py b/vllm/config.py index 002adb4316969..83b1483eb99e0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,8 +3,8 @@ import json import warnings from dataclasses import dataclass, field, replace -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, - Mapping, Optional, Set, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, + Literal, Mapping, Optional, Set, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -20,7 +20,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - print_warning_once) + identity, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -44,6 +44,9 @@ # "draft" is only used internally for speculative decoding _Task = Literal["generate", "embedding", "draft"] +HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], + PretrainedConfig]] + class ModelConfig: """Configuration for the model. @@ -115,7 +118,9 @@ class ModelConfig: can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. 
pooling_type: Used to configure the pooling method in the embedding @@ -164,7 +169,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, chat_template_text_format: str = "string", - hf_overrides: Optional[Dict[str, Any]] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, pooling_type: Optional[str] = None, pooling_norm: Optional[bool] = None, @@ -182,15 +187,23 @@ def __init__( if hf_overrides is None: hf_overrides = {} + + if callable(hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = hf_overrides + else: + hf_overrides_kw = hf_overrides + hf_overrides_fn = identity + if rope_scaling is not None: hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-scaling` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) if rope_theta is not None: hf_override = {"rope_theta": rope_theta} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-theta` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) @@ -207,9 +220,12 @@ def __init__( self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, config_format, - **hf_overrides) + + hf_config = get_config(self.model, trust_remote_code, revision, + code_revision, config_format, **hf_overrides_kw) + hf_config = hf_overrides_fn(hf_config) + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 31aa8c5908719..244aa09e12552 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,9 +9,9 @@ import vllm.envs as envs from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, + DeviceConfig, HfOverrides, LoadConfig, LoadFormat, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase @@ -128,7 +128,7 @@ class EngineArgs: code_revision: Optional[str] = None rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None - hf_overrides: Optional[Dict[str, Any]] = None + hf_overrides: Optional[HfOverrides] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a15dbd1c45119..63c2bb6097079 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,7 @@ from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.arg_utils import EngineArgs, HfOverrides, TaskOption from vllm.engine.llm_engine import 
LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, @@ -101,7 +101,9 @@ class LLM: disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` disable_async_output_proc: Disable async output processing. This may result in lower performance. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) @@ -156,7 +158,7 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, - hf_overrides: Optional[dict] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", From 361074679f081bfc8b738e2b16ad4ed3bc94f0d6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 10:16:25 +0000 Subject: [PATCH 13/18] Fix model loading Signed-off-by: DarkLight1337 --- tests/models/registry.py | 47 +++++++++++++++++++++-------- tests/models/test_initialization.py | 6 +++- 2 files changed, 40 insertions(+), 13 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 685d46dc48938..ec9ff52d112df 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,5 +1,5 @@ from dataclasses import dataclass, field -from typing import AbstractSet, Mapping +from typing import AbstractSet, Mapping, Optional @dataclass(frozen=True) @@ -10,6 +10,18 @@ class _HfExamplesInfo: extras: Mapping[str, str] = field(default_factory=dict) """Extra models to use for testing this architecture.""" + tokenizer: Optional[str] = None + """Set the tokenizer to load for this architecture.""" + + tokenizer_mode: str = "auto" + """Set the tokenizer type for this architecture.""" + + speculative_model: Optional[str] = None + """ + The default model to use for testing this architecture, which is only used + for speculative decoding. 
+ """ + is_available_online: bool = True """ Set this to ``False`` if the name of this architecture no longer exists on @@ -55,10 +67,12 @@ class _HfExamplesInfo: "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"), "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), - "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b"), + "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", + trust_remote_code=True), "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", - trust_remote_code=True), - "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B"), + trust_remote_code=True), + "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B", + trust_remote_code=True), "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), @@ -99,25 +113,30 @@ class _HfExamplesInfo: "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", + is_available_online=False, trust_remote_code=True), # [Encoder-decoder] "BartModel": _HfExamplesInfo("facebook/bart-base"), "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), - "Florence2ForConditionalGeneration": _HfExamplesInfo("Xenova/tiny-random-Florence2ForConditionalGeneration"), # noqa: E501 + # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer + # Therefore, we borrow the BartTokenizer from the original Bart model + "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 + tokenizer="facebook/bart-base", + trust_remote_code=True), # noqa: E501 } _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] - "BertModel": _HfExamplesInfo("google-bert/bert-base-uncased"), + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 # [Multimodal] - "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), # noqa: E501 + "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", trust_remote_code=True), - "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1") # noqa: E501, + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 } _MULTIMODAL_EXAMPLE_MODELS = { @@ -148,7 +167,8 @@ class _HfExamplesInfo: "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True), - "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409"), # noqa: E501 + "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 + tokenizer_mode="mistral"), "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 trust_remote_code=True), @@ -160,9 +180,12 @@ class _HfExamplesInfo: } _SPECULATIVE_DECODING_EXAMPLE_MODELS = { 
- "EAGLEModel": _HfExamplesInfo("abhigoyal/vllm-eagle-llama-68m-random"), - "MedusaModel": _HfExamplesInfo("abhigoyal/vllm-medusa-llama-68m-random"), - "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("ibm-fms/llama-160m-accelerator"), # noqa: E501 + "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501 + "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 + "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", + speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 } _EXAMPLE_MODELS = { diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index b8a3eec05510d..2577c83d37de8 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -29,7 +29,7 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: "num_local_experts": 2, }) - return text_config + return hf_config # Avoid calling model.forward() def _initialize_kv_caches(self) -> None: @@ -40,6 +40,10 @@ def _initialize_kv_caches(self) -> None: _initialize_kv_caches): LLM( model_info.default, + tokenizer=model_info.tokenizer, + tokenizer_mode=model_info.tokenizer_mode, + speculative_model=model_info.speculative_model, + num_speculative_tokens=1, trust_remote_code=model_info.trust_remote_code, load_format="dummy", hf_overrides=hf_overrides, From 122e5343bad3007f48786e42a31c1c3be80e2586 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 10:16:35 +0000 Subject: [PATCH 14/18] Fix MLP speculator Signed-off-by: DarkLight1337 --- vllm/model_executor/models/mlp_speculator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 6aa43f22f4c93..4d7e82880041d 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -65,7 +65,7 @@ class MLPSpeculator(nn.Module): https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite """ - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config self.n_predict = config.n_predict From 876b2bb8f88d2f2cea4186177f7647412ff3baa5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 10:16:41 +0000 Subject: [PATCH 15/18] Merge the tests Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 904d69f354f3b..baad54eaf6a91 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -313,7 +313,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 10min +- label: Basic Models Test # 30min source_file_dependencies: - vllm/ - tests/models @@ -321,13 +321,6 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_registry.py - -- label: Model Initialization Test # 1h - optional: true - source_file_dependencies: - - vllm/ - - tests/models - commands: - pytest -v -s models/test_initialization.py - label: Decoder-only Language Models Test (Standard) # 18min From f596632b3a8873a8d96812141a0f1d83e995d1f2 Mon Sep 17 00:00:00 2001 From: 
DarkLight1337 Date: Thu, 14 Nov 2024 10:19:59 +0000 Subject: [PATCH 16/18] Fix wrong file in docstring Signed-off-by: DarkLight1337 --- vllm/model_executor/models/registry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e11e1b419a2bd..67f6213a74597 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,7 +1,6 @@ """ Whenever you add an architecture to this page, please also update -`vllm/model_executor/models/registry.py` with example HuggingFace models -for that architecture. +`tests/models/registry.py` with example HuggingFace models for it. """ import importlib import os From 3678ac2098d7e54723dc5a867d8c4007f434e683 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Nov 2024 19:07:02 +0800 Subject: [PATCH 17/18] fix --- tests/models/test_initialization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 2577c83d37de8..3b728f2744fca 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -43,7 +43,7 @@ def _initialize_kv_caches(self) -> None: tokenizer=model_info.tokenizer, tokenizer_mode=model_info.tokenizer_mode, speculative_model=model_info.speculative_model, - num_speculative_tokens=1, + num_speculative_tokens=1 if model_info.speculative_model else None, trust_remote_code=model_info.trust_remote_code, load_format="dummy", hf_overrides=hf_overrides, From 1ac461938e25e2553d08e25577515f338dd0a186 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 12:47:28 +0000 Subject: [PATCH 18/18] Skip idefics3 Signed-off-by: DarkLight1337 --- tests/models/test_initialization.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 3b728f2744fca..b8312c2d9b7cc 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,6 +1,7 @@ from unittest.mock import patch import pytest +import transformers from transformers import PretrainedConfig from vllm import LLM @@ -10,6 +11,10 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): + if (model_arch == "Idefics3ForConditionalGeneration" + and transformers.__version__ < "4.46.0"): + pytest.skip(reason="Model introduced in HF >= 4.46.0") + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: pytest.skip("Model is not available online")
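
For reference, a minimal usage sketch of the callable `hf_overrides` API added in patch 12/18 ("Enable conditional HF override") and exercised by `test_can_initialize`; the model name and the specific override value below are illustrative placeholders, not taken from the series.

from transformers import PretrainedConfig

from vllm import LLM


def shrink_config(hf_config: PretrainedConfig) -> PretrainedConfig:
    # Mutate the HF config before vLLM builds the model, e.g. to keep it tiny.
    # The override value here is an illustrative assumption.
    hf_config.update({"num_hidden_layers": 1})
    return hf_config


llm = LLM(
    model="facebook/opt-125m",   # placeholder model, chosen only for illustration
    load_format="dummy",         # initialize dummy weights instead of downloading
    hf_overrides=shrink_config,  # callable form; a plain dict is still accepted
)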