From 2464b30ce179617a28313afd9e601058eaf5e948 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:13:16 +0000 Subject: [PATCH 01/15] Introduce Qwen2 embedding model Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 13 +- .../embedding/language/test_embedding.py | 2 + vllm/model_executor/models/qwen2.py | 116 ++++++++++++++++-- vllm/model_executor/models/qwen2_cls.py | 14 +-- vllm/model_executor/models/qwen2_rm.py | 15 +-- vllm/model_executor/models/registry.py | 1 + 6 files changed, 120 insertions(+), 41 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5a474043078db..0a71237ef4190 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -330,11 +330,16 @@ Text Embedding - :code:`BAAI/bge-multilingual-gemma2`, etc. - - ✅︎ - * - :code:`MistralModel` - - Mistral-based + * - :code:`LlamaModel`, :code:`MistralModel` + - Llama-based, Mistral-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - ✅︎ + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. @@ -355,7 +360,7 @@ Reward Modeling * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - + - ✅︎ - ✅︎ .. note:: @@ -376,7 +381,7 @@ Classification * - :code:`Qwen2ForSequenceClassification` - Qwen2-based - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - + - ✅︎ - ✅︎ .. note:: diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 39b6bbaf43180..c4db053a8f5e3 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -11,6 +11,8 @@ "intfloat/e5-mistral-7b-instruct", "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", + "ssmits/Qwen2-7B-Instruct-embed-base", + "Alibaba-NLP/gte-Qwen2-1.5B-instruct", ] ENCODER_ONLY = [ diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b0156a25ca5cf..fe4078675404c 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -37,6 +37,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -44,8 +45,9 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -248,6 +250,19 @@ def __init__( prefix: str = "", ) -> None: super().__init__() + + # TODO (@robertgshaw2): see if this can be moved out + if (cache_config.sliding_window is not None + and hasattr(config, "max_window_layers")): + raise ValueError("Sliding window for some but all layers is not " + "supported. This model uses sliding window " + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + )) + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -413,17 +428,7 @@ def __init__( cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) + pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -445,6 +450,15 @@ def __init__( self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -477,6 +491,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, @@ -484,3 +505,74 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if self.config.tie_word_embeddings else None), ) loader.load_weights(weights) + + +class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = Qwen2Model(config, cache_config, quant_config) + + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + return self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self, + ignore_unexpected_prefixes=["lm_head."]) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 25ecf76e35f22..d00bdfd742108 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -17,10 +17,11 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .interfaces import SupportsLoRA, SupportsPP from .utils import AutoWeightsLoader -class Qwen2ForSequenceClassification(nn.Module): +class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -54,17 +55,6 @@ def __init__( quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 1f9411241bdd6..d9a059eec9ae6 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -16,7 +16,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsPP): +class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -66,17 +66,6 @@ def __init__( quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..34f34811e75ae 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -104,6 +104,7 @@ }, "MistralModel": ("llama", "LlamaEmbeddingModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), + "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] From 617ec86c91565de495d25c5b97b63c00ad58e66e Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:19:05 +0000 Subject: [PATCH 02/15] Update docs Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 0a71237ef4190..e47c1e56e16d1 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -330,8 +330,8 @@ Text Embedding - :code:`BAAI/bge-multilingual-gemma2`, etc. - - ✅︎ - * - :code:`LlamaModel`, :code:`MistralModel` - - Llama-based, Mistral-based + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. + - Llama-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - ✅︎ - ✅︎ From 136786dfcccbc8c74e9a22c7d43de26f81ef37e0 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:19:19 +0000 Subject: [PATCH 03/15] format Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index fe4078675404c..b83b6e221b59b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -562,8 +562,8 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, ) -> torch.Tensor: - return self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + return self.model(input_ids, positions, kv_caches, attn_metadata, + intermediate_tensors) def pooler( self, From 4974a491178b6c06d39bff9d5b7f1ab4947d6731 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:40:32 +0000 Subject: [PATCH 04/15] Fix default pooling Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b83b6e221b59b..34139f3989d4b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -550,7 +550,7 @@ def __init__( self._pooler = Pooler.from_config_with_defaults( pooler_config, - pooling_type=PoolingType.LAST, + pooling_type=PoolingType.MEAN, normalize=True, softmax=False) From e7735732249dfd5b75b771b7f541bdfe95a1d4a4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:49:37 +0000 Subject: [PATCH 05/15] Fix tests Signed-off-by: DarkLight1337 --- .../decoder_only/language/test_jamba.py | 18 ++++------------ .../decoder_only/language/test_mamba.py | 18 ++++------------ .../embedding/language/test_cls_models.py | 21 ++++++------------- .../embedding/language/test_embedding.py | 13 ++++++++---- 4 files changed, 23 insertions(+), 47 deletions(-) diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 384ec77e5455a..6542689c3f277 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,6 +33,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -293,17 +297,3 @@ def test_jamba_distributed_produces_identical_generation( name_0="vllm_tp_1", name_1="vllm_tp_2", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 2dc231c595ffa..78eab8d5354fd 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,6 +51,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -279,17 +283,3 @@ def test_state_cleanup( except ValueError: pytest.fail("Mamba inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index d8ca6d361f0e3..fb1ad7dae5c55 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -26,8 +26,13 @@ def test_classification_models( auto_cls=AutoModelForSequenceClassification) as hf_model: hf_outputs = hf_model.classify(example_prompts) - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, task="embedding", dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) + print(hf_outputs, vllm_outputs) @@ -37,17 +42,3 @@ def test_classification_models( vllm_output = torch.tensor(vllm_output) assert torch.allclose(hf_output, vllm_output, 1e-3) - - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_classification_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index c4db053a8f5e3..6a8f966378c18 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -8,9 +8,9 @@ # Model, Guard MODELS = [ - "intfloat/e5-mistral-7b-instruct", - "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2", + # "intfloat/e5-mistral-7b-instruct", + # "BAAI/bge-base-en-v1.5", + # "BAAI/bge-multilingual-gemma2", "ssmits/Qwen2-7B-Instruct-embed-base", "Alibaba-NLP/gte-Qwen2-1.5B-instruct", ] @@ -45,8 +45,13 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model: + with vllm_runner(model, task="embedding", dtype=dtype, + max_model_len=None) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) check_embeddings_close( embeddings_0_lst=hf_outputs, From 369a66a1dd0c879c07eaa2edb6f3b1950eb14744 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 15:53:20 +0000 Subject: [PATCH 06/15] format Signed-off-by: DarkLight1337 --- tests/models/embedding/language/test_cls_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index fb1ad7dae5c55..fdcf46ecf5df7 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -33,7 +33,6 @@ def test_classification_models( print(vllm_model.model.llm_engine.model_executor.driver_worker. model_runner.model) - print(hf_outputs, vllm_outputs) # check logits difference From 8db9fa82dd6e0f78a47fbf62dc141c3799caae51 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sat, 9 Nov 2024 16:27:18 +0000 Subject: [PATCH 07/15] Fix test Signed-off-by: DarkLight1337 --- vllm/model_executor/models/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 34f34811e75ae..6ffa06318a22b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -105,6 +105,7 @@ "MistralModel": ("llama", "LlamaEmbeddingModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), + "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] From e3e2422084a22f8d75b467b63edb4fe1c27bb32f Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Mon, 11 Nov 2024 07:17:50 +0000 Subject: [PATCH 08/15] lint Signed-off-by: DarkLight1337 --- vllm/model_executor/models/qwen2_rm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 525f9fbe6a3ca..55843d8325348 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -58,7 +58,6 @@ class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config From a7e26d134b10353e6f060487823e0bf373def466 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Nov 2024 17:58:03 +0000 Subject: [PATCH 09/15] Select tests using model flags Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 6 +- .buildkite/run-cpu-test.sh | 6 +- .buildkite/test-pipeline.yaml | 38 +++++----- .../decoder_only/language/test_models.py | 71 ++++++++++++------- .../embedding/language/test_cls_models.py | 10 ++- .../embedding/language/test_embedding.py | 34 ++++----- .../vision_language/test_llava_next.py | 2 + .../embedding/vision_language/test_phi3v.py | 2 + .../encoder_decoder/language/test_bart.py | 9 ++- .../vision_language/test_mllama.py | 3 + 10 files changed, 107 insertions(+), 74 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 79526adef2a79..5d7a0bff90963 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -27,9 +27,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index b3771bb268e22..efe2abd5e9b62 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -38,9 +38,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fbaa427bb7270..f57fea7902392 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -322,62 +322,64 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test (Standard) # 18min +- label: Language Models Test (Standard) # 25min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - pytest -v -s models/decoder_only/language -m core_model - pytest -v -s models/decoder_only/language -m quant_model + - pytest -v -s models/embedding/language -m core_model + - pytest -v -s models/embedding/vision_language -m core_model -- label: Decoder-only Language Models Test (Extended) # 46min +- label: Language Models Test (Extended) # 50min nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' -- label: Decoder-only Multi-Modal Models Test (Standard) # 22min +- label: Multi-Modal Models Test (Standard) # 25min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/decoder_only/audio_language -m core_model - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model # No tests under this group for now # - pytest -v -s models/decoder_only/audio_language -m quant_model - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m +- label: Multi-Modal Models Test (Extended) # 1h10m nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - -- label: Other Models Test # 20min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/embedding/language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/language - - tests/models/encoder_decoder/vision_language - commands: - - pytest -v -s models/embedding/language - - pytest -v -s models/embedding/vision_language - - pytest -v -s models/encoder_decoder/language - - pytest -v -s models/encoder_decoder/vision_language + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index beb1ffb18436e..d6688a5982b1b 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -4,37 +4,54 @@ """ import pytest -from vllm.platforms import current_platform - from ...utils import check_logprobs_close -MODELS = [ - "facebook/opt-125m", # opt - "openai-community/gpt2", # gpt2 - # "Milos/slovak-gpt-j-405M", # gptj - # "bigcode/tiny_starcoder_py", # gpt_bigcode - # "EleutherAI/pythia-70m", # gpt_neox - "bigscience/bloom-560m", # bloom - testing alibi slopes - "microsoft/phi-2", # phi - # "stabilityai/stablelm-3b-4e1t", # stablelm - # "bigcode/starcoder2-3b", # starcoder2 - "google/gemma-1.1-2b-it", # gemma - "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 - "meta-llama/Llama-3.2-1B-Instruct", # llama -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - ] - -target_dtype = "half" - @pytest.mark.core_model -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.cpu_model +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "facebook/opt-125m", # opt + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openai-community/gpt2", # gpt2 + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param("Milos/slovak-gpt-j-405M"), # gptj + pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode + pytest.param("EleutherAI/pythia-70m"), # gpt_neox + pytest.param( + "bigscience/bloom-560m", # bloom - testing alibi slopes + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "microsoft/phi-2", # phi + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm + pytest.param("bigcode/starcoder2-3b"), # starcoder2 + pytest.param( + "google/gemma-1.1-2b-it", # gemma + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", # llama + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openbmb/MiniCPM3-4B", + # fused_moe not supported on CPU + marks=[pytest.mark.core_model], + ), + ]) +@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models( diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index fdcf46ecf5df7..6c22586087857 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -9,10 +9,14 @@ import torch from transformers import AutoModelForSequenceClassification -CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"] - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("jason9693/Qwen2.5-1.5B-apeach", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + ], +) @pytest.mark.parametrize("dtype", ["float"]) def test_classification_models( hf_runner, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 0345034e08d27..c07da6213688b 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,25 +4,24 @@ """ import pytest -from vllm.utils import current_platform - from ..utils import check_embeddings_close -# Model, Guard -MODELS = [ - # "intfloat/e5-mistral-7b-instruct", - # "BAAI/bge-base-en-v1.5", - # "BAAI/bge-multilingual-gemma2", - "ssmits/Qwen2-7B-Instruct-embed-base", - "Alibaba-NLP/gte-Qwen2-1.5B-instruct", -] - -ENCODER_ONLY = [ - "BAAI/bge-base-en-v1.5", -] - -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Encoder-decoder] + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model]), + pytest.param("BAAI/bge-multilingual-gemma2", + marks=[pytest.mark.core_model]), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + ], +) @pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, @@ -31,9 +30,6 @@ def test_models( model, dtype: str, ) -> None: - if model not in ENCODER_ONLY and current_platform.is_cpu(): - pytest.skip("Skip large embedding models test on CPU.") - # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 9fab5898a06ba..329c6ba279f89 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -88,6 +88,7 @@ def _run_test( @pytest.mark.skipif(transformers.__version__.startswith("4.46"), reason="Model broken with changes in transformers 4.46") +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -112,6 +113,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index ee411472ba284..6145aff1a5ea2 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -74,6 +74,7 @@ def _run_test( ) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -98,6 +99,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 8e8862fadbf04..8fc4250c449bf 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -170,7 +170,14 @@ def run_test( ) -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("facebook/bart-base", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("facebook/bart-large-cnn"), + ], +) @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index a3b1c0950d9a2..77dd1d81f84d7 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -233,6 +233,7 @@ def clear_cache(): @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -278,6 +279,7 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -326,6 +328,7 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) From 7e789f47598d433079efa1e9099dbda5100df486 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Nov 2024 17:58:46 +0000 Subject: [PATCH 10/15] Update timing Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f57fea7902392..e42846498ca89 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -364,7 +364,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) # 1h10m +- label: Multi-Modal Models Test (Extended) # 1h15m nightly: true source_file_dependencies: - vllm/ From 1048c04882c7e7d1babef75472a56132d529eff7 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 13 Nov 2024 18:02:57 +0000 Subject: [PATCH 11/15] Combine commands Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e42846498ca89..541c14c253590 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -330,8 +330,7 @@ steps: - tests/models/embedding/language - tests/models/encoder_decoder/language commands: - - pytest -v -s models/decoder_only/language -m core_model - - pytest -v -s models/decoder_only/language -m quant_model + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - pytest -v -s models/embedding/language -m core_model - pytest -v -s models/embedding/vision_language -m core_model @@ -356,11 +355,8 @@ steps: - tests/models/embedding/vision_language - tests/models/encoder_decoder/vision_language commands: - - pytest -v -s models/decoder_only/audio_language -m core_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model - # No tests under this group for now - # - pytest -v -s models/decoder_only/audio_language -m quant_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model From f183929a6787a9a5222c38eecec802714365a777 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 14 Nov 2024 03:12:06 +0000 Subject: [PATCH 12/15] Update timings Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 541c14c253590..5988c6c801da7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -322,7 +322,7 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Language Models Test (Standard) # 25min +- label: Language Models Test (Standard) # 42min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -346,7 +346,7 @@ steps: - pytest -v -s models/embedding/language -m 'not core_model' - pytest -v -s models/embedding/vision_language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 25min +- label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ From f64f560d13934dd2a701e8f1a8d4daca410ef659 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 15 Nov 2024 00:22:17 +0000 Subject: [PATCH 13/15] Update test distribution Signed-off-by: DarkLight1337 --- .../decoder_only/language/test_models.py | 26 +++++++++---------- .../embedding/language/test_embedding.py | 2 +- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index d6688a5982b1b..2a7ed8826d2f3 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -7,13 +7,11 @@ from ...utils import check_logprobs_close -@pytest.mark.core_model -@pytest.mark.cpu_model @pytest.mark.parametrize( "model", [ pytest.param( - "facebook/opt-125m", # opt + "bigscience/bloom-560m", # bloom - testing alibi slopes marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( @@ -24,32 +22,32 @@ pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode pytest.param("EleutherAI/pythia-70m"), # gpt_neox pytest.param( - "bigscience/bloom-560m", # bloom - testing alibi slopes + "google/gemma-1.1-2b-it", # gemma marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( - "microsoft/phi-2", # phi + "meta-llama/Llama-3.2-1B-Instruct", # llama marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), - pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm - pytest.param("bigcode/starcoder2-3b"), # starcoder2 pytest.param( - "google/gemma-1.1-2b-it", # gemma - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + "openbmb/MiniCPM3-4B", + # fused_moe not supported on CPU + marks=[pytest.mark.core_model], ), pytest.param( - "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + "facebook/opt-125m", # opt marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), pytest.param( - "meta-llama/Llama-3.2-1B-Instruct", # llama - marks=[pytest.mark.core_model, pytest.mark.cpu_model], + "microsoft/phi-2", # phi + marks=[pytest.mark.core_model], ), pytest.param( - "openbmb/MiniCPM3-4B", - # fused_moe not supported on CPU + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 marks=[pytest.mark.core_model], ), + pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm + pytest.param("bigcode/starcoder2-3b"), # starcoder2 ]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 518ed0cb187e7..c3f351ef707be 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -16,7 +16,7 @@ pytest.param("intfloat/multilingual-e5-large"), # [Encoder-decoder] pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model]), + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), From 153f2341498e86fec995933f2514a9942814e275 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 15 Nov 2024 01:22:58 +0000 Subject: [PATCH 14/15] Fix coverage test Signed-off-by: DarkLight1337 --- tests/models/registry.py | 4 ++++ tests/models/test_registry.py | 4 ++-- vllm/model_executor/models/registry.py | 7 ++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index ec9ff52d112df..3848367b6126c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -129,9 +129,13 @@ class _HfExamplesInfo: # [Text-only] "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), + "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), + "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"), # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index dbc415796ee55..e462dae3dc688 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -77,8 +77,8 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): def test_hf_registry_coverage(): - untested_archs = (HF_EXAMPLE_MODELS.get_supported_archs() - - set(ModelRegistry.get_supported_archs())) + untested_archs = (ModelRegistry.get_supported_archs() - + HF_EXAMPLE_MODELS.get_supported_archs()) assert not untested_archs, ( "Please add the following architectures to " diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2d0883da727bd..22c2e328bfb65 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -11,7 +11,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from functools import lru_cache -from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import cloudpickle import torch.nn as nn @@ -303,8 +304,8 @@ class _ModelRegistry: # Keyed by model_arch models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> List[str]: - return list(self.models.keys()) + def get_supported_archs(self) -> AbstractSet[str]: + return self.models.keys() def register_model( self, From 319867b888dfd644c8493ca9f352bf5d11bd0deb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 15 Nov 2024 01:42:33 +0000 Subject: [PATCH 15/15] Remove unused models Signed-off-by: DarkLight1337 --- tests/models/encoder_decoder/language/test_bart.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 8fc4250c449bf..10aba8427944f 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -14,8 +14,6 @@ from ....utils import multi_gpu_test from ...utils import check_logprobs_close -MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]],