Merge branch 'main' into deepseek-vl2
Isotr0py authored Jan 2, 2025
2 parents 0231368 + 8c38ee7 commit 12f4553
Showing 114 changed files with 3,978 additions and 2,476 deletions.
2 changes: 1 addition & 1 deletion .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -73,7 +73,7 @@ steps:
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: block-h100
depends_on: ~
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
29 changes: 21 additions & 8 deletions .buildkite/test-pipeline.yaml
@@ -106,14 +106,12 @@ steps:
source_file_dependencies:
- vllm/
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -333,8 +331,6 @@ steps:
- vllm/
- tests/models
commands:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py

@@ -360,7 +356,7 @@ steps:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'

- label: Multi-Modal Models Test (Standard) # 28min
- label: Multi-Modal Models Test (Standard) # 40min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -376,7 +372,7 @@ steps:
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model

- label: Multi-Modal Models Test (Extended) 1 # 1h16m
- label: Multi-Modal Models Test (Extended) 1 # 48m
optional: true
source_file_dependencies:
- vllm/
@@ -469,11 +465,28 @@ steps:
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py

- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
commands:
# begin platform plugin tests, all the code in-between runs on dummy platform
- pip install -e ./plugins/vllm_add_dummy_platform
- pytest -v -s plugins_tests/test_platform_plugins.py
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# other tests continue here:
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s models/test_oot_registration.py # it needs a clean process
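
The install/uninstall pattern above works because vLLM discovers plugins through Python entry points: installing the package exposes the dummy platform, and uninstalling it restores the default platform for the tests that follow. Below is a minimal sketch of how such a package might be wired up; the group name `vllm.platform_plugins` comes from the plugin docs updated later in this commit, while the module and function names are illustrative assumptions rather than the actual contents of `vllm_add_dummy_platform`.

```python
# setup.py for a hypothetical dummy platform plugin package. Only the
# entry-point group name "vllm.platform_plugins" is taken from the vLLM
# plugin docs; the module and function names are illustrative assumptions.
from setuptools import setup

setup(
    name="vllm_add_dummy_platform",
    version="0.1",
    packages=["vllm_add_dummy_platform"],
    entry_points={
        "vllm.platform_plugins": [
            # "<plugin name> = <module>:<function>"
            "register_dummy_platform = vllm_add_dummy_platform:register",
        ],
    },
)
```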

- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -550,7 +550,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
184 changes: 184 additions & 0 deletions benchmarks/benchmark_long_document_qa_throughput.py
@@ -0,0 +1,184 @@
"""
Offline benchmark to test the long document QA throughput.
Example usage:
# This command runs vLLM with 50 GB of CPU memory for offloading
# The workload samples 8 different prompts with a default input
# length of 20000 tokens, then replicates each prompt 2 times
# in random order.
python benchmark_long_document_qa_throughput.py \
--model meta-llama/Llama-2-7b-chat-hf \
--enable-prefix-caching \
--num-documents 8 \
--repeat-count 2
Command-line arguments:
--num-documents: The number of documents to sample prompts from.
--document-length: The length of each document in tokens.
(Optional, default: 20000)
--output-len: The number of tokens to generate for each prompt.
(Optional, default: 10)
--repeat-count: The number of times to repeat each prompt.
(Optional, default: 2)
--repeat-mode: The mode to repeat prompts. The supported modes are:
- 'random': shuffle the prompts randomly. (Default)
- 'tile': the entire prompt list is repeated in sequence. (Potentially
lowest cache hit)
- 'interleave': each prompt is repeated consecutively before
moving to the next element. (Highest cache hit)
--shuffle-seed: Random seed when the repeat mode is "random".
(Optional, default: 0)
In addition, the script accepts all vLLM engine arguments used to initialize the
LLM engine. Refer to `vllm.engine.arg_utils.EngineArgs` for more details.
"""

import dataclasses
import random
import time

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
"""
Test long document QA with the given prompts and sampling parameters.
Print the time spent in processing all the prompts.
Args:
llm: The language model used for generating responses.
sampling_params: Sampling parameter used to generate the response.
prompts: A list of prompt strings to be processed by the LLM.
"""
start_time = time.time()
llm.generate(prompts, sampling_params=sampling_params)
end_time = time.time()
print(f"Time to execute all requests: {end_time - start_time:.4f} secs")


def repeat_prompts(prompts, repeat_count, mode: str):
"""
Repeat each prompt in the list a specified number of times.
The order of prompts in the output list depends on the mode.
Args:
prompts: A list of prompts to be repeated.
repeat_count: The number of times each prompt is repeated.
mode: The mode of repetition. Supported modes are:
- 'random': Shuffle the prompts randomly after repetition.
- 'tile': Repeat the entire prompt list in sequence.
Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
- 'interleave': Repeat each prompt consecutively before moving to
the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].
Returns:
A list of repeated prompts in the specified order.
Raises:
ValueError: If an invalid mode is provided.
"""
print("Repeat mode: ", mode)
if mode == 'random':
repeated_prompts = prompts * repeat_count
random.shuffle(repeated_prompts)
return repeated_prompts
elif mode == 'tile':
return prompts * repeat_count
elif mode == 'interleave':
repeated_prompts = []
for prompt in prompts:
repeated_prompts.extend([prompt] * repeat_count)
return repeated_prompts
else:
raise ValueError(f"Invalid mode: {mode}, only support "
"'random', 'tile', 'interleave'")


def main(args):
random.seed(args.shuffle_seed)

# Prepare the prompts:
# we prepend the document id so that no document is a prefix
# of another document
prompts = [
str(i) + ' '.join(['hi'] * args.document_length)
for i in range(args.num_documents)
]

prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

warmup_prompts = [
"This is warm up request " + str(i) + \
' '.join(['hi'] * args.document_length)
for i in range(args.num_documents)]

# Create the LLM engine
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)

print("------warm up------")
test_long_document_qa(
llm=llm,
prompts=warmup_prompts,
sampling_params=sampling_params,
)

print("------start generating------")
test_long_document_qa(
llm=llm,
prompts=prompts,
sampling_params=sampling_params,
)


if __name__ == "__main__":
parser = FlexibleArgumentParser(
description=
'Benchmark long-document QA throughput with or without automatic prefix caching.')

parser.add_argument(
'--document-length',
type=int,
# Roughly the number of tokens for a system paper,
# excluding images
default=20000,
help='Length of each document in tokens.')

parser.add_argument('--num-documents',
type=int,
default=8,
help='Number of documents to sample prompts from.')

parser.add_argument('--output-len', type=int, default=10,
                    help='Number of tokens to generate for each prompt')

parser.add_argument('--repeat-count',
type=int,
default=2,
help='Number of times to repeat each prompt')

parser.add_argument("--repeat-mode",
type=str,
default='random',
help='The mode to repeat prompts. The supported '
'modes are "random", "tile", and "interleave". '
'See repeat_prompts() in the source code for details.')

parser.add_argument("--shuffle-seed",
type=int,
default=0,
help='Random seed when the repeat mode is "random"')

parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args)
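
For a quick sense of how the three `--repeat-mode` options order prompts, here is a hypothetical interactive check of the `repeat_prompts` helper above, using toy strings instead of the 20000-token documents and assuming the script is importable (for example, when run from the `benchmarks/` directory):

```python
# Illustrative only: toy prompts instead of the generated documents.
from benchmark_long_document_qa_throughput import repeat_prompts

prompts = ["doc0", "doc1", "doc2"]

print(repeat_prompts(prompts, 2, mode="tile"))
# ['doc0', 'doc1', 'doc2', 'doc0', 'doc1', 'doc2']

print(repeat_prompts(prompts, 2, mode="interleave"))
# ['doc0', 'doc0', 'doc1', 'doc1', 'doc2', 'doc2']

print(repeat_prompts(prompts, 2, mode="random"))
# The same six elements in a shuffled order; the benchmark seeds this
# shuffle via --shuffle-seed before calling repeat_prompts.
```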
6 changes: 4 additions & 2 deletions docs/source/design/plugin_system.md
@@ -41,9 +41,11 @@ Every plugin has three parts:
2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name.
3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module.

## What Can Plugins Do?
## Types of supported plugins

Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM.
- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function.

- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported.
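
As an illustration of the two plugin types above, a minimal sketch of the corresponding registration functions could look like the following. Only `ModelRegistry.register_model` and the return-the-class-name-or-`None` convention come from the text above; the package, module, and class names are placeholders, and each function would normally live in its own plugin package referenced from the matching entry-point group.

```python
# Hypothetical plugin functions; all names below are placeholders.

def register():
    """General plugin (group "vllm.general_plugins"): register an
    out-of-tree model under a new architecture name."""
    from vllm import ModelRegistry

    # The string form lets vLLM import the model class lazily.
    ModelRegistry.register_model(
        "MyLlava",
        "vllm_add_dummy_model.my_llava:MyLlavaForConditionalGeneration")


def register_platform():
    """Platform plugin (group "vllm.platform_plugins"): return the fully
    qualified platform class name, or None when this environment does not
    support the platform."""
    try:
        import my_accelerator_sdk  # noqa: F401  (placeholder availability check)
    except ImportError:
        return None
    return "vllm_add_dummy_platform.my_platform.MyPlatform"
```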

## Guidelines for Writing Plugins

14 changes: 7 additions & 7 deletions docs/source/models/supported_models.md
@@ -566,25 +566,25 @@ See [this page](#generative-models) for more information on how to use generativ
- [V1](gh-issue:8779)
* - `AriaForConditionalGeneration`
- Aria
- T + I
- T + I<sup>+</sup>
- `rhymes-ai/Aria`
-
- ✅︎
-
- ✅︎
* - `Blip2ForConditionalGeneration`
- BLIP-2
- T + I<sup>E</sup>
- `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
-
- ✅︎
-
- ✅︎
* - `ChameleonForConditionalGeneration`
- Chameleon
- T + I
- `facebook/chameleon-7b` etc.
-
- ✅︎
-
- ✅︎
* - `DeepseekVLV2ForCausalLM`
- DeepSeek-VL2
- T + I
@@ -598,7 +598,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `adept/fuyu-8b` etc.
-
- ✅︎
-
- ✅︎
* - `ChatGLMModel`
- GLM-4V
- T + I
@@ -640,7 +640,7 @@ See [this page](#generative-models) for more information on how to use generativ
- `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
- ✅︎
-
- ✅︎
* - `LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- T + V
@@ -673,7 +673,7 @@ See [this page](#generative-models) for more information on how to use generativ
- Molmo
- T + I
- `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc.
-
- ✅︎
- ✅︎
- ✅︎
* - `NVLM_D_Model`
10 changes: 8 additions & 2 deletions examples/offline_inference_vision_language.py
@@ -24,10 +24,13 @@ def run_aria(question: str, modality: str):
assert modality == "image"
model_name = "rhymes-ai/Aria"

# NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM(model=model_name,
tokenizer_mode="slow",
trust_remote_code=True,
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)

prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
@@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str):
prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
stop_token_ids = None
return llm, prompt, stop_token_ids
@@ -275,7 +279,7 @@ def run_minicpmv(question: str, modality: str):
# 2.5
# model_name = "openbmb/MiniCPM-Llama3-V-2_5"

#2.6
# 2.6
model_name = "openbmb/MiniCPM-V-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
@@ -448,9 +452,11 @@ def run_pixtral_hf(question: str, modality: str):

model_name = "mistral-community/pixtral-12b"

# NOTE: Need L40 (or equivalent) to avoid OOM
llm = LLM(
model=model_name,
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
)

