diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 3db77d5f16022..64ba1b32fb074 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -21,7 +21,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh resources: @@ -51,7 +51,7 @@ steps: queue: H200 plugins: - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -65,13 +65,18 @@ steps: - VLLM_USAGE_SOURCE - HF_TOKEN + - block: "Run H100 Benchmark" + key: block-h100 + depends_on: ~ + - label: "H100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 + depends_on: block-h100 plugins: - docker#v5.12.0: - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index 19f7160e68a4d..aa0f7ade808e0 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -1,6 +1,6 @@ #!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) -URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" +TOKEN=$(curl -s -L 
"https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" TIMEOUT_SECONDS=10 diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index f78e360b7afd3..93e118fb3eab8 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -1,7 +1,7 @@ steps: - label: "Build wheel - CUDA 12.1" agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" @@ -18,7 +18,7 @@ steps: - label: "Build wheel - CUDA 11.8" # depends_on: block-build-cu118-wheel agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" @@ -26,3 +26,16 @@ steps: - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + + - block: "Build release image" + depends_on: ~ + key: block-release-image-build + + - label: "Build release image" + depends_on: block-release-image-build + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." 
+ - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index faeac8e2ded36..e0a12afbe7320 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -12,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image and launch offline inference -docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py +# Run the image and test offline inference/tensor parallel +docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' + python3 examples/offline_inference.py + python3 examples/offline_inference_cli.py -tp 2 +' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index f5591f1098534..8f57006214c88 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -50,9 +50,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker - - tests/test_lazy_torch_compile.py + - tests/standalone_tests/lazy_torch_compile.py commands: - - python3 test_lazy_torch_compile.py + - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py @@ -61,6 +61,13 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker +- label: Python-only Installation Test + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true @@ -230,7 +237,7 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB 
--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py parallelism: 4 - label: "PyTorch Fullgraph Smoke Test" # 9min @@ -355,6 +362,7 @@ steps: - tests/models/embedding/vision_language - tests/models/encoder_decoder/vision_language commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model @@ -370,6 +378,7 @@ steps: - tests/models/embedding/vision_language - tests/models/encoder_decoder/vision_language commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 @@ -481,7 +490,6 @@ steps: - label: LoRA TP Test (Distributed) num_gpus: 4 - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 76dbd4c04d3f3..77162bc82de62 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -1,5 +1,6 @@ # default base image -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.0-ubuntu20.04" +# https://gallery.ecr.aws/neuron/pytorch-inference-neuronx +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04" FROM $BASE_IMAGE diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 
c3fed56e8a956..b67849038cf0d 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -24,6 +24,7 @@ class RequestFuncInput: model: str best_of: int = 1 logprobs: Optional[int] = None + extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -36,6 +37,7 @@ class RequestFuncOutput: ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies + tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -242,6 +244,8 @@ async def async_request_openai_completions( "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } @@ -336,6 +340,8 @@ async def async_request_openai_chat_completions( "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py new file mode 100644 index 0000000000000..1a0e62598bfcb --- /dev/null +++ b/benchmarks/benchmark_guided.py @@ -0,0 +1,494 @@ +"""Benchmark guided decoding throughput.""" +import argparse +import dataclasses +import json +import os +import random +import time +from typing import List + +import datasets +import pandas as pd +import uvloop +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) +from vllm.sampling_params import GuidedDecodingParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: 
+ """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + schema: The guided-decoding constraint (JSON schema, grammar, regex + or choice list) whose kind is named by structure_type. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str = 'json' + completion: str = None + + +def run_vllm(requests: List[SampleRequest], + engine_args: EngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False) -> float: + from vllm import LLM, SamplingParams + llm = LLM(**vars(engine_args)) + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + # randomly pick the indices of the requests that use guided decoding + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if guided_decoding_rate > 0 else None, + )) + llm.generate(prompts, sampling_params, use_tqdm=False) + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + **{request.structure_type: request.schema}) + if i in guided_decoding_req_idx else None, +
)) + + start = time.perf_counter() + outputs = llm.generate(prompts, sampling_params, use_tqdm=False) + ret = [] + for output, request in zip(outputs, requests): + generated_text = output.outputs[0].text + ret.append({ + "generated": generated_text, + "expected": request.completion + }) + end = time.perf_counter() + return end - start, ret + + +async def run_vllm_async( + requests: List[SampleRequest], + engine_args: AsyncEngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False, + disable_frontend_multiprocessing: bool = False) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + json=request.schema) + if guided_decoding_rate > 0 else None, + )) + generators = [] + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + 
max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if i in guided_decoding_req_idx else None, + )) + + generators = [] + start_time = [] + latencies = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + start_time.append(time.perf_counter()) + latencies.append([]) + all_gens = merge_async_iterators(*generators) + generated_texts = [''] * len(requests) + async for i, res in all_gens: + generated_texts[i] = res.outputs[0].text + lat = time.perf_counter() - start_time[i] + latencies[i].append(lat) + ret = [{ + 'generated': gt, + 'expected': req.completion + } for gt, req in zip(generated_texts, requests)] + end = time.perf_counter() + first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) + next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 + for lat in latencies]) + return end - start, ret, (first_latency, next_latency) + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " 
FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" 
+ input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + args.warmup = False + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + completion=completion)) + + return requests + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = 
+ _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # async engine is disabled for 'grammar', 'regex' and 'choice' + if args.dataset == 'grammar': + args.structure_type = 'grammar' + args.async_engine = False + elif args.dataset == 'regex': + args.structure_type = 'regex' + args.async_engine = False + elif args.dataset == 'choice': + args.structure_type = 'choice' + args.async_engine = False + else: + args.structure_type = 'json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += f"_async{args.async_engine}" + result_file_name += f"_warmup{args.warmup}" + result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" + result_file_name += ".txt" + else: + result_file_name = None + + # Load the tokenizer and build the benchmark requests.
+ tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + requests = sample_requests(tokenizer, args) + + if args.async_engine: + engine_args = AsyncEngineArgs.from_cli_args(args) + elapsed_time, ret, (first_latency, next_latency) = uvloop.run( + run_vllm_async(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup, + args.disable_frontend_multiprocessing)) + else: + engine_args = EngineArgs.from_cli_args(args) + elapsed_time, ret = run_vllm(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup) + first_latency, next_latency = None, None + + score = evaluate(ret, args) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if first_latency is not None: + latency_breakdown = "\nFirst token latency(msecs):\n" + latency_breakdown += f"{first_latency.describe()}" + latency_breakdown += "\nNext token latency(msecs):\n" + latency_breakdown += f"{next_latency.describe()}" + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s", + f"Correct rate is {score} %", + f"{latency_breakdown if first_latency is not None else ''}") + + # Output JSON results if specified + if args.output_json or result_file_name: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "total_output_tokens": total_output_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", + "output_tokens_per_second": + f"{total_output_tokens / elapsed_time:.2f}", + "correct_rate(%)": score + } + results = {"outputs": ret, **results} + if first_latency is not None: + results["first_token_latency(msecs)"] = first_latency.describe( 
+ ).to_dict() + results["next_token_latency(msecs)"] = next_latency.describe( + ).to_dict() + if args.output_json: + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + elif result_file_name: + with open(result_file_name, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark guided decoding.") + parser = AsyncEngineArgs.add_cli_args(parser) + + parser.add_argument("--output-len", + type=int, + default=512, + help="Output length for each request. Overrides the " + "output length from the dataset.") + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=10, + help="Number of prompts to process.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") + parser.add_argument("--warmup", + action="store_true", + default=False, + help="Run warmup prompts before benchmark.") + parser.add_argument("--save-results", + action="store_true", + default=False, + help="save output results.") + args = parser.parse_args() + if 
args.tokenizer is None: + args.tokenizer = args.model + main(args) diff --git a/benchmarks/benchmark_serving_guided.py b/benchmarks/benchmark_serving_guided.py new file mode 100644 index 0000000000000..4435d87e18a8a --- /dev/null +++ b/benchmarks/benchmark_serving_guided.py @@ -0,0 +1,881 @@ +r"""Benchmark online serving throughput with guided decoding. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve <your_model> --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving_guided.py \ + --backend <backend> \ + --model <your_model> \ + --dataset json \ + --guided-decoding-ratio 1.0 \ + --guided-decoding-backend xgrammar \ + --request-rate 10 \ + --num-prompts 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" +import argparse +import asyncio +import dataclasses +import json +import os +import random +import time +import warnings +from dataclasses import dataclass +from typing import AsyncGenerator, List, Optional, Tuple + +import datasets +import numpy as np +import pandas as pd +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float
+ std_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: List[Tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: List[Tuple[float, float]] + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + schema: The guided-decoding constraint (JSON schema, grammar, regex + or choice list) whose kind is named by structure_type. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str + completion: str = None + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + +
?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" 
+ input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion)) + + return requests + + +async def get_request( + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[Tuple[int, SampleRequest], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. 
+ """ + input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + + for i, request in enumerate(input_requests): + yield i, request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + all_tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0 + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls 
+= outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], + ignore_eos: bool, + 
max_concurrency: Optional[int], + guided_decoding_ratio: float, + guided_decoding_backend: str, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + def prepare_extra_body(request) -> dict: + extra_body = {} + # Add the schema to the extra_body + extra_body[request.structure_type] = request.schema + # Add the specific guided_decoding_backend + extra_body["guided_decoding_backend"] = guided_decoding_backend + return extra_body + + print("Starting initial single prompt test run...") + guided_decoding_req_idx = random.sample( + range(len(input_requests)), + int(len(input_requests) * guided_decoding_ratio)) + + test_request = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=api_url, + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. 
Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/start_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + expected: List[str] = [] + async for i, request in get_request(input_requests, request_rate, + burstiness): + extra_body = prepare_extra_body( + request) if i in guided_decoding_req_idx else None + request_func_input = RequestFuncInput( + model=model_id, + prompt=request.prompt, + api_url=api_url, + prompt_len=request.prompt_len, + output_len=request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + expected.append(request.completion) + tasks.append( + 
asyncio.create_task( + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": + benchmark_duration, + "completed": + metrics.completed, + "total_input_tokens": + metrics.total_input, + "total_output_tokens": + metrics.total_output, + "request_throughput": + metrics.request_throughput, + "output_throughput": + metrics.output_throughput, + 
"total_token_throughput": + metrics.total_token_throughput, + "ttft_description": + pd.Series([output.ttft for output in outputs]).describe().to_dict(), + "tpot_description": + pd.Series([output.tpot for output in outputs]).describe().to_dict(), + "input_lens": [output.prompt_len for output in outputs], + "output_lens": + actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "errors": [output.error for output in outputs], + } + + ret = [{ + 'generated': output.generated_text, + 'expected': gt + } for output, gt in zip(outputs, expected)] + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 
1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result, ret + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'guided_json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'guided_regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'guided_choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, + trust_remote_code=args.trust_remote_code) + + if args.dataset == 'grammar': + args.structure_type = 
'guided_grammar' + elif args.dataset == 'regex': + args.structure_type = 'guided_regex' + elif args.dataset == 'choice': + args.structure_type = 'guided_choice' + else: + args.structure_type = 'guided_json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{backend}" + result_file_name += f"_{args.request_rate}qps" + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += ".txt" + else: + result_file_name = None + + input_requests = sample_requests(tokenizer, args) + + benchmark_result, ret = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + max_concurrency=args.max_concurrency, + guided_decoding_ratio=args.guided_decoding_ratio, + guided_decoding_backend=args.guided_decoding_backend, + )) + + # Save config and results to json + score = evaluate(ret, args) + print("correct_rate(%)", score, '\n') + if args.save_results: + results = { + "backend": + backend, + "model_id": + model_id, + "tokenizer_id": + tokenizer_id, + "num_prompts": + args.num_prompts, + "request_rate": + args.request_rate if args.request_rate < float("inf") else "inf", + "burstiness": + args.burstiness, + "max_concurrency": + args.max_concurrency, + "correct_rate(%)": + score + } + results = {"outputs": ret, **results, **benchmark_result} + + # Save to file + if args.result_filename: + result_file_name = 
args.result_filename + if args.result_dir: + result_file_name = os.path.join(args.result_dir, result_file_name) + with open(result_file_name, "w", encoding='utf-8') as outfile: + json.dump(results, outfile, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. 
This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Number of output tokens.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-results", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. 
The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". 
" + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--guided-decoding-backend", + type=str, + choices=["outlines", "lm-format-enforcer", "xgrammar"], + default="xgrammar", + help="Backend to use for guided decoding") + + args = parser.parse_args() + main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 159cf055737ce..1e5967bd9bf8b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -294,23 +294,36 @@ def main(args: argparse.Namespace): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: - # Synthesize a prompt with the given input length. - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. - for i in range(-10, 10): - prompt = "hi " * (args.input_len + i) - tokenized_prompt = tokenizer(prompt).input_ids - if len(tokenized_prompt) == args.input_len: - break - else: - raise ValueError( - f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [ - SampleRequest(prompt=prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len) - for _ in range(args.num_prompts) - ] + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) + for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. 
+ for _ in range(5): # Max attempts to correct + candidate_prompt = tokenizer.decode(candidate_ids) + tokenized_len = len(tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len)) else: requests = sample_requests(tokenizer, args) diff --git a/benchmarks/structured_schemas/structured_schema_1.json b/benchmarks/structured_schemas/structured_schema_1.json new file mode 100644 index 0000000000000..6003698469e8d --- /dev/null +++ b/benchmarks/structured_schemas/structured_schema_1.json @@ -0,0 +1,113 @@ +{ + "$schema": + "https://json-schema.org/draft/2020-12/schema", + "title": + "User Profile", + "type": + "object", + "properties": { + "userId": { + "type": "string", + "description": "Unique identifier for the user." + }, + "personalInfo": { + "type": "object", + "properties": { + "firstName": { + "type": "string", + "description": "The user's first name." + }, + "lastName": { + "type": "string", + "description": "The user's last name." + }, + "age": { + "type": "integer", + "minimum": 0, + "description": "The user's age." + }, + "phoneNumbers": { + "type": + "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["home", "work", "mobile"], + "description": "Type of phone number." + }, + "number": { + "type": "string", + "pattern": "^\\+?[1-9]\\d{1,14}$", + "description": "Phone number in E.164 format." + } + }, + "required": ["type", "number"] + }, + "description": + "List of phone numbers associated with the user." 
+ } + }, + "required": ["firstName", "lastName"] + }, + "address": { + "type": "object", + "properties": { + "street": { + "type": "string", + "description": "Street address." + }, + "city": { + "type": "string", + "description": "City name." + }, + "state": { + "type": "string", + "description": "State or province." + }, + "postalCode": { + "type": "string", + "pattern": "^\\d{5}(-\\d{4})?$", + "description": "Postal code." + }, + "country": { + "type": "string", + "description": "Country name." + } + }, + "required": ["street", "city", "state", "postalCode", "country"] + }, + "preferences": { + "type": "object", + "properties": { + "newsletterSubscribed": { + "type": + "boolean", + "description": + "Indicates if the user is subscribed to the newsletter." + }, + "favoriteCategories": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of user's favorite categories." + } + }, + "required": ["newsletterSubscribed"] + }, + "accountStatus": { + "type": "string", + "enum": ["active", "inactive", "suspended"], + "description": "Current status of the user's account." + }, + "registrationDate": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 formatted date-time of user registration." 
+ } + }, + "required": + ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] +} \ No newline at end of file diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 1be806bbfa43c..8a95279f9a25a 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -307,10 +307,20 @@ void reshape_and_cache_flash( torch::Tensor& key_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] - torch::Tensor& slot_mapping, // [num_tokens] + torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] const std::string& kv_cache_dtype, const double k_scale, const double v_scale) { - int num_tokens = key.size(0); + // NOTE(woosuk): In vLLM V1, key.size(0) can be different from + // slot_mapping.size(0) because of padding for CUDA graphs. + // In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because + // both include padding. + // In vLLM V1, however, key.size(0) can be larger than slot_mapping.size(0) + // since key includes padding for CUDA graphs, while slot_mapping does not. + // In this case, slot_mapping.size(0) represents the actual number of tokens + // before padding. + // For compatibility with both cases, we use slot_mapping.size(0) as the + // number of tokens. 
+ int num_tokens = slot_mapping.size(0); int num_heads = key.size(1); int head_size = key.size(2); int block_size = key_cache.size(1); diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 498d069c05f0d..dd1e6de2e0180 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -424,7 +424,7 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), // (which occurs when `final_state_position` is a non-positivie index) // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it - if (final_state_position < 0 && seqlen > kWidth){ + if (conv_states != nullptr && final_state_position < 0 && seqlen > kWidth){ input_t vals_load[kNElts] = {0}; if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ // chunk = n_chunks - 2, a segment of the final state sits in the last index diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 8ea240f59c38f..5c80645b405ae 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -16,4 +16,5 @@ mistral_common >= 1.5.0 aiohttp starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file +partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args +requests diff --git a/docs/source/conf.py b/docs/source/conf.py index 96ad9a4c26b09..e9d9ac68c9560 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,11 +10,13 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
+import inspect import logging import os import sys from typing import List +import requests from sphinx.ext import autodoc logger = logging.getLogger(__name__) @@ -34,6 +36,7 @@ extensions = [ "sphinx.ext.napoleon", "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", "sphinx.ext.autodoc", @@ -94,6 +97,69 @@ def setup(app): generate_examples() +_cached_base: str = "" +_cached_branch: str = "" + + +def get_repo_base_and_branch(pr_number): + global _cached_base, _cached_branch + if _cached_base and _cached_branch: + return _cached_base, _cached_branch + + url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + _cached_base = data['head']['repo']['full_name'] + _cached_branch = data['head']['ref'] + return _cached_base, _cached_branch + else: + logger.error("Failed to fetch PR details: %s", response) + return None, None + + +def linkcode_resolve(domain, info): + if domain != 'py': + return None + if not info['module']: + return None + filename = info['module'].replace('.', '/') + module = info['module'] + + # try to determine the correct file and line number to link to + obj = sys.modules[module] + + # get as specific as we can + lineno: int = 0 + filename: str = "" + try: + for part in info['fullname'].split('.'): + obj = getattr(obj, part) + + if not (inspect.isclass(obj) or inspect.isfunction(obj) + or inspect.ismethod(obj)): + obj = obj.__class__ # Get the class of the instance + + lineno = inspect.getsourcelines(obj)[1] + filename = (inspect.getsourcefile(obj) + or f"{filename}.py").split("vllm/", 1)[1] + except Exception: + # For some things, like a class member, won't work, so + # we'll use the line number of the parent (the class) + pass + + if filename.startswith("checkouts/"): + # a PR build on readthedocs + pr_number = filename.split("/")[1] + filename = filename.split("/", 2)[2] + base, branch = 
get_repo_base_and_branch(pr_number) + if base and branch: + return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" + + # Otherwise, link to the source file on the main branch + return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" + + # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ "compressed_tensors", @@ -112,6 +178,7 @@ def setup(app): "tensorizer", "pynvml", "outlines", + "xgrammar," "librosa", "soundfile", "gguf", diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst index 30f543abc20c7..c6d47f90b62d5 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.rst @@ -7,7 +7,7 @@ Multi-Modality vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` +Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities @@ -15,9 +15,6 @@ by following :ref:`this guide `. Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. -.. - TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported - Guides ++++++ diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index e3dbbc9affe66..9b6cb0e80d60e 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -21,7 +21,7 @@ You can install vLLM using pip: .. code-block:: console $ # (Recommended) Create a new conda environment. 
- $ conda create -n myenv python=3.10 -y + $ conda create -n myenv python=3.12 -y $ conda activate myenv $ # Install vLLM with CUDA 12.1. @@ -73,7 +73,7 @@ Another way to access the latest code is to use the docker images: .. code-block:: console $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT} + $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. @@ -89,45 +89,24 @@ Build from source Python-only build (without compilation) --------------------------------------- -If you only need to change Python code, you can simply build vLLM without compilation. - -The first step is to install the latest vLLM wheel: - -.. code-block:: console - - pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -After verifying that the installation is successful, you can use `the following script `_: +If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: .. code-block:: console $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ python python_only_dev.py + $ VLLM_USE_PRECOMPILED=1 pip install --editable . -The script will: +This will download the latest nightly wheel and use the compiled libraries from there in the install. -* Find the installed vLLM package in the current environment. -* Copy built files to the current directory. -* Rename the installed vLLM package. -* Symbolically link the current directory to the installed vLLM package. 
- -Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. - -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: +The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.3.post1 PyPI wheel `_: .. code-block:: console - $ python python_only_dev.py --quit-dev - -The ``--quit-dev`` flag will: - -* Remove the symbolic link from the current directory to the vLLM package. -* Restore the original vLLM package from the backup. + $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl + $ pip install --editable . -If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. +You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. .. note:: @@ -148,9 +127,13 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. 
+ The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. + Use an existing PyTorch installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/index.rst b/docs/source/index.rst index 0692e949f1c77..86b1eed2d26ba 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,12 +85,8 @@ Documentation serving/deploying_with_nginx serving/distributed_serving serving/metrics - serving/env_vars - serving/usage_stats serving/integrations serving/tensorizer - serving/compatibility_matrix - serving/faq .. toctree:: :maxdepth: 1 @@ -99,12 +95,21 @@ Documentation models/supported_models models/adding_model models/enabling_multimodal_inputs - models/engine_args - models/lora - models/vlm - models/structured_outputs - models/spec_decode - models/performance + +.. toctree:: + :maxdepth: 1 + :caption: Usage + + usage/lora + usage/multimodal_inputs + usage/structured_outputs + usage/spec_decode + usage/compatibility_matrix + usage/performance + usage/faq + usage/engine_args + usage/env_vars + usage/usage_stats .. toctree:: :maxdepth: 1 diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 49b5285c45590..5c1236e1a8972 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -3,7 +3,7 @@ Enabling Multimodal Inputs ========================== -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal ` inputs. +This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. .. 
seealso:: :ref:`adding_a_new_model` diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 9f3b6f59068e2..4e5b10967e3bb 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -471,6 +471,8 @@ Sentence Pair Scoring .. note:: These models are supported in both offline and online inference via Score API. +.. _supported_mm_models: + Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -489,13 +491,11 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -.. _supported_vlms: - Text Generation --------------- .. list-table:: - :widths: 25 25 15 25 5 5 + :widths: 25 25 15 20 5 5 5 :header-rows: 1 * - Architecture @@ -504,151 +504,194 @@ Text Generation - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + - V1 * - :code:`AriaForConditionalGeneration` - Aria - T + I - :code:`rhymes-ai/Aria` - - ✅︎ + - * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ + - * - :code:`ChameleonForConditionalGeneration` - Chameleon - T + I - :code:`facebook/chameleon-7b` etc. - - ✅︎ + - * - :code:`FuyuForCausalLM` - Fuyu - T + I - :code:`adept/fuyu-8b` etc. - - ✅︎ + - * - :code:`ChatGLMModel` - GLM-4V - T + I - :code:`THUDM/glm-4v-9b` etc. - ✅︎ - ✅︎ + - * - :code:`H2OVLChatModel` - H2OVL - T + I\ :sup:`E+` - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - ✅︎ + - * - :code:`Idefics3ForConditionalGeneration` - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - ✅︎ + - - * - :code:`InternVLChatModel` - - InternVL2 + - InternVL 2.5, Mono-InternVL, InternVL 2.0 - T + I\ :sup:`E+` - - :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. 
+ - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. - - ✅︎ + - ✅︎ * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - T + I\ :sup:`E+` - - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. + - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - ✅︎ + - ✅︎ * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT - T + I\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ + - * - :code:`LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - ✅︎ + - * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ + - * - :code:`MiniCPMV` - MiniCPM-V - T + I\ :sup:`E+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - ✅︎ - ✅︎ + - * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - T + I\ :sup:`+` - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - + - * - :code:`MolmoForCausalLM` - Molmo - T + I - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. - - ✅︎ + - ✅︎ * - :code:`NVLM_D_Model` - NVLM-D 1.0 - T + I\ :sup:`E+` - :code:`nvidia/NVLM-D-72B`, etc. - - ✅︎ + - ✅︎ * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - T + I\ :sup:`E` - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. - - ✅︎ + - * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + I\ :sup:`E+` - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. 
- - ✅︎ + - ✅︎ * - :code:`PixtralForConditionalGeneration` - Pixtral - T + I\ :sup:`+` - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - ✅︎ + - ✅︎ * - :code:`QWenLMHeadModel` - Qwen-VL - T + I\ :sup:`E+` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - ✅︎ - ✅︎ + - * - :code:`Qwen2AudioForConditionalGeneration` - Qwen2-Audio - T + A\ :sup:`+` - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - ✅︎ + - * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ + - * - :code:`UltravoxModel` - Ultravox - T + A\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - - ✅︎ + - | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. important:: + To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) + or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + + .. code-block:: python + + llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, + ) + + .. code-block:: bash + + vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 + .. note:: vLLM currently only supports adding LoRA to the language backbone of multimodal models. +.. note:: + To use :code:`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo (:code:`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) + and pass :code:`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. + .. note:: The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. 
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 diff --git a/docs/source/serving/deploying_with_kubeai.rst b/docs/source/serving/deploying_with_kubeai.rst new file mode 100644 index 0000000000000..ec3c065320fd9 --- /dev/null +++ b/docs/source/serving/deploying_with_kubeai.rst @@ -0,0 +1,17 @@ +.. _deploying_with_kubeai: + +Deploying with KubeAI +===================== + +`KubeAI `_ is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies. + + +Please see the Installation Guides for environment specific instructions: + +* `Any Kubernetes Cluster `_ +* `EKS `_ +* `GKE `_ + +Once you have KubeAI installed, you can +`configure text generation models `_ +using vLLM. \ No newline at end of file diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index f39997e0e44d9..0dd505a739863 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -6,6 +6,7 @@ Integrations run_on_sky deploying_with_kserve + deploying_with_kubeai deploying_with_triton deploying_with_bentoml deploying_with_cerebrium diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index c39cef85897ed..d75e90807ca1d 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -32,7 +32,7 @@ We currently support the following OpenAI APIs: - [Completions API](https://platform.openai.com/docs/api-reference/completions) - *Note: `suffix` parameter is not supported.* - [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) - - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; 
see [Using VLMs](../models/vlm.rst). + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). - *Note: `image_url.detail` parameter is not supported.* - We also support `audio_url` content type for audio files. - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. @@ -41,7 +41,7 @@ We currently support the following OpenAI APIs: - [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), which will be treated as a single prompt to the model according to its chat template. - - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. 
- *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Score API for Cross Encoder Models diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/usage/compatibility_matrix.rst similarity index 100% rename from docs/source/serving/compatibility_matrix.rst rename to docs/source/usage/compatibility_matrix.rst diff --git a/docs/source/models/engine_args.rst b/docs/source/usage/engine_args.rst similarity index 100% rename from docs/source/models/engine_args.rst rename to docs/source/usage/engine_args.rst diff --git a/docs/source/serving/env_vars.rst b/docs/source/usage/env_vars.rst similarity index 100% rename from docs/source/serving/env_vars.rst rename to docs/source/usage/env_vars.rst diff --git a/docs/source/serving/faq.rst b/docs/source/usage/faq.rst similarity index 99% rename from docs/source/serving/faq.rst rename to docs/source/usage/faq.rst index 9e858e612c8bf..ce327abd5fa20 100644 --- a/docs/source/serving/faq.rst +++ b/docs/source/usage/faq.rst @@ -1,3 +1,5 @@ +.. _faq: + Frequently Asked Questions =========================== diff --git a/docs/source/models/lora.rst b/docs/source/usage/lora.rst similarity index 99% rename from docs/source/models/lora.rst rename to docs/source/usage/lora.rst index ef0177eaf2162..c2c6fa2aebfaf 100644 --- a/docs/source/models/lora.rst +++ b/docs/source/usage/lora.rst @@ -1,7 +1,7 @@ .. _lora: -Using LoRA adapters -=================== +LoRA Adapters +============= This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. diff --git a/docs/source/models/vlm.rst b/docs/source/usage/multimodal_inputs.rst similarity index 62% rename from docs/source/models/vlm.rst rename to docs/source/usage/multimodal_inputs.rst index bcbe50a25fa09..c93f65327e31b 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/usage/multimodal_inputs.rst @@ -1,34 +1,31 @@ -.. _vlm: +.. 
_multimodal_inputs: -Using VLMs -========== +Multimodal Inputs +================= -vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here `. -This document shows you how to run and serve these models using vLLM. +This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. .. note:: - We are actively iterating on VLM support. See `this RFC `_ for upcoming changes, + We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, and `open an issue on GitHub `_ if you have any feedback or feature requests. Offline Inference ----------------- -Single-image input -^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - -To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: +To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: * ``prompt``: The prompt should follow the format that is documented on HuggingFace. * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. +Image +^^^^^ + +You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: + .. code-block:: python + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + # Refer to the HuggingFace repo for the correct format to use prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" @@ -41,41 +38,6 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT "multi_modal_data": {"image": image}, }) - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input - image_embeds = torch.load(...) 
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input with additional parameters - # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters. - mm_data = {} - - image_embeds = torch.load(...) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - # For Qwen2VL, image_grid_thw is needed to calculate positional encoding. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_grid_thw": torch.load(...) # torch.Tensor of shape (1, 3), - } - # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_size_list": [image.size] # list of image sizes - } - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - for o in outputs: generated_text = o.outputs[0].text print(generated_text) @@ -102,12 +64,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptT A code example can be found in `examples/offline_inference_vision_language.py `_. -Multi-image input -^^^^^^^^^^^^^^^^^ - -Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. - -To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: .. code-block:: python @@ -118,10 +75,6 @@ To enable multiple multi-modal items per text prompt, you have to set ``limit_mm limit_mm_per_prompt={"image": 2}, # The maximum number to accept ) -Instead of passing in a single image, you can pass in a list of images. - -.. 
code-block:: python - # Refer to the HuggingFace repo for the correct format to use prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" @@ -169,30 +122,114 @@ Multi-image input can be extended to perform video captioning. We show this with generated_text = o.outputs[0].text print(generated_text) +Video +^^^^^ + +You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Please refer to `examples/offline_inference_vision_language.py `_ for more details. + +Audio +^^^^^ + +You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. + +Please refer to `examples/offline_inference_audio_language.py `_ for more details. + +Embedding +^^^^^^^^^ + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +.. code-block:: python + + # Inference with image embeddings as input + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Embeddings for single image + # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +.. code-block:: python + + # Construct the prompt based on your model + prompt = ... + + # Embeddings for multiple images + # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) 
+ + # Qwen2-VL + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } + } + + # MiniCPM-V + llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. + "image_size_list": [image.size for image in images], # list of image sizes + } + } + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + Online Inference ---------------- -OpenAI Vision API -^^^^^^^^^^^^^^^^^ +Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. + +.. important:: + A chat template is **required** to use Chat Completions API. + + Although most models come with a chat template, for others you have to define one yourself. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. + +Image +^^^^^ -You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. +Image input is supported according to `OpenAI Vision API `_. +Here is a simple example using Phi-3.5-Vision. -Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. +First, launch the OpenAI-compatible server: .. code-block:: bash vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 -.. 
important:: - Since OpenAI Vision API is based on `Chat Completions API `_, - a chat template is **required** to launch the API server. - - Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. - -To consume the server, you can use the OpenAI client like in the example below: +Then, you can use the OpenAI client as follows: .. code-block:: python @@ -252,22 +289,59 @@ A full code example can be found in `examples/openai_chat_completion_client_for_ .. note:: - By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: + By default, the timeout for fetching images through HTTP URL is ``5`` seconds. + You can override this by setting the environment variable: .. code-block:: console $ export VLLM_IMAGE_FETCH_TIMEOUT= -Chat Embeddings API -^^^^^^^^^^^^^^^^^^^ +Video +^^^^^ + +Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. + +You can use `these tests `_ as reference. + +.. note:: + + By default, the timeout for fetching videos through HTTP URL is ``30`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_VIDEO_FETCH_TIMEOUT= -vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, -where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. +Audio +^^^^^ + +Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`. + +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. + +.. 
note:: + + By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_AUDIO_FETCH_TIMEOUT= + +Embedding +^^^^^^^^^ + +vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. .. tip:: The schema of ``messages`` is exactly the same as in Chat Completions API. + You can refer to the above tutorials for more details on how to pass each type of multi-modal data. -In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. To serve the model: .. code-block:: bash @@ -279,10 +353,8 @@ In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` to run this model in embedding mode instead of text generation mode. -.. important:: - - VLM2Vec does not expect chat-based input. We use a `custom chat template `_ - to combine the text and images together. + The custom chat template is completely different from the original one for this model, + and can be found `here `__. Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: @@ -310,7 +382,7 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) -Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. 
+Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. .. code-block:: bash @@ -319,8 +391,10 @@ Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. .. important:: - Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, - which is handled by the jinja template. + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. + + Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled + by `this custom chat template `__. .. important:: diff --git a/docs/source/models/performance.rst b/docs/source/usage/performance.rst similarity index 100% rename from docs/source/models/performance.rst rename to docs/source/usage/performance.rst diff --git a/docs/source/models/spec_decode.rst b/docs/source/usage/spec_decode.rst similarity index 97% rename from docs/source/models/spec_decode.rst rename to docs/source/usage/spec_decode.rst index d57ffec53215d..f1f1917f974bb 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/usage/spec_decode.rst @@ -1,13 +1,16 @@ .. _spec_decode: -Speculative decoding in vLLM -============================ +Speculative decoding +==================== .. warning:: Please note that speculative decoding in vLLM is not yet optimized and does not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work to optimize it is ongoing and can be followed in `this issue. `_ +.. warning:: + Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. + This document shows how to use `Speculative Decoding `_ with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. @@ -182,7 +185,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. 
**vLLM Logprob Stability** - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. + titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. **Conclusion** @@ -197,7 +200,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. Resources for vLLM contributors ------------------------------- diff --git a/docs/source/models/structured_outputs.rst b/docs/source/usage/structured_outputs.rst similarity index 100% rename from docs/source/models/structured_outputs.rst rename to docs/source/usage/structured_outputs.rst diff --git a/docs/source/serving/usage_stats.md b/docs/source/usage/usage_stats.md similarity index 100% rename from docs/source/serving/usage_stats.md rename to docs/source/usage/usage_stats.md diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index f08f22eec164a..c6a274ee5894b 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -223,7 +223,7 @@ def run_internvl(question: str, modality: str): # Stop tokens for InternVL # models variants may have different stop tokens # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B#service + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] 
return llm, prompt, stop_token_ids @@ -419,6 +419,22 @@ def run_aria(question: str, modality: str): return llm, prompt, stop_token_ids +# Mantis +def run_mantis(question: str, modality: str): + assert modality == "image" + + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501 + prompt = llama3_template.format(f"{question}\n") + + llm = LLM( + model="TIGER-Lab/Mantis-8B-siglip-llama3", + max_model_len=4096, + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, + ) + stop_token_ids = [128009] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -441,6 +457,7 @@ def run_aria(question: str, modality: str): "glm4v": run_glm4v, "idefics3": run_idefics3, "aria": run_aria, + "mantis": run_mantis, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 788b604cfd4a0..928bbef54eab7 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -165,7 +165,7 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: # Stop tokens for InternVL # models variants may have different stop tokens # please refer to the model card for the correct "stop words": - # https://huggingface.co/OpenGVLab/InternVL2-2B#service + # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] diff --git a/examples/tool_chat_template_llama3.2_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja index 39f902c1c3c40..2b290c0eede03 100644 --- a/examples/tool_chat_template_llama3.2_json.jinja +++ b/examples/tool_chat_template_llama3.2_json.jinja @@ -26,13 +26,11 @@ {%- endfor %} {%- endfor %} - {#- 
This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} {%- if messages[0]['content'] is string %} {%- set system_message = messages[0]['content']|trim %} {%- else %} - {#- Support vLLM's transforming of a content string to JSON. #} {%- set system_message = messages[0]['content'][0]['text']|trim %} {%- endif %} {%- set messages = messages[1:] %} @@ -44,14 +42,8 @@ {%- endif %} {%- endif %} -{#- Including an image is not compatible with a system message #} -{%- if image_ns.has_images and not system_message == "" %} - {{- raise_exception("Prompting with images is incompatible with system messages and tool use.") }} -{%- endif %} - - -{#- System message, if there are no images #} -{%- if not image_ns.has_images %} +{#- System message if there are no images, if the user supplied one, or if tools are used (default tool system message) #} +{%- if system_message or not image_ns.has_images %} {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} {%- if tools is not none %} {{- "Environment: ipython\n" }} diff --git a/python_only_dev.py b/python_only_dev.py index 1ca0f5c30b741..f70b4984025b3 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -1,92 +1,14 @@ -# enable python only development -# copy compiled files to the current directory directly +msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). -import argparse -import os -import shutil -import subprocess -import sys -import warnings +TL;DR: -parser = argparse.ArgumentParser( - description="Development mode for python-only code") -parser.add_argument('-q', - '--quit-dev', - action='store_true', - help='Set the flag to quit development mode') -args = parser.parse_args() +VLLM_USE_PRECOMPILED=1 pip install -e . 
-# cannot directly `import vllm` , because it will try to -# import from the current directory -output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"], - capture_output=True) +or -assert output.returncode == 0, "vllm is not installed" +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +pip install -e . +""" # noqa -text = output.stdout.decode("utf-8") - -package_path = None -for line in text.split("\n"): - if line.startswith("Location: "): - package_path = line.split(": ")[1] - break - -assert package_path is not None, "could not find package path" - -cwd = os.getcwd() - -assert cwd != package_path, "should not import from the current directory" - -files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", - "vllm/vllm_flash_attn/flash_attn_interface.py", - "vllm/vllm_flash_attn/__init__.py", - # "vllm/_version.py", # not available in nightly wheels yet -] - -# Try to create _version.py to avoid version related warning -# Refer to https://github.com/vllm-project/vllm/pull/8771 -try: - from setuptools_scm import get_version - get_version(write_to="vllm/_version.py") -except ImportError: - warnings.warn( - "To avoid warnings related to vllm._version, " - "you should install setuptools-scm by `pip install setuptools-scm`", - stacklevel=2) - -if not args.quit_dev: - for file in files_to_copy: - src = os.path.join(package_path, file) - dst = file - print(f"Copying {src} to {dst}") - shutil.copyfile(src, dst) - - pre_built_vllm_path = os.path.join(package_path, "vllm") - tmp_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - shutil.copytree(pre_built_vllm_path, tmp_path) - 
shutil.rmtree(pre_built_vllm_path) - - print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") - os.symlink(current_vllm_path, pre_built_vllm_path) -else: - vllm_symlink_path = os.path.join(package_path, "vllm") - vllm_backup_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}") - assert os.path.islink( - vllm_symlink_path - ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link" - assert current_vllm_path == os.readlink( - vllm_symlink_path - ), "current directory is not the source code of package" - os.unlink(vllm_symlink_path) - - print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}") - os.rename(vllm_backup_path, vllm_symlink_path) +print(msg) diff --git a/requirements-common.txt b/requirements-common.txt index 02e3d65fb774c..112528880c0ac 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,6 +19,7 @@ prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 +xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 4674efb812cfd..17d40d0ee131a 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@e096d6f diff --git a/requirements-test.in b/requirements-test.in index 44972866ddc4b..c0b228148ab31 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -24,9 +24,6 @@ mistral_common[opencv] >= 1.5.0 # required for pixtral test 
datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - # quantization bitsandbytes>=0.44.0 buildkite-test-collector==0.1.9 diff --git a/requirements-test.txt b/requirements-test.txt index a59b85023948b..38a064bca449a 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.9 +# This file is autogenerated by pip-compile with Python 3.12 # by the following command: # -# pip-compile requirements-test.in +# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt # absl-py==2.1.0 # via rouge-score @@ -27,10 +27,6 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator -async-timeout==4.0.3 - # via - # aiohttp - # redis attrs==24.2.0 # via # aiohttp @@ -111,10 +107,6 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval -exceptiongroup==1.2.2 - # via - # anyio - # pytest fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -165,8 +157,6 @@ idna==3.10 # httpx # requests # yarl -importlib-resources==6.4.5 - # via matplotlib inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 @@ -518,12 +508,6 @@ timm==1.0.11 # via -r requirements-test.in tokenizers==0.20.3 # via transformers -toml==0.10.2 - # via datamodel-code-generator -tomli==2.0.2 - # via - # black - # pytest torch==2.5.1 # via # -r requirements-test.in @@ -550,7 +534,7 @@ tqdm==4.66.6 # transformers tqdm-multiprocess==0.0.11 # via lm-eval -transformers==4.45.2 +transformers==4.46.3 # via # lm-eval # peft @@ -567,12 +551,9 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via - # anyio - # black # huggingface-hub # librosa # mistral-common - # multidict # pydantic # pydantic-core # torch @@ -590,8 +571,6 @@ xxhash==3.5.0 # evaluate yarl==1.17.1 # 
via aiohttp -zipp==3.20.2 - # via importlib-resources zstandard==0.23.0 # via lm-eval diff --git a/setup.py b/setup.py index 9d4bae6dd265e..6b1b66e41fb11 100644 --- a/setup.py +++ b/setup.py @@ -249,6 +249,74 @@ def run(self): self.copy_file(file, dst_file) +class repackage_wheel(build_ext): + """Extracts libraries and other files from an existing wheel.""" + default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + def run(self) -> None: + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", + self.default_wheel) + + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/flash_attn_interface.py", + "vllm/vllm_flash_attn/__init__.py", + # "vllm/_version.py", # not available in nightly wheels yet + ] + file_members = filter(lambda x: x.filename in files_to_copy, + wheel.filelist) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = 
os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + + def _is_hpu() -> bool: is_hpu_available = True try: @@ -397,12 +465,15 @@ def get_vllm_version() -> str: if envs.VLLM_TARGET_DEVICE == "empty": version += f"{sep}empty" elif _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - # skip this for source tarball, required for pypi - if "sdist" not in sys.argv: - version += f"{sep}cu{cuda_version_str}" + if envs.VLLM_USE_PRECOMPILED: + version += ".precompiled" + else: + cuda_version = str(get_nvcc_cuda_version()) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + # skip this for source tarball, required for pypi + if "sdist" not in sys.argv: + version += f"{sep}cu{cuda_version_str}" elif _is_hip(): # Get the HIP version hipcc_version = get_hipcc_rocm_version() @@ -515,13 +586,18 @@ def _read_requirements(filename: str) -> List[str]: package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } -if envs.VLLM_USE_PRECOMPILED: - ext_modules = [] - package_data["vllm"].append("*.so") if _no_device(): ext_modules = [] +if not ext_modules: + cmdclass = {} +else: + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } + setup( name="vllm", version=get_vllm_version(), @@ -558,7 +634,7 @@ def _read_requirements(filename: str) -> List[str]: "audio": ["librosa", "soundfile"], # Required for audio processing "video": ["decord"] # Required for video processing }, - cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, + cmdclass=cmdclass, 
package_data=package_data, entry_points={ "console_scripts": [ diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index 7ef502abee345..aa11524812cdd 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -7,7 +7,6 @@ from torch import nn from torch.library import Library -from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, @@ -81,6 +80,7 @@ def test_simple_piecewise_compile(): use_cudagraph=True, splitting_ops=["silly.attention"], cudagraph_copy_inputs=True, + cudagraph_capture_sizes=[1, 2], )) with set_current_vllm_config(vllm_config): model = SillyModel(vllm_config=vllm_config, prefix='') @@ -96,11 +96,10 @@ def test_simple_piecewise_compile(): 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): - with set_compile_context([1, 2]): - model(inputs) + model(inputs) - model(torch.randn(2).cuda()) - model(torch.randn(1).cuda()) + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) input = torch.zeros(2).cuda() global global_counter diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index dbd5a3bbffeab..07c10a3a18c55 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -13,7 +13,6 @@ from torch import nn from torch.library import Library -from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, @@ -256,6 +255,7 @@ def run_model(llama_config, compilation_config = CompilationConfig( level=CompilationLevel.PIECEWISE, use_cudagraph=True, + cudagraph_capture_sizes=[1, 
2], ) if split_attn: compilation_config.splitting_ops = ["silly.attention"] @@ -273,10 +273,9 @@ def run_model(llama_config, input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() positions = torch.arange(B).cuda() - with set_compile_context([1, 2]): - model(input_ids, positions) - model(input_ids[:2], positions[:2]) - model(input_ids[:1], positions[:1]) + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) input_ids[:2].zero_() output = model(input_ids[:2], positions[:2]) @@ -379,10 +378,13 @@ def benchmark(): level=CompilationLevel.PIECEWISE, use_cudagraph=True, splitting_ops=["silly.attention"], + cudagraph_capture_sizes=cudagraph_sizes, ) else: compilation_config = CompilationConfig( - level=CompilationLevel.PIECEWISE, ) + level=CompilationLevel.PIECEWISE, + cudagraph_capture_sizes=cudagraph_sizes, + ) vllm_config = VllmConfig(compilation_config=compilation_config) with set_current_vllm_config(vllm_config): @@ -396,17 +398,16 @@ def benchmark(): graphs = {} - with set_compile_context(cudagraph_sizes): - model(input_ids, positions) - for b in cudagraph_sizes[::-1]: - if not piecewise: - graph = torch.cuda.CUDAGraph() - with torch.cuda.graph(graph, pool=pool): - output = model(input_ids[:b], positions[:b]) - graphs[b] = (graph, output) - else: + model(input_ids, positions) + for b in cudagraph_sizes[::-1]: + if not piecewise: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=pool): output = model(input_ids[:b], positions[:b]) - graphs[b] = (model, output) + graphs[b] = (graph, output) + else: + output = model(input_ids[:b], positions[:b]) + graphs[b] = (model, output) for b in cudagraph_sizes: if piecewise: # noqa is for `Function definition does not bind loop variable` diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 386877e0e0a2c..b818ca921fcb0 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ 
b/tests/distributed/test_pipeline_parallel.py @@ -247,9 +247,19 @@ def _compare_tp( *, method: Literal["generate", "encode"], ): - tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup - multi_node_only, trust_remote_code, tokenizer_mode, \ - load_format, hf_overrides = test_options + ( + tp_size, + pp_size, + eager_mode, + chunked_prefill, + ) = parallel_setup + ( + multi_node_only, + trust_remote_code, + tokenizer_mode, + load_format, + hf_overrides, + ) = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index fb24d6bc2c100..3e9b0e10a11d8 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -61,8 +61,8 @@ def worker_fn(): dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): tensor = pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == pynccl_comm.world_size + torch.cuda.synchronize() + assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -86,12 +86,12 @@ def multiple_allreduce_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = pynccl_comm.all_reduce(tensor) tensor = pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 4 + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() else: tensor = pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 2 + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -112,12 +112,12 @@ def multiple_allreduce_with_vllm_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 4 + 
torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() else: tensor = tensor_model_parallel_all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 2 + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -141,10 +141,10 @@ def worker_fn_with_cudagraph(): graph, stream=pynccl_comm.stream), pynccl_comm.change_state( enable=True): a_out = pynccl_comm.all_reduce(a) - pynccl_comm.stream.synchronize() + torch.cuda.synchronize() graph.replay() - pynccl_comm.stream.synchronize() - assert a_out.mean().cpu().item() == pynccl_comm.world_size**1 + torch.cuda.synchronize() + assert torch.all(a_out == pynccl_comm.world_size).cpu().item() @worker_fn_wrapper @@ -170,6 +170,7 @@ def all_gather_worker_fn(): with pynccl_comm.change_state(enable=True): pynccl_comm.all_gather(result, tensor) + torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -207,6 +208,7 @@ def reduce_scatter_worker_fn(): with pynccl_comm.change_state(enable=True): pynccl_comm.reduce_scatter(result, tensor) + torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -241,8 +243,8 @@ def send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - result = tensor.mean().cpu().item() - assert result == 1 + torch.cuda.synchronize() + assert torch.all(tensor == 1).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -280,11 +282,11 @@ def multiple_send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - result = tensor.mean().cpu().item() + torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: - assert result == 1 + assert torch.all(tensor == 1).cpu().item() else: - assert result == 2 + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -293,6 +295,38 @@ def 
test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +def test_pynccl_broadcast(): + distributed_run(broadcast_worker_fn, 4) + + +@worker_fn_wrapper +def broadcast_worker_fn(): + # Test broadcast for every root rank. + # Essentially this is an all-gather operation. + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + recv_tensors = [ + torch.empty(16, + 1024, + 1024, + dtype=torch.float32, + device=pynccl_comm.device) + for i in range(pynccl_comm.world_size) + ] + recv_tensors[pynccl_comm.rank] = torch.ones( + 16, 1024, 1024, dtype=torch.float32, + device=pynccl_comm.device) * pynccl_comm.rank + + for i in range(pynccl_comm.world_size): + pynccl_comm.broadcast(recv_tensors[i], src=i) + # the broadcast op might be launched in a different stream + # need to synchronize to make sure the tensor is ready + torch.cuda.synchronize() + assert torch.all(recv_tensors[i] == i).cpu().item() + + def test_ncclGetUniqueId(): lib = NCCLLibrary() unique_id = lib.ncclGetUniqueId() diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index de78d41ad12eb..4e269de9fc40b 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -50,12 +50,12 @@ def test_compilation_config(): args = parser.parse_args(["-O=3"]) assert args.compilation_config.level == 3 - # set to json - args = parser.parse_args(["--compilation-config", '{"level": 3}']) + # set to string form of a dict + args = parser.parse_args(["--compilation-config", "{'level': 3}"]) assert args.compilation_config.level == 3 - # set to json - args = parser.parse_args(['--compilation-config={"level": 3}']) + # set to string form of a dict + args = parser.parse_args(["--compilation-config={'level': 3}"]) assert args.compilation_config.level == 3 diff --git a/tests/entrypoints/conftest.py 
b/tests/entrypoints/conftest.py index e7ef5637c8ccb..0f7d15e1d85aa 100644 --- a/tests/entrypoints/conftest.py +++ b/tests/entrypoints/conftest.py @@ -69,6 +69,37 @@ def sample_json_schema(): } +@pytest.fixture +def sample_complex_json_schema(): + return { + "type": "object", + "properties": { + "score": { + "type": "integer", + "minimum": 0, + "maximum": 100 # Numeric range + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "pattern": + "^[a-z]{1,10}$" # Combining length and pattern restrictions + } + } + }, + "required": ["score", "grade", "email", "tags"] + } + + @pytest.fixture def sample_guided_choice(): return [ diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py index 67c79415f322a..de6257cfc551c 100644 --- a/tests/entrypoints/llm/test_guided_generate.py +++ b/tests/entrypoints/llm/test_guided_generate.py @@ -76,6 +76,34 @@ def test_guided_json_completion(sample_json_schema, llm): jsonschema.validate(instance=output_json, schema=sample_json_schema) +@pytest.mark.skip_global_cleanup +def test_guided_complex_json_completion(sample_complex_json_schema, llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema)) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an assignment grade " + f"that fits this schema: {sample_complex_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json 
= json.loads(generated_text) + jsonschema.validate(instance=output_json, + schema=sample_complex_json_schema) + + @pytest.mark.skip_global_cleanup def test_guided_choice_completion(sample_guided_choice, llm): sampling_params = SamplingParams( @@ -159,3 +187,30 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): sampling_params=sampling_params, use_tqdm=True, guided_options_request=dict(guided_regex=sample_regex)) + + +@pytest.mark.skip_global_cleanup +def test_guided_json_object(llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=100, + guided_decoding=GuidedDecodingParams(json_object=True)) + + outputs = llm.generate( + prompts=("Generate a JSON object describing a person with name " + "and age for John Smith who is 31 years old."), + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + + # Parse to verify it is valid JSON + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) diff --git a/tests/entrypoints/openai/test_vision.py b/tests/entrypoints/openai/test_vision.py index 157d873a75b4d..a0b6edd566561 100644 --- a/tests/entrypoints/openai/test_vision.py +++ b/tests/entrypoints/openai/test_vision.py @@ -89,7 +89,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, choice = chat_completion.choices[0] assert choice.finish_reason == "length" assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=772, total_tokens=782) + completion_tokens=10, prompt_tokens=775, total_tokens=785) message = choice.message message = chat_completion.choices[0].message @@ -181,7 +181,7 @@ async def test_single_chat_session_image_base64encoded( choice = chat_completion.choices[0] assert choice.finish_reason == "length" 
assert chat_completion.usage == openai.types.CompletionUsage( - completion_tokens=10, prompt_tokens=772, total_tokens=782) + completion_tokens=10, prompt_tokens=775, total_tokens=785) message = choice.message message = chat_completion.choices[0].message diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index d0c43b47bf0af..425f2a10ec855 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -95,5 +95,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, assert len(embeddings["data"]) == 1 assert len(embeddings["data"][0]["embedding"]) == 3072 assert embeddings["usage"]["completion_tokens"] == 0 - assert embeddings["usage"]["prompt_tokens"] == 762 - assert embeddings["usage"]["total_tokens"] == 762 + assert embeddings["usage"]["prompt_tokens"] == 765 + assert embeddings["usage"]["total_tokens"] == 765 diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index f9b11018288be..51be2425d7dd7 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -149,13 +149,14 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor, @pytest.mark.parametrize("itype", [torch.bfloat16, torch.float]) @pytest.mark.parametrize("silu_activation", [True]) @pytest.mark.parametrize("has_bias", [True]) +@pytest.mark.parametrize("has_initial_state", [True, False]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize( 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]) @pytest.mark.parametrize('dim', [64]) @pytest.mark.parametrize('batch', [1]) def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, - itype): + has_initial_state, itype): device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (3e-3, 5e-3) if itype == torch.bfloat16: @@ -167,11 +168,18 @@ def test_causal_conv1d(batch, dim, seqlen, width, 
has_bias, silu_activation, weight = torch.randn(dim, width, device=device, dtype=itype) bias = torch.randn(dim, device=device, dtype=itype) if has_bias else None - initial_states = torch.randn(batch, - dim, - width - 1, - device=device, - dtype=itype) + if has_initial_state: + initial_states = torch.randn(batch, + dim, + width - 1, + device=device, + dtype=itype) + has_initial_state_tensor = torch.ones(batch, + dtype=torch.bool, + device=x.device) + else: + initial_states = None + has_initial_state_tensor = None x_ref = x.clone() weight_ref = weight.clone() bias_ref = bias.clone() if bias is not None else None @@ -183,9 +191,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, bias, activation=activation, conv_states=initial_states, - has_initial_state=torch.ones(batch, - dtype=torch.bool, - device=x.device)) + has_initial_state=has_initial_state_tensor) out_ref, final_states_ref = causal_conv1d_ref( x_ref, weight_ref, @@ -193,11 +199,12 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, initial_states=initial_states_ref, return_final_states=True, activation=activation) - assert initial_states is not None and final_states_ref is not None - assert torch.allclose(initial_states, - final_states_ref, - rtol=rtol, - atol=atol) + if has_initial_state: + assert initial_states is not None and final_states_ref is not None + assert torch.allclose(initial_states, + final_states_ref, + rtol=rtol, + atol=atol) assert torch.allclose(out, out_ref, rtol=rtol, atol=atol) causal_conv1d_opcheck_fn(x, @@ -205,9 +212,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, bias, activation=activation, conv_states=initial_states, - has_initial_state=torch.ones(batch, - dtype=torch.bool, - device=x.device)) + has_initial_state=has_initial_state_tensor) @pytest.mark.parametrize("itype", [torch.bfloat16]) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index 15e576cb065c7..fb8c0b2a7ba26 
100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -28,7 +28,7 @@ # yapf: enable from vllm.lora.models import (LongContextLoRAContext, LoRALayerWeights, PackedLoRALayerWeights) -from vllm.lora.punica import PunicaWrapper +from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.model_executor.layers.linear import (ColumnParallelLinear, MergedColumnParallelLinear, QKVParallelLinear, @@ -48,11 +48,12 @@ torch.float32: (5e-3, 5e-3), torch.bfloat16: (3e-2, 2e-2), } -CUDA_DEVICES = [ +# TODO: Modify this based on platform +DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] -# We will launch different triton kernels between the prefill and decode +#For GPU, we will launch different triton kernels between the prefill and decode # stages, so we need to verify this. prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -192,9 +193,18 @@ def create_random_inputs( return inputs, index_mapping, prompt_mapping +def check_punica_wrapper(punica_wrapper) -> bool: + if current_platform.is_cuda_alike(): + from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU + + return type(punica_wrapper) is PunicaWrapperGPU + else: + return False + + @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: @@ -205,7 +215,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: torch.set_default_device(device) max_loras = 8 - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -296,7 +307,7 @@ def 
create_random_embedding_layer(): # @pytest.mark.skip( # reason="Fails when loras are in any slot other than the first.") @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings_with_new_embeddings(dist_init, num_loras, device, @@ -305,7 +316,8 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -432,7 +444,7 @@ def create_random_embedding_layer(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("stage", STAGES) def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, @@ -441,7 +453,8 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, lora_dtype=torch.float16) @@ -563,17 +576,21 @@ def _pretest(): @torch.inference_mode() @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) -def test_linear_replicated(dist_init, num_loras, device, 
stage) -> None: +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_replicated_layer(): @@ -585,7 +602,12 @@ def create_random_linear_replicated_layer(): lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -667,19 +689,22 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("orientation", ["row", "column"]) @pytest.mark.parametrize("fully_shard", [True, False]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + 
lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_parallel_layer(): if orientation == "row": @@ -700,7 +725,12 @@ def create_random_linear_parallel_layer(): if not fully_shard else ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -782,19 +812,22 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("repeats", [1, 2, 3]) @pytest.mark.parametrize("fully_shard", [True, False]) -@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: torch.cuda.set_device(device) torch.set_default_device(device) - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_column_parallel_packed_layer(): if repeats == 2: @@ -832,10 +865,16 @@ class FakeConfig: num_key_value_heads = 32 num_attention_heads = 32 + n_slices = repeats lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == n_slices) + if bias_enabled: + assert 
len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -911,7 +950,6 @@ class FakeConfig: 512, lora_config.lora_extra_vocab_size, ) - # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -941,7 +979,8 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, seed = 0 current_platform.seed_everything(seed) torch.set_default_device(device) - punica_wrapper = PunicaWrapper(8192, 256, device) + punica_wrapper = get_punica_wrapper(8192, 256, device) + assert check_punica_wrapper(punica_wrapper) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index aae6310a2a213..d3ca7f878191a 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -55,15 +55,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@fork_new_process_for_each_test -def test_llama_lora(sql_lora_files): - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - +def generate_and_test(llm, sql_lora_files): print("lora adapter created") assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT @@ -79,6 +71,17 @@ def test_llama_lora(sql_lora_files): print("removing lora") +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + generate_and_test(llm, sql_lora_files) + + @fork_new_process_for_each_test def test_llama_lora_warmup(sql_lora_files): """Test that the LLM initialization works with a warmup LORA path and @@ -118,20 +121,7 @@ def test_llama_lora_tp4(sql_lora_files): max_loras=4, tensor_parallel_size=4, ) - - print("lora adapter created") 
- assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT - - print("removing lora") + generate_and_test(llm, sql_lora_files) @multi_gpu_test(num_gpus=4) @@ -146,16 +136,20 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): tensor_parallel_size=4, fully_sharded_loras=True, ) - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + generate_and_test(llm, sql_lora_files) - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): - print("removing lora") + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + enable_lora_bias=True, + ) + generate_and_test(llm, sql_lora_files) diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index daa39b2a3dba1..d225a3f7d6c06 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -17,6 +17,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): tokenizer_id="gpt2", enable_lora=True, max_num_seqs=1, + max_loras=1, max_input_length=None, ) lora_request = LoRARequest("1", 1, sql_lora_files) @@ -53,3 +54,22 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = 
get_lora_tokenizer(lora_request) assert not tokenizer + + +@pytest.mark.parametrize("enable_lora", [True, False]) +@pytest.mark.parametrize("max_num_seqs", [1, 2]) +@pytest.mark.parametrize("max_loras", [1, 2]) +def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): + tokenizer_group = get_tokenizer_group( + get_tokenizer_pool_config(None), + tokenizer_id="gpt2", + enable_lora=enable_lora, + max_num_seqs=max_num_seqs, + max_loras=max_loras, + max_input_length=None, + ) + if enable_lora: + assert tokenizer_group.lora_tokenizers.capacity == max( + max_num_seqs, max_loras) + else: + assert tokenizer_group.lora_tokenizers.capacity == 0 diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index 45fab8e96b968..9f4d81b583141 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -36,7 +36,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema): @pytest.mark.asyncio -@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("backend", + ["outlines", "lm-format-enforcer", "xgrammar"]) async def test_guided_logits_processor_black_box(backend: str, sample_regex, sample_json_schema): tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 87a05b3011393..cae25ae9fa2c8 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -1,8 +1,8 @@ import pytest from tests.utils import multi_gpu_test +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -189,7 +189,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch 
size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 01e208347bff4..35018c3c14dee 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -5,8 +5,8 @@ import pytest from transformers import AutoModelForCausalLM, AutoTokenizer +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -200,7 +200,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index 60a8f63eb5faa..c16192a1e1438 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -2,12 +2,10 @@ from typing import Optional import pytest -import torch -from transformers import AutoImageProcessor, AutoTokenizer +from transformers import AutoTokenizer -from vllm.inputs import InputContext, token_inputs +from vllm.inputs import InputContext, InputProcessingContext from 
vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID -from vllm.multimodal import MultiModalRegistry from .....conftest import _ImageAssets from ....utils import build_model_context @@ -17,15 +15,9 @@ # Wrap lazy imports to avoid initializing CUDA during test collection @pytest.fixture() -def input_processor_for_phi3v(): - from vllm.model_executor.models.phi3v import input_processor_for_phi3v - return input_processor_for_phi3v - - -@pytest.fixture() -def dummy_data_for_phi3v(): - from vllm.model_executor.models.phi3v import dummy_data_for_phi3v - return dummy_data_for_phi3v +def processor_for_phi3v(): + from vllm.model_executor.models.phi3v import Phi3VProcessor + return Phi3VProcessor @pytest.fixture() @@ -34,53 +26,6 @@ def get_max_phi3v_image_tokens(): return get_max_phi3v_image_tokens -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_crops", [4, 16, None]) -def test_input_mapper_override(model: str, image_assets: _ImageAssets, - num_crops: Optional[int]): - """Ensure that the [default] input mapper handles num_crops properly.""" - # We pass the processor kwargs here since for this model, we fall back to - # the default mapper; this will fall back to the HF mapper and forward - # mm_processor_kwargs to it. 
- mm_processor_kwargs = { - "num_crops": num_crops - } if num_crops is not None else {} - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=mm_processor_kwargs, - ) - - hf_processor = AutoImageProcessor.from_pretrained(model, - trust_remote_code=True, - **mm_processor_kwargs) - - mm_registry = MultiModalRegistry() - mm_registry.init_mm_limits_per_prompt(ctx.model_config) - - image = image_assets[0].pil_image - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - - vllm_result = mm_registry.map_input( - ctx.model_config, - {"image": image}, - ) - - assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"]) - assert torch.all( - hf_result["num_img_tokens"] == vllm_result["num_img_tokens"]) - - # For pixel values, the second axis should be the num_crops + 1 - # for the rescaled original image. The default value in VLLM falls - # back to the HF config, which is why we compare to the processor num_crops - assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) - assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1 - - @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("num_crops,expected_max_tokens", [ (4, 781), @@ -112,48 +57,20 @@ def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, @pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [ - (4, 781, 1), - (4, 781, 2), - (16, 2653, 1), - (16, 2653, 2), -]) -def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, - toks_per_img: int, num_imgs: int): - """Ensure dummy_data_for_phi3v handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the dummy data func. 
- ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - dummy_data = dummy_data_for_phi3v( - ctx=ctx, - seq_len=8192, # Should be bigger than num_imgs * toks_per_img - mm_counts={"image": num_imgs}, - num_crops=num_crops, - ) - sequence_data = dummy_data.seq_data - # Ensure we have the right number of placeholders per num_crops size - img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) - assert img_tok_count == toks_per_img * num_imgs - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [ - (4, 757, 1), - (4, 757, 2), - (16, 1921, 1), - (16, 1921, 2), -]) -def test_input_processor_override(input_processor_for_phi3v, - image_assets: _ImageAssets, model: str, - num_crops: int, expected_toks_per_img: int, - num_imgs: int): +@pytest.mark.parametrize( + "num_crops,expected_toks_per_img,num_imgs", + [ + (4, 757, 1), + (4, 757, 2), + (16, 1921, 1), + (16, 1921, 2), + # the default num_crops of phi-3.5-vision is 4 + (None, 757, 2), + (None, 757, 2), + ]) +def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, + model: str, num_crops: Optional[int], + expected_toks_per_img: int, num_imgs: int): """Ensure input_processor_for_phi3v handles num_crops properly.""" # Same as the previous test - don't initialize mm_processor_kwargs # in this test and assume that the kwargs will be correctly expanded by @@ -163,19 +80,20 @@ def test_input_processor_override(input_processor_for_phi3v, tokenizer_name=model, trust_remote_code=True, ) - tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + ctx = InputProcessingContext(ctx.model_config, tokenizer) # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = 
f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" images = [image_assets[0].pil_image] * num_imgs - inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), - prompt=prompt, - multi_modal_data={"image": images}) + mm_data = {"image": images} + mm_processor_kwargs = {} + if num_crops is not None: + mm_processor_kwargs = {"num_crops": num_crops} - processed_inputs = input_processor_for_phi3v(ctx, - inputs, - num_crops=num_crops) + processor = processor_for_phi3v(ctx) + processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) # Ensure we have the right number of placeholders per num_crops size img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index dbb0b4d350d10..ed8f34a677f84 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -6,7 +6,6 @@ from typing import Type import pytest -import transformers from transformers import AutoModelForVision2Seq from transformers.utils import is_flash_attn_2_available @@ -35,7 +34,7 @@ "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, - "model_kwargs": {"device_map": "auto"}, + "hf_model_kwargs": {"device_map": "auto"}, "image_size_factors": [(.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", @@ -109,7 +108,7 @@ "cherry_blossom": "What is in the picture?", }), auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, @@ -152,7 +151,7 @@ "cherry_blossom": "Please infer the season with reason.", }), multi_image_prompt="Describe the two images shortly.", # noqa: E501 - postprocess_inputs=model_utils.get_key_type_post_processor("pixel_values"), + 
postprocess_inputs=model_utils.cast_dtype_post_processor("pixel_values"), stop_str=["<|im_end|>"], image_size_factors=[(0.10, 0.15)], max_tokens=64, @@ -178,7 +177,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), # For chameleon, we only compare the sequences @@ -187,12 +186,6 @@ comparator=check_outputs_equal, max_tokens=8, dtype="bfloat16", - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken in HF, see huggingface/transformers#34379" - ), - ] ), "fuyu": VLMTestInfo( models=["adept/fuyu-8b"], @@ -243,13 +236,7 @@ max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForVision2Seq, - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0" - ), - large_gpu_mark(min_gb=48), - ], + marks=[large_gpu_mark(min_gb=48)], ), "intern_vl": VLMTestInfo( models=[ @@ -294,7 +281,7 @@ prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 num_video_frames=16, max_model_len=16384, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values_videos" ), auto_cls=AutoModelForVision2Seq, @@ -318,12 +305,20 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken with changes in transformers 4.46" - ) - ], + ), + "mantis": VLMTestInfo( + models=["TIGER-Lab/Mantis-8B-siglip-llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: 
f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + max_model_len=4096, + postprocess_inputs=model_utils.cast_dtype_post_processor( + "pixel_values" + ), + vllm_runner_kwargs={"hf_overrides": {"architectures": ["MantisForConditionalGeneration"]}}, # noqa: E501 + get_stop_token_ids=lambda tok: [128009], + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output, + patch_hf_runner=model_utils.mantis_patch_hf_runner, ), "minicpmv_25": VLMTestInfo( models=["openbmb/MiniCPM-Llama3-V-2_5"], @@ -361,7 +356,7 @@ # max_num_seqs=2, # task="generate", # # use eager mode for hf runner since phi3v didn't work with flash_attn - # model_kwargs={"_attn_implementation": "eager"}, + # hf_model_kwargs={"_attn_implementation": "eager"}, # use_tokenizer_eos=True, # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output, # num_logprobs=10, @@ -392,7 +387,7 @@ prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), vllm_output_post_proc = lambda vllm_output, model: vllm_output[:2], @@ -404,10 +399,6 @@ cuda_device_count_stateless() < 2, reason="Need at least 2 GPUs to run the test.", ), - pytest.mark.skipif( - transformers.__version__ < "4.46.2", - reason="Model broken in HF, see huggingface/transformers#34379" - ) ], **COMMON_BROADCAST_SETTINGS # type: ignore ), @@ -461,7 +452,7 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - postprocess_inputs=model_utils.get_key_type_post_processor( + postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" ), auto_cls=AutoModelForVision2Seq, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 
6233860747b9c..90c0fab99054c 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -228,7 +228,7 @@ def test_model_engine(vllm_runner, model: str, dtype: str) -> None: name_1="output") -@large_gpu_test(min_gb=24) +@large_gpu_test(min_gb=48) @pytest.mark.parametrize( "prompt,expected_ranges", [(_create_engine_inputs_hf(IMG_URLS[:1]), [{ diff --git a/tests/models/decoder_only/vision_language/vlm_utils/core.py b/tests/models/decoder_only/vision_language/vlm_utils/core.py index 88349ef9a3a69..54b7b0733210f 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/core.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/core.py @@ -3,9 +3,11 @@ import torch from PIL.Image import Image -from transformers import AutoTokenizer, BatchEncoding +from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers.models.auto.auto_factory import _BaseAutoModelClass +from vllm.config import TaskOption + from .....conftest import HfRunner, VllmRunner from .types import RunnerOutput @@ -28,13 +30,15 @@ def run_test( use_tokenizer_eos: bool, postprocess_inputs: Callable[[BatchEncoding], BatchEncoding], comparator: Callable[..., None], - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]], + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]], stop_str: Optional[List[str]], tokenizer_mode: str, limit_mm_per_prompt: Dict[str, int], - model_kwargs: Optional[Dict[str, Any]], + vllm_runner_kwargs: Optional[Dict[str, Any]], + hf_model_kwargs: Optional[Dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], - task: str = "auto", + task: TaskOption = "auto", runner_mm_key: str = "images", distributed_executor_backend: Optional[str] = None, tensor_parallel_size: int = 1, @@ -58,6 +62,9 @@ def run_test( if stop_str: vllm_kwargs["stop"] = stop_str + if vllm_runner_kwargs is None: + vllm_runner_kwargs = {} 
+ with vllm_runner(model, tokenizer_mode=tokenizer_mode, max_model_len=max_model_len, @@ -67,7 +74,8 @@ def run_test( tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=enforce_eager, - task=task) as vllm_model: + task=task, + **vllm_runner_kwargs) as vllm_model: for prompts, media in vllm_inputs: vllm_kwargs[runner_mm_key] = media vllm_output = vllm_model.generate_greedy_logprobs( @@ -78,7 +86,7 @@ def run_test( dtype=dtype, auto_cls=auto_cls, postprocess_inputs=postprocess_inputs, - model_kwargs=model_kwargs) + model_kwargs=hf_model_kwargs) # Some models need to patch things like the model processor, e.g., internvl if patch_hf_runner is not None: diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 15f15dd7d8030..3eca8fb9dcb1a 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -126,6 +126,16 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput, return hf_output_ids, hf_output_str, out_logprobs +def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, + model: str) -> RunnerOutput: + """Sanitize vllm output [mantis] to compare with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|eot_id|>" + + return output_ids, hf_output_str, out_logprobs + + def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: """Sanitize vllm output [phi3v] to be comparable with hf output.""" @@ -184,7 +194,7 @@ def get_llava_embeddings(image_assets: _ImageAssets): ####### postprocessors to run on HF BatchEncoding -def get_key_type_post_processor( +def cast_dtype_post_processor( hf_inp_key: str) -> Callable[[BatchEncoding, str], BatchEncoding]: """Gets a handle to a post processor which converts a given key into a target data 
type.""" @@ -418,3 +428,26 @@ def _internvl_generate( ) return outputs + + +def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + from mantis.models.mllava import MLlavaProcessor + + hf_model.processor = MLlavaProcessor.from_pretrained(hf_model.model_name) + + orig_generate = hf_model.model.generate + tokenizer = hf_model.processor.tokenizer + + def _generate(self, *args, **kwargs): + return orig_generate( + *args, + **kwargs, + eos_token_id=[ + tokenizer.eos_token_id, + tokenizer.convert_tokens_to_ids("<|eot_id|>"), + ], + ) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index d410fa8c653ce..e2e0c6390fcb9 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -7,9 +7,11 @@ import torch from PIL.Image import Image from pytest import MarkDecorator -from transformers import AutoModelForCausalLM, AutoTokenizer, BatchEncoding +from transformers import (AutoModelForCausalLM, BatchEncoding, + PreTrainedTokenizerBase) from transformers.models.auto.auto_factory import _BaseAutoModelClass +from vllm.config import TaskOption from vllm.sequence import SampleLogprobs from vllm.utils import identity @@ -66,7 +68,7 @@ class ImageSizeWrapper(NamedTuple): class VLMTestInfo(NamedTuple): """Holds the configuration for 1+ tests for one model architecture.""" - models: Union[List[str]] + models: List[str] test_type: Union[VLMTestType, Iterable[VLMTestType]] # Should be None only if this is a CUSTOM_INPUTS test @@ -92,18 +94,20 @@ class VLMTestInfo(NamedTuple): enforce_eager: bool = True max_model_len: int = 1024 max_num_seqs: int = 256 - task: str = "auto" + task: TaskOption = "auto" tensor_parallel_size: int = 1 + vllm_runner_kwargs: Optional[Dict[str, Any]] = None # Optional callable which gets a list of 
token IDs from the model tokenizer - get_stop_token_ids: Optional[Callable[[AutoTokenizer], List[int]]] = None + get_stop_token_ids: Optional[Callable[[PreTrainedTokenizerBase], + List[int]]] = None # Optional list of strings to stop generation, useful when stop tokens are # not special tokens in the tokenizer stop_str: Optional[List[str]] = None # Exposed options for HF runner - model_kwargs: Optional[Dict[str, Any]] = None - # Indicates we should explicitly pass the EOS from the tokeniezr + hf_model_kwargs: Optional[Dict[str, Any]] = None + # Indicates we should explicitly pass the EOS from the tokenizer use_tokenizer_eos: bool = False auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM # Callable to pass to the HF runner to run on inputs; for now, we also pass @@ -164,6 +168,7 @@ def get_non_parametrized_runner_kwargs(self): "max_num_seqs": self.max_num_seqs, "task": self.task, "tensor_parallel_size": self.tensor_parallel_size, + "vllm_runner_kwargs": self.vllm_runner_kwargs, "hf_output_post_proc": self.hf_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc, "auto_cls": self.auto_cls, @@ -171,8 +176,8 @@ def get_non_parametrized_runner_kwargs(self): "postprocess_inputs": self.postprocess_inputs, "comparator": self.comparator, "get_stop_token_ids": self.get_stop_token_ids, + "hf_model_kwargs": self.hf_model_kwargs, "stop_str": self.stop_str, - "model_kwargs": self.model_kwargs, "patch_hf_runner": self.patch_hf_runner, "tokenizer_mode": self.tokenizer_mode } diff --git a/tests/models/registry.py b/tests/models/registry.py index 461f453d8b1c3..a89518820045f 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -176,6 +176,7 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": 
_HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index 2a072737db043..3b728f2744fca 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,6 @@ from unittest.mock import patch import pytest -import transformers from transformers import PretrainedConfig from vllm import LLM @@ -11,10 +10,6 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): - if (model_arch in {"Idefics3ForConditionalGeneration", "GlmForCausalLM"} - and transformers.__version__ < "4.46.0"): - pytest.skip(reason="Model introduced in HF >= 4.46.0") - model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) if not model_info.is_available_online: pytest.skip("Model is not available online") diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 13ad4a7966b9d..71832acbd17b8 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from transformers import CLIPImageProcessor, LlavaNextImageProcessor +from transformers import LlavaNextImageProcessor from vllm.config import ModelConfig from vllm.multimodal import MultiModalRegistry @@ -14,49 +14,6 @@ def mm_registry(): return MultiModalRegistry() -@pytest.mark.parametrize("dtype", ["half", "float"]) -@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) -def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): - MODEL_NAME = "llava-hf/llava-1.5-7b-hf" - - hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) - assert isinstance(hf_processor, 
CLIPImageProcessor) - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype=dtype, - revision=None, - limit_mm_per_prompt={"image": 1}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - for asset in image_assets: - image = rescale_image_size(asset.pil_image, size_factor) - - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - vllm_result = mm_registry.map_input( - model_config, - {"image": image}, - ) - - assert hf_result.keys() == vllm_result.keys() - for key, hf_tensor in hf_result.items(): - hf_arr: np.ndarray = hf_tensor.numpy() - vllm_arr: np.ndarray = vllm_result[key].numpy() - - assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" - assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" - - @pytest.mark.parametrize("dtype", ["half", "float"]) @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) def test_llava_next_image_processor(image_assets, mm_registry, dtype, @@ -107,7 +64,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, (2, 1, False), (2, 2, True)], ) def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): - MODEL_NAME = "llava-hf/llava-1.5-7b-hf" + MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" model_config = ModelConfig( model=MODEL_NAME, @@ -138,7 +95,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): # NOTE: We don't test zero images since the HF processor doesn't support it @pytest.mark.parametrize("num_images", [1, 2]) def test_image_mapper_multi(image_assets, mm_registry, num_images): - MODEL_NAME = "llava-hf/llava-1.5-7b-hf" + MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" model_config = ModelConfig( model=MODEL_NAME, diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index b2367060c6c1b..ae668d1dd56c8 100644 --- a/tests/multimodal/test_processing.py +++ 
b/tests/multimodal/test_processing.py @@ -3,50 +3,15 @@ import pytest from transformers import BatchFeature -from vllm.multimodal.processing import (PromptReplacement, find_text_matches, - find_token_matches, iter_token_matches, - iter_token_runs, replace_text_matches) +from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, + find_text_matches, find_token_matches, + iter_placeholders, iter_token_matches, + replace_text_matches, + replace_token_matches) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby -# yapf: disable -@pytest.mark.parametrize( - ("token_ids", "expected"), - [ - ([], []), - ( - [32000, 32000, 32000], - [{ "token_id": 32000, "start_idx": 0, "length": 3 }], - ), - ( - [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], - [ - { "token_id": 9833, "start_idx": 0, "length": 1 }, - { "token_id": 28747, "start_idx": 1, "length": 1 }, - { "token_id": 32000, "start_idx": 2, "length": 3 }, - { "token_id": 9833, "start_idx": 5, "length": 1 }, - { "token_id": 28747, "start_idx": 6, "length": 1 }, - { "token_id": 32000, "start_idx": 7, "length": 2 }, - { "token_id": 918, "start_idx": 9, "length": 1 }, - ], - ), - ], -) -# yapf: enable -def test_iter_token_runs(token_ids, expected): - result = list(iter_token_runs(token_ids)) - - # Only displayed on error - print("result:", result) - - # Manually constructed results - assert [item._asdict() for item in result] == expected - - # Invariants - assert sum(run_info.length for run_info in result) == len(token_ids) - - # yapf: disable @pytest.mark.parametrize( ("token_ids", "match_ids", "expected"), @@ -170,13 +135,11 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key): # Should not be used since there is nothing to convert to token IDs mock_tokenizer = cast(AnyTokenizer, object()) - result = find_token_matches( - prompt, - [ - PromptReplacement(target, [], 0).bind(key, mock_tokenizer) - for key, target in 
target_by_key.items() - ], - ) + prompt_repls = [ + PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ] + result = find_token_matches(prompt, prompt_repls) # Only displayed on error print("result:", result) @@ -279,13 +242,11 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): # Should not be used since there is nothing to convert to text mock_tokenizer = cast(AnyTokenizer, object()) - result = find_text_matches( - prompt, - [ - PromptReplacement(target, [], 0).bind(key, mock_tokenizer) - for key, target in target_by_key.items() - ], - ) + prompt_repls = [ + PromptReplacement(target, [], 0).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ] + result = find_text_matches(prompt, prompt_repls) # Only displayed on error print("result:", result) @@ -303,7 +264,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): # yapf: disable @pytest.mark.parametrize( - ("prompt", "target_by_key", "repl_by_key", "expected_by_mm_count"), + ("prompt", "target_by_key", "repl_by_key"), [ ( "Image:Image:!", @@ -322,49 +283,201 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key): # Test multiple repl_count "pattern_3": ("?", 2), }, - { - # Test no replacement - 0: "Image:Image:!", - # Test single replacement - 1: "Image:??", - # Test repeated replacement - 2: "??", - }, ), ] ) +@pytest.mark.parametrize( + ("mm_count", "expected"), + [ + (0, "Image:Image:!"), + (1, "Image:??"), + (2, "??"), + ] +) # yapf: enable def test_find_replace_text( prompt, target_by_key, repl_by_key, - expected_by_mm_count, + mm_count, + expected, ): # Should not be used since there is nothing to convert to text mock_tokenizer = cast(AnyTokenizer, object()) - matches = find_text_matches( + prompt_repls = [ + PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ] + matches = find_text_matches(prompt, prompt_repls) + + result 
= replace_text_matches( prompt, - [ - PromptReplacement(target, *repl_by_key[key]) \ - .bind(key, mock_tokenizer) - for key, target in target_by_key.items() - ], + matches, + {key: list(range(mm_count)) + for key in repl_by_key}, + BatchFeature(), ) - result_by_mm_count = { - mm_count: replace_text_matches( - prompt, - matches, - {key: list(range(mm_count)) - for key in repl_by_key}, - BatchFeature(), - ) - for mm_count in expected_by_mm_count - } # Only displayed on error print("matches:", matches) - print("result_by_mm_count:", result_by_mm_count) + print("result:", result) + + # Manually constructed results + assert result == expected + + +# yapf: disable +@pytest.mark.parametrize( + ("prompt", "target_by_key", "repl_by_key"), + [ + # Tokenized test cases of `test_find_replace_text` + # using the vocab of llava-hf/llava-v1.6-mistral-7b-hf + ( + [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], + { + # We use `` before `Image:` to test matches that + # occur out of order + "pattern_1": [32000], + "pattern_2": [9833, 28747], + "pattern_3": [918], + }, + { + # Test whether target is confused with repl_unit + "pattern_1": ([32000, 32000], 1), + # Test empty repl_unit + "pattern_2": ([], 1), + # Test multiple repl_count + "pattern_3": ([1550], 2), + }, + ), + ] +) +@pytest.mark.parametrize( + ("mm_count", "expected"), + [ + (0, [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918]), + (1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550]), + (2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550]), + ] +) +# yapf: enable +def test_find_replace_tokens( + prompt, + target_by_key, + repl_by_key, + mm_count, + expected, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) + + prompt_repls = [ + PromptReplacement(target, *repl_by_key[key]).bind(key, mock_tokenizer) + for key, target in target_by_key.items() + ] + matches = find_token_matches(prompt, prompt_repls) + + result = 
replace_token_matches( + prompt, + matches, + {key: list(range(mm_count)) + for key in repl_by_key}, + BatchFeature(), + ) + + # Only displayed on error + print("matches:", matches) + print("result:", result) + + # Manually constructed results + assert result == expected + + +# yapf: disable +@pytest.mark.parametrize( + "repl_by_key", + [ + { + "pattern_1": ([32000, 32000], 1), + "pattern_2": ([], 1), + "pattern_3": ([1550], 2), + }, + ], +) +@pytest.mark.parametrize( + ("prompt", "expected"), + [ + ( + [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], + [ + _PlaceholderInfo( + modality="pattern_1", + start_idx=6, + unit=[32000, 32000], + unit_count=1, + ), + ], + ), + ( + [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 1550], + [ + _PlaceholderInfo( + modality="pattern_1", + start_idx=1, + unit=[32000, 32000], + unit_count=1, + ), + _PlaceholderInfo( + modality="pattern_1", + start_idx=5, + unit=[32000, 32000], + unit_count=1, + ), + _PlaceholderInfo( + modality="pattern_3", + start_idx=7, + unit=[1550], + unit_count=2, + ), + ], + ), + ( + [1, 32000, 32000, 32000, 32000, 32000, 1550, 1550], + [ + _PlaceholderInfo( + modality="pattern_1", + start_idx=1, + unit=[32000, 32000], + unit_count=2, + ), + _PlaceholderInfo( + modality="pattern_3", + start_idx=6, + unit=[1550], + unit_count=2, + ), + ], + ), + ] +) +def test_iter_placeholders( + repl_by_key, + prompt, + expected, +): + # Should not be used since there is nothing to convert to tokens + mock_tokenizer = cast(AnyTokenizer, object()) + + prompt_repls = [ + PromptReplacement([], *repl).bind(key, mock_tokenizer) + for key, repl in repl_by_key.items() + ] + + result = list(iter_placeholders(prompt_repls, prompt)) + + # Only displayed on error + print("result:", result) # Manually constructed results - assert result_by_mm_count == expected_by_mm_count + assert result == expected diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 
e6c8793989e13..d141cdf1f083b 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -15,13 +15,13 @@ # Used for fast tests where the model doesn't matter DUMMY_MODEL_ID = "facebook/opt-125m" # Used for tests that need a multimodal model -MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +MULTIMODAL_MODEL_ID = "OpenGVLab/InternVL2-2B" # For mm_processor_kwargs - we test overrides by defining mocks for each place # it is used, and ensuring that we can pass processor kwargs an override value # to receive the intended result for things like sequence length etc. -DEFAULT_NUM_CROPS = 4 -NUM_CROPS_OVERRIDE = 16 +DEFAULT_MAX_DYNAMIC_PATCH = 6 +MAX_DYNAMIC_PATCH_OVERRIDE = 4 # Mocks for all of the places that we use the mm_processor_kwargs @@ -33,10 +33,11 @@ def use_processor_mock(): def custom_processor(ctx: InputContext, inputs: DecoderOnlyInputs, *, - num_crops=DEFAULT_NUM_CROPS): + max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH): # For testing purposes, we don't worry about the prompt - return token_inputs(prompt_token_ids=[], - mm_processor_kwargs={"num_crops": num_crops}) + return token_inputs( + prompt_token_ids=[], + mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch}) with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", return_value=custom_processor): @@ -52,9 +53,9 @@ def custom_dummy_data_factory(self, seq_len: int, mm_counts: Mapping[str, int], *, - num_crops=DEFAULT_NUM_CROPS): + max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH): seq_data = SequenceData( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops)) + array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * max_dynamic_patch)) return DummyData(seq_data, None) with patch( @@ -65,15 +66,15 @@ def custom_dummy_data_factory(self, # Lazy import to avoid CUDA reinitialization error def mm_model_cls(): - from vllm.model_executor.models.phi3v import Phi3VForCausalLM + from vllm.model_executor.models.internvl import InternVLChatModel - return 
Phi3VForCausalLM + return InternVLChatModel # lambda whose signature matches max token calcs extra & mapper + extra kwargs -get_num_crops = lambda ctx, *, num_crops=DEFAULT_NUM_CROPS: num_crops -custom_mapper = lambda ctx, data, *, num_crops=DEFAULT_NUM_CROPS: { - "pixel_values": torch.zeros(size=(1, num_crops + 1, 3, 336, 336)) +get_max_dynamic_patch = lambda ctx, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: max_dynamic_patch # noqa: E501 +custom_mapper = lambda ctx, data, *, max_dynamic_patch=DEFAULT_MAX_DYNAMIC_PATCH: { # noqa: E501 + "pixel_values": torch.zeros(size=(1, max_dynamic_patch + 1, 3, 448, 448)) } @@ -88,27 +89,28 @@ def test_default_processor_is_a_noop(): assert proc_inputs is proc_outputs -def _get_num_crops_info(init_num_crops: int, inference_num_crops: int): - """Get the init / inference kwargs and expected num_crops for this test.""" - # If we have a value for num_crops, pass the override value and make +def _get_max_dynamic_patch_info(init_max_dynamic_patch: int, + inference_max_dynamic_patch: int): + """Get the init / inference kwargs and expected max_dynamic_patch.""" + # If we have a value for max_dynamic_patch, pass the override value and make # sure we get that value as a return-value from out mock processor, # otherwise fall back to the default value - init_kwargs = None if init_num_crops is None else { - "num_crops": init_num_crops + init_kwargs = None if init_max_dynamic_patch is None else { + "max_dynamic_patch": init_max_dynamic_patch } - inference_kwargs = None if inference_num_crops is None else { - "num_crops": inference_num_crops + inference_kwargs = None if inference_max_dynamic_patch is None else { + "max_dynamic_patch": inference_max_dynamic_patch } - if inference_num_crops is not None: - expected_seq_count = inference_num_crops - elif init_num_crops is not None: - expected_seq_count = init_num_crops + if inference_max_dynamic_patch is not None: + expected_seq_count = inference_max_dynamic_patch + elif init_max_dynamic_patch 
is not None: + expected_seq_count = init_max_dynamic_patch else: - expected_seq_count = DEFAULT_NUM_CROPS + expected_seq_count = DEFAULT_MAX_DYNAMIC_PATCH return init_kwargs, inference_kwargs, expected_seq_count -def _get_processed_num_crops( +def _get_processed_max_dynamic_patch( processor: Callable[[ProcessorInputs], ProcessorInputs], inference_kwargs: Optional[Dict[str, int]], ) -> int: @@ -120,27 +122,30 @@ def _get_processed_num_crops( assert "type" in processed_inputs assert processed_inputs["type"] == "token" assert "mm_processor_kwargs" in processed_inputs - return processed_inputs["mm_processor_kwargs"]["num_crops"] + return processed_inputs["mm_processor_kwargs"]["max_dynamic_patch"] -@pytest.mark.parametrize("init_num_crops,inference_num_crops", [ - (None, None), - (NUM_CROPS_OVERRIDE, None), - (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE), -]) -def test_input_processor_kwargs(use_processor_mock, init_num_crops, - inference_num_crops): +@pytest.mark.parametrize( + "init_max_dynamic_patch,inference_max_dynamic_patch", [ + (None, None), + (MAX_DYNAMIC_PATCH_OVERRIDE, None), + (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE), + ]) +def test_input_processor_kwargs(use_processor_mock, init_max_dynamic_patch, + inference_max_dynamic_patch): """Ensure input processors can use processor kwargs.""" dummy_registry = InputRegistry() - init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info( - init_num_crops, inference_num_crops) + (init_kwargs, inference_kwargs, + expected_seq_count) = _get_max_dynamic_patch_info( + init_max_dynamic_patch, inference_max_dynamic_patch) ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) processor = dummy_registry.create_input_processor(ctx.model_config) - num_crops_val = _get_processed_num_crops(processor, inference_kwargs) + max_dynamic_patch_val = _get_processed_max_dynamic_patch( + processor, inference_kwargs) - assert num_crops_val == expected_seq_count + assert max_dynamic_patch_val == 
expected_seq_count @pytest.mark.parametrize( @@ -165,18 +170,21 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, processor = dummy_registry.create_input_processor(ctx.model_config) # Should filter out the inference time kwargs - num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs) - assert num_crops_val == DEFAULT_NUM_CROPS + max_dynamic_patch_val = _get_processed_max_dynamic_patch( + processor, mm_processor_kwargs) + assert max_dynamic_patch_val == DEFAULT_MAX_DYNAMIC_PATCH ### Test overrides for the dummy data -@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE]) -def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops): +@pytest.mark.parametrize("max_dynamic_patch", + [None, MAX_DYNAMIC_PATCH_OVERRIDE]) +def test_dummy_data_kwarg_overrides(use_dummy_data_mock, max_dynamic_patch): """Ensure dummy data factories can use processor kwargs.""" - mm_processor_kwargs = None if num_crops is None else { - "num_crops": num_crops + mm_processor_kwargs = None if max_dynamic_patch is None else { + "max_dynamic_patch": max_dynamic_patch } - expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops + expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH + if max_dynamic_patch is None else max_dynamic_patch) dummy_registry = InputRegistry() ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=mm_processor_kwargs) @@ -217,17 +225,20 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock, # len is solely dependent on the value of the mm_processor_kwargs. 
dummy_data = dummy_registry.dummy_data_for_profiling( ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS + assert len( + dummy_data.seq_data.prompt_token_ids) == DEFAULT_MAX_DYNAMIC_PATCH ### Test overrides for the max token count per multimodal instance -@pytest.mark.parametrize("num_crops", [None, NUM_CROPS_OVERRIDE]) -def test_max_tokens_kwarg_overrides(num_crops): +@pytest.mark.parametrize("max_dynamic_patch", + [None, MAX_DYNAMIC_PATCH_OVERRIDE]) +def test_max_tokens_kwarg_overrides(max_dynamic_patch): """Ensure max token calcs can use processor kwargs.""" - mm_processor_kwargs = None if num_crops is None else { - "num_crops": num_crops + mm_processor_kwargs = None if max_dynamic_patch is None else { + "max_dynamic_patch": max_dynamic_patch } - expected_seq_count = DEFAULT_NUM_CROPS if num_crops is None else num_crops + expected_seq_count = (DEFAULT_MAX_DYNAMIC_PATCH + if max_dynamic_patch is None else max_dynamic_patch) ctx = build_model_context(MULTIMODAL_MODEL_ID, task="generate", @@ -239,11 +250,11 @@ def test_max_tokens_kwarg_overrides(num_crops): mm_registry.init_mm_limits_per_prompt(ctx.model_config) # Patch the image registry for phi3v with our lambda that is compatible # with overrides, then ensure that calling the method correctly echos - # our num_crops value back from the mm_processor_kwargs. + # our max_dynamic_patch value back from the mm_processor_kwargs. 
with patch.object( mm_registry._get_plugin("image"), "_max_mm_tokens", - {mm_model_cls(): get_num_crops}, + {mm_model_cls(): get_max_dynamic_patch}, ): max_multimodal_tokens = mm_registry.get_max_multimodal_tokens( ctx.model_config) @@ -279,26 +290,29 @@ def test_max_tokens_with_sad_kwarg_overrides(mm_processor_kwargs): with patch.object( mm_registry._get_plugin("image"), "_max_mm_tokens", - {mm_model_cls(): get_num_crops}, + {mm_model_cls(): get_max_dynamic_patch}, ): max_multimodal_tokens = mm_registry.get_max_multimodal_tokens( ctx.model_config) - assert max_multimodal_tokens == DEFAULT_NUM_CROPS + assert max_multimodal_tokens == DEFAULT_MAX_DYNAMIC_PATCH ### Test overrides for the mapper -@pytest.mark.parametrize("num_crops", [DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE]) -def test_default_mapper_with_processor_kwargs(image_assets, num_crops): +@pytest.mark.parametrize( + "max_dynamic_patch", + [DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE]) +def test_default_mapper_with_processor_kwargs(image_assets, max_dynamic_patch): """Ensure that the mapper processor kwargs can fall back to HF models.""" # NOTE - we don't validate bad inputs for the default mapper, because it's # through the automodel interface in transformers, so we can't easily # inspect what kwargs are or are not allowed. 
- ctx = build_model_context(MULTIMODAL_MODEL_ID, - task="generate", - trust_remote_code=True, - mm_processor_kwargs={"num_crops": num_crops}, - limit_mm_per_prompt={"image": 1}) + ctx = build_model_context( + MULTIMODAL_MODEL_ID, + task="generate", + trust_remote_code=True, + mm_processor_kwargs={"max_dynamic_patch": max_dynamic_patch}, + limit_mm_per_prompt={"image": 1}) mm_registry = MultiModalRegistry() mm_registry.init_mm_limits_per_prompt(ctx.model_config) @@ -307,20 +321,22 @@ def test_default_mapper_with_processor_kwargs(image_assets, num_crops): mm_inputs = {"image": image} mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs) - # Phi3v pixel vals should have shape: [batch, num_crops+1, 3, 336, 336] - assert mapped_inputs["pixel_values"].shape[1] == num_crops + 1 + # pixel vals should have shape: [batch, max_dynamic_patch+1, ...] + assert mapped_inputs["pixel_values"].shape[1] == max_dynamic_patch + 1 -@pytest.mark.parametrize("init_num_crops,inference_num_crops", [ - (None, None), - (NUM_CROPS_OVERRIDE, None), - (DEFAULT_NUM_CROPS, NUM_CROPS_OVERRIDE), -]) -def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops, - inference_num_crops): +@pytest.mark.parametrize( + "init_max_dynamic_patch,inference_max_dynamic_patch", [ + (None, None), + (MAX_DYNAMIC_PATCH_OVERRIDE, None), + (DEFAULT_MAX_DYNAMIC_PATCH, MAX_DYNAMIC_PATCH_OVERRIDE), + ]) +def test_custom_mapper_kwarg_overrides(image_assets, init_max_dynamic_patch, + inference_max_dynamic_patch): """Ensure custom mappers can use processor kwargs.""" - init_kwargs, inference_kwargs, expected_seq_count = _get_num_crops_info( - init_num_crops, inference_num_crops) + (init_kwargs, inference_kwargs, + expected_seq_count) = _get_max_dynamic_patch_info( + init_max_dynamic_patch, inference_max_dynamic_patch) ctx = build_model_context(MULTIMODAL_MODEL_ID, task="generate", @@ -335,7 +351,7 @@ def test_custom_mapper_kwarg_overrides(image_assets, init_num_crops, # Patch the image registry for 
phi3v with our lambda that is compatible # with overrides, then ensure that calling the method correctly echos - # our num_crops value back from the mm_processor_kwargs. + # our max_dynamic_patch value back from the mm_processor_kwargs. mm_registry._get_plugin("image").register_input_mapper(custom_mapper)( mm_model_cls()) mapped_inputs = mm_registry.map_input(ctx.model_config, mm_inputs, @@ -373,11 +389,12 @@ def test_custom_mapper_with_sad_kwarg_overrides(image_assets, # Patch the image registry for phi3v with our lambda that is compatible # with overrides, then ensure that calling the method correctly echos - # our num_crops value back from the mm_processor_kwargs. + # our max_dynamic_patch value back from the mm_processor_kwargs. mm_registry._get_plugin("image").register_input_mapper(custom_mapper)( mm_model_cls()) # Should filter out the inference time kwargs mapped_inputs = mm_registry.map_input( ctx.model_config, mm_inputs, mm_processor_kwargs=mm_processor_kwargs) - assert mapped_inputs["pixel_values"].shape[1] == DEFAULT_NUM_CROPS + 1 + assert mapped_inputs["pixel_values"].shape[1] == ( + DEFAULT_MAX_DYNAMIC_PATCH + 1) diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 3ebd7864b8fc8..2f4194a63fc25 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -2,19 +2,15 @@ import torch -from vllm.inputs import INPUT_REGISTRY from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - dummy_data_for_llava, - get_max_llava_image_tokens, - input_processor_for_llava) + LlavaProcessor, + get_max_llava_image_tokens) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -@MULTIMODAL_REGISTRY.register_image_input_mapper() 
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava) +@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor) class MyLlava(LlavaForConditionalGeneration): def compute_logits( diff --git a/tests/test_lazy_torch_compile.py b/tests/standalone_tests/lazy_torch_compile.py similarity index 100% rename from tests/test_lazy_torch_compile.py rename to tests/standalone_tests/lazy_torch_compile.py diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh new file mode 100644 index 0000000000000..f00895c0997f1 --- /dev/null +++ b/tests/standalone_tests/python_only_compile.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# This script tests if the python only compilation works correctly +# for users who do not have any compilers installed on their system + +set -e +set -x + +cd /vllm-workspace/ + +# uninstall vllm +pip3 uninstall -y vllm +# restore the original files +mv test_docs/vllm ./vllm + +# remove all compilers +apt remove --purge build-essential -y +apt autoremove -y + +echo 'import os; os.system("touch /tmp/changed.file")' >> vllm/__init__.py + +VLLM_USE_PRECOMPILED=1 pip3 install -vvv -e . + +# Run the script +python3 -c 'import vllm' + +# Check if the clangd log file was created +if [ ! 
-f /tmp/changed.file ]; then + echo "changed.file was not created, python only compilation failed" + exit 1 +fi diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index bd11ff1877064..fef44ac29c41f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -27,9 +27,8 @@ def make_request() -> EngineCoreRequest: request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, - mm_data=None, + mm_inputs=None, mm_placeholders=None, - mm_processor_kwargs=None, sampling_params=SamplingParams(), eos_token_id=None, arrival_time=time.time(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 582192196aaf9..4e003a25e91d2 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -29,9 +29,8 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, - mm_data=None, + mm_inputs=None, mm_placeholders=None, - mm_processor_kwargs=None, sampling_params=params, eos_token_id=None, arrival_time=time.time(), diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index 9e166ae64dbfb..5289c91f201cd 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -4,12 +4,12 @@ import pytest import torch +from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner import _get_graph_batch_size BATCH_SIZES = [1, 4, 16, 64, 256] @@ -548,7 +548,7 @@ def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group): # 
With CUDA Graph capture and replay enabled, the decoder and encoder # input sequences will be padded. Create the expected padded tensors # accordingly. - graph_batch_size = _get_graph_batch_size(expanded_batch_size) + graph_batch_size = VllmConfig.get_graph_batch_size(expanded_batch_size) cuda_graph_pad_size = graph_batch_size - expanded_batch_size padded_seq_lens = seq_lens + list(itertools.repeat(1, cuda_graph_pad_size)) padded_encoder_seq_lens = encoder_seq_lens + list( diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index 433a9b30ba57a..4055524f3e0c7 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -3,13 +3,14 @@ import pytest import torch +from vllm.config import VllmConfig from vllm.distributed.parallel_state import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata from vllm.utils import get_open_port -from vllm.worker.model_runner import ModelRunner, _get_graph_batch_size +from vllm.worker.model_runner import ModelRunner def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: @@ -176,7 +177,7 @@ def test_prepare_decode_cuda_graph(batch_size): model_input.attn_metadata, model_input.attn_metadata.slot_mapping) assert len(slot_mapping) == len(input_tokens) - expected_bs = _get_graph_batch_size(len(seq_group_metadata_list)) + expected_bs = VllmConfig.get_graph_batch_size(len(seq_group_metadata_list)) # Verify input metadata is correct for prompts. 
device = model_runner.device assert attn_metadata.num_prefills == 0 diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 5471eec881d85..b3065495ab396 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -111,8 +111,16 @@ def __init__( self.matmul_qk = Matmul() self.softmax = Softmax() self.matmul_av = Matmul() + self.batch2block_matmul = Matmul() + self.block2batch_matmul = Matmul() + # NOTE(kzawora): Contiguous PA is off until model runner supports it self.k_cache = VLLMKVCache() + self.k_cache.use_contiguous_pa = False self.v_cache = VLLMKVCache() + self.v_cache.use_contiguous_pa = False + # NOTE(kzawora): Pipelined PA is off until model runner supports it + ops.pa_impl = ops.pa + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads self.sliding_window = sliding_window self.alibi_slopes = alibi_slopes @@ -229,9 +237,12 @@ def forward( block_mapping=attn_metadata.block_mapping, block_bias=attn_metadata.attn_bias, block_scales=attn_metadata.block_scales, + block_groups=None, scale=self.scale, matmul_qk_op=self.matmul_qk, matmul_av_op=self.matmul_av, + batch2block_matmul_op=self.batch2block_matmul, + block2batch_matmul_op=self.block2batch_matmul, keys_fetch_func=self.k_cache.fetch_from_cache, values_fetch_func=self.v_cache.fetch_from_cache) # Reshape the output tensor. 
diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 29a38285b6750..f191d44ff9bc5 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -3,6 +3,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend @@ -169,6 +170,68 @@ def extra_repr(self) -> str: return s +class MultiHeadAttention(nn.Module): + """Multi-headed attention without any cache, used for ViT.""" + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: Optional[int] = None, + ): + super().__init__() + self.num_heads = num_heads + self.head_size = head_size + self.scale = scale + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + + dtype = torch.get_default_dtype() + attn_backend = get_attn_backend(head_size, + dtype, + kv_cache_dtype=None, + block_size=16, + is_attention_free=False) + if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: + attn_backend = _Backend.XFORMERS + + self.attn_backend = attn_backend if attn_backend in { + _Backend.TORCH_SDPA, _Backend.XFORMERS + } else _Backend.TORCH_SDPA + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + ) -> torch.Tensor: + """Input shape: batch_size x seq_len x hidden_size""" + # TODO(Isotr0py): Use existing backend implementations and support FA2 + bsz, q_len, _ = query.size() + kv_len = key.size(1) + + query = query.view(bsz, q_len, self.num_heads, self.head_size) + key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) + value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + + if self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + + out = xops.memory_efficient_attention_forward(query, + key, + value, + scale=self.scale) + elif self.attn_backend == _Backend.TORCH_SDPA: + query, key, value = (x.transpose(1, 2) + 
for x in (query, key, value)) + out = F.scaled_dot_product_attention(query, + key, + value, + scale=self.scale) + out = out.transpose(1, 2) + return out.view(bsz, q_len, -1) + + def unified_attention( query: torch.Tensor, key: torch.Tensor, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 464bc2af8fd6d..f002a8ff905b1 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,5 +1,6 @@ import copy import dataclasses +import time from contextlib import ExitStack from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch @@ -14,6 +15,7 @@ from .counter import compilation_counter from .inductor_pass import InductorPass +from .monitor import end_monitoring_torch_compile from .pass_manager import PostGradPassManager logger = init_logger(__name__) @@ -22,31 +24,54 @@ def wrap_inductor(graph, example_inputs, additional_inductor_config, - do_logging=False, + compilation_config: CompilationConfig, + graph_index: int = 0, + num_graphs: int = 1, runtime_shape: Optional[int] = None, use_inductor: bool = True): + if graph_index == 0: + # before compiling the first graph, record the start time + global compilation_start_time + compilation_start_time = time.time() + if not use_inductor: return graph compilation_counter.num_inductor_compilations += 1 - if do_logging: - if runtime_shape is None: - logger.info("Compiling a graph for general shape") - else: - logger.info("Compiling a graph for shape %s", runtime_shape) - from torch._inductor import config - current_config = config.shallow_copy_dict() + current_config = config.get_config_copy() from torch._inductor.compile_fx import compile_fx if additional_inductor_config is not None: current_config.update(additional_inductor_config) + if isinstance(runtime_shape, int): + # for a specific batchsize, tuning triton kernel parameters + # can be beneficial + current_config["max_autotune"] = True + 
current_config["coordinate_descent_tuning"] = True + # inductor can inplace modify the graph, so we need to copy it # see https://github.com/pytorch/pytorch/issues/138980 graph = copy.deepcopy(graph) - return compile_fx(graph, example_inputs, config_patches=current_config) + compiled_graph = compile_fx(graph, + example_inputs, + config_patches=current_config) + + # after compiling the last graph, record the end time + if graph_index == num_graphs - 1: + now = time.time() + elapsed = now - compilation_start_time + compilation_config.compilation_time += elapsed + if runtime_shape is None: + logger.info("Compiling a graph for general shape takes %.2f s", + elapsed) + else: + logger.info("Compiling a graph for shape %s takes %.2f s", + runtime_shape, elapsed) + + return compiled_graph @dataclasses.dataclass @@ -108,6 +133,8 @@ def split_graph(graph: fx.GraphModule, # we share the global graph pool among all the backends global_graph_pool = None +compilation_start_time = 0.0 + class PiecewiseCompileInterpreter(torch.fx.Interpreter): """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`. 
@@ -151,12 +178,15 @@ def call_module(self, target: torch.fx.node.Target, sym_shape_indices = [ i for i, x in enumerate(args) if isinstance(x, torch.SymInt) ] + global compilation_start_time compiled_graph_for_general_shape = wrap_inductor( submod, args, self.compilation_configs.inductor_compile_config, + self.compilation_configs, + graph_index=index, + num_graphs=len(self.compile_submod_names), runtime_shape=None, - do_logging=index == 0, use_inductor=self.compilation_configs.use_inductor) self.module.__dict__[target] = PiecewiseBackend( @@ -235,17 +265,19 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + # when dynamo calls the backend, it means the bytecode + # transform and analysis are done compilation_counter.num_graphs_seen += 1 + from .monitor import torch_compile_start_time + dynamo_time = time.time() - torch_compile_start_time + logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time) + self.compilation_configs.compilation_time += dynamo_time # we control the compilation process, each instance can only be # called once assert not self._called, "VllmBackend can only be called once" self.graph = graph - # config is updated now, because only here can - # we get the sizes to capture for cudagraph - # from compilation context - self.compilation_configs.init_during_runtime() self.configure_post_pass() self.split_gm, self.piecewise_graphs = split_graph( @@ -377,6 +409,8 @@ def __init__(self, graph: fx.GraphModule, # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} + self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union( + self.capture_sizes) for shape in self.compile_sizes.union(self.capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, @@ -387,6 +421,9 @@ def __init__(self, graph: fx.GraphModule, def __call__(self, *args) -> Any: if not 
self.first_run_finished: self.first_run_finished = True + # no specific sizes to compile + if self.is_last_graph and not self.to_be_compiled_sizes: + end_monitoring_torch_compile(self.compilation_configs) return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -401,15 +438,22 @@ def __call__(self, *args) -> Any: if entry.need_to_compile and not entry.compiled: entry.compiled = True + self.to_be_compiled_sizes.remove(runtime_shape) # args are real arguments entry.runnable = wrap_inductor( self.graph, args, self.compilation_configs.inductor_compile_config, + self.compilation_configs, + graph_index=self.piecewise_compile_index, + num_graphs=self.total_piecewise_compiles, runtime_shape=runtime_shape, - do_logging=self.is_first_graph, use_inductor=self.compilation_configs.use_inductor) + # finished compilations for all required shapes + if self.is_last_graph and not self.to_be_compiled_sizes: + end_monitoring_torch_compile(self.compilation_configs) + if not entry.use_cudagraph: return entry.runnable(*args) diff --git a/vllm/compilation/compile_context.py b/vllm/compilation/compile_context.py deleted file mode 100644 index 29db3d4c637b9..0000000000000 --- a/vllm/compilation/compile_context.py +++ /dev/null @@ -1,23 +0,0 @@ -from contextlib import contextmanager -from typing import Any - -_compile_context: Any = None - - -def get_compile_context() -> Any: - """Get the current compile context.""" - return _compile_context - - -@contextmanager -def set_compile_context(context: Any): - """A context manager that stores the current compile context, - usually it is a list of sizes to specialize. 
- """ - global _compile_context - prev_context = _compile_context - _compile_context = context - try: - yield - finally: - _compile_context = prev_context diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 8700243c9d904..938430fe2a501 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -11,6 +11,8 @@ from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo +from .monitor import start_monitoring_torch_compile + logger = init_logger(__name__) _T = TypeVar("_T", bound=type[nn.Module]) @@ -143,6 +145,7 @@ def _support_torch_compile( def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + self.vllm_config = vllm_config # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. self.do_not_compile = \ @@ -181,6 +184,8 @@ def __call__(self, *args, **kwargs): raise ValueError( "Unsupported dynamic dimensions" f" {dims} for argument {k} with type {type(arg)}.") + # here, it is the starting point of the `torch.compile` process + start_monitoring_torch_compile(self.vllm_config.compilation_config) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/monitor.py b/vllm/compilation/monitor.py new file mode 100644 index 0000000000000..3348674b09af2 --- /dev/null +++ b/vllm/compilation/monitor.py @@ -0,0 +1,19 @@ +import time + +from vllm.config import CompilationConfig, CompilationLevel +from vllm.logger import init_logger + +logger = init_logger(__name__) + +torch_compile_start_time: float = 0.0 + + +def start_monitoring_torch_compile(compilation_config: CompilationConfig): + global torch_compile_start_time + torch_compile_start_time = time.time() + + +def end_monitoring_torch_compile(compilation_config: CompilationConfig): + 
if compilation_config.level == CompilationLevel.PIECEWISE: + logger.info("torch.compile takes %.2f s in total", + compilation_config.compilation_time) diff --git a/vllm/config.py b/vllm/config.py index fe25ea9f76038..7bbb21333beaf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,3 +1,4 @@ +import ast import copy import enum import hashlib @@ -27,7 +28,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_mi250, is_navi, print_warning_once, + is_mi250, is_navi, print_warning_once, random_uuid, resolve_obj_by_qualname) if TYPE_CHECKING: @@ -511,13 +512,12 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid - if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"): + if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( - "Async output processing is only supported for CUDA, TPU, XPU " - "and HPU." - "Disabling it for other platforms.") + "Async output processing is not supported on the " + "current platform type %s.", current_platform.device_type) self.use_async_output_proc = False return @@ -527,22 +527,12 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if device_config.device_type == "cuda" and self.enforce_eager: - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. 
Since, enforce-eager is enabled, async output " - "processor cannot be used") - self.use_async_output_proc = not self.enforce_eager - return - # Async postprocessor is not necessary with embedding mode # since there is no token generation if self.task == "embedding": self.use_async_output_proc = False - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -933,7 +923,9 @@ def __post_init__(self): if isinstance(model_loader_extra_config, str): self.model_loader_extra_config = json.loads( model_loader_extra_config) - self._verify_load_format() + if isinstance(self.load_format, str): + load_format = self.load_format.lower() + self.load_format = LoadFormat(load_format) if self.ignore_patterns is not None and len(self.ignore_patterns) > 0: logger.info( @@ -942,25 +934,6 @@ def __post_init__(self): else: self.ignore_patterns = ["original/**/*"] - def _verify_load_format(self) -> None: - if not isinstance(self.load_format, str): - return - - load_format = self.load_format.lower() - self.load_format = LoadFormat(load_format) - - rocm_not_supported_load_format: List[str] = [] - if current_platform.is_rocm( - ) and load_format in rocm_not_supported_load_format: - rocm_supported_load_format = [ - f for f in LoadFormat.__members__ - if (f not in rocm_not_supported_load_format) - ] - raise ValueError( - f"load format '{load_format}' is not supported in ROCm. 
" - f"Supported load formats are " - f"{rocm_supported_load_format}") - @dataclass class ParallelConfig: @@ -1748,7 +1721,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: raise ValueError("LoRA is not supported with chunked prefill yet.") @@ -1816,15 +1789,15 @@ class PoolerConfig: step_tag_id: Optional[int] = None """ - If set, only the score corresponding to the ``step_tag_id`` in the + If set, only the score corresponding to the ``step_tag_id`` in the generated sentence should be returned. Otherwise, the scores for all tokens are returned. """ returned_token_ids: Optional[List[int]] = None """ - A list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of ``good_token`` and ``bad_token`` in the + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the ``math-shepherd-mistral-7b-prm`` model. """ @@ -2058,11 +2031,12 @@ def get_served_model_name(model: str, class DecodingConfig: """Dataclass which contains the decoding strategy of the engine""" - # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer' - guided_decoding_backend: str = 'outlines' + # Which guided decoding algo to use. 
+ # 'outlines' / 'lm-format-enforcer' / 'xgrammar' + guided_decoding_backend: str = 'xgrammar' def __post_init__(self): - valid_guided_backends = ['outlines', 'lm-format-enforcer'] + valid_guided_backends = ['outlines', 'lm-format-enforcer', 'xgrammar'] backend = self.guided_decoding_backend if backend not in valid_guided_backends: raise ValueError(f"Invalid guided_decoding_backend '{backend}," @@ -2125,7 +2099,7 @@ class KVTransferConfig(BaseModel): @classmethod def from_cli(cls, cli_value: str) -> "KVTransferConfig": - """Parse the CLI value for the compilation config.""" + """Parse the CLI value for the kv cache transfer config.""" return KVTransferConfig.model_validate_json(cli_value) def model_post_init(self, __context: Any) -> None: @@ -2218,8 +2192,8 @@ class CompilationConfig(BaseModel): TODO: move outside cudagraph logic into compilation. torch.compile will handle cudagraph capture logic in the future. - cudagraph_capture_sizes: sizes to capture cudagraph. - - None: capture sizes are inferred from compilation context. - - List[int]: capture sizes are specified. + - None (default): capture sizes are inferred from vllm config. + - List[int]: capture sizes are specified as given. - cudagraph_num_of_warmups: number of warmup runs for cudagraph. It means the first several runs will be treated as warmup runs. Only after that, the execution will be recorded, and the recorded @@ -2233,14 +2207,10 @@ class CompilationConfig(BaseModel): - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. - True: inductor compilation is used. one graph for symbolic shape - is compiled. In addition, compile for different sizes specified - in inductor_compile_sizes, using configurations + is compiled. In addition, compile for cudagraph sizes that are + in candidate_compile_sizes, using configurations in inductor_compile_config. - - inductor_compile_sizes: sizes to compile for inductor. 
- - inductor_specialize_for_cudagraph_no_more_than: an optional integer - to specialize inductor for cudagraph sizes no more than the - specified size. It is useful when we want to specialize inductor - with a subset of cudagraph sizes. + - candidate_compile_sizes: sizes to compile for inductor. - inductor_compile_config: additional configurations for inductor. - None: use default configurations. - inductor_passes: additional passes for inductor. It is a dictionary @@ -2249,7 +2219,7 @@ class CompilationConfig(BaseModel): from Python, functions can also be passed directly via Python object constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` - custom inductor passes: see PassConfig for more details - + Why we have different sizes for cudagraph and inductor: - cudagraph: a cudagraph captured for a specific size can only be used for the same size. We need to capture all the sizes we want to use. @@ -2269,8 +2239,7 @@ class CompilationConfig(BaseModel): ]) use_inductor: bool = True - inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None - inductor_compile_sizes: Optional[List[int]] = Field(default=None) + candidate_compile_sizes: Optional[List[int]] = Field(default=None) inductor_compile_config: Dict = Field(default_factory=dict) inductor_passes: Dict[str, str] = Field(default_factory=dict) @@ -2324,6 +2293,7 @@ def model_post_init(self, __context: Any) -> None: # keep track of enabled and disabled custom ops enabled_custom_ops: Counter[str] = PrivateAttr disabled_custom_ops: Counter[str] = PrivateAttr + compilation_time: float = PrivateAttr # Per-model forward context # Mainly used to store attention cls @@ -2335,7 +2305,9 @@ def from_cli(cls, cli_value: str) -> "CompilationConfig": """Parse the CLI value for the compilation config.""" if cli_value in ["0", "1", "2", "3"]: return cls(level=int(cli_value)) - return CompilationConfig.model_validate_json(cli_value) + # do not use `eval`, it is dangerous and can execute arbitrary code + 
dict_value = ast.literal_eval(cli_value) + return CompilationConfig.model_validate(dict_value) def model_post_init(self, __context: Any) -> None: @@ -2362,6 +2334,7 @@ def model_post_init(self, __context: Any) -> None: self.enabled_custom_ops = Counter() self.disabled_custom_ops = Counter() self.static_forward_context = {} + self.compilation_time = 0.0 def init_backend(self) -> Union[str, Callable]: if self.level == CompilationLevel.NO_COMPILATION: @@ -2384,15 +2357,10 @@ def init_backend(self) -> Union[str, Callable]: from vllm.compilation.backends import VllmBackend return VllmBackend(self) - def init_during_runtime(self): + def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): """To complete the initialization of config, - we need to know the compile context, which is only available - during the first run of the model. - """ - from vllm.compilation.compile_context import get_compile_context - context = get_compile_context() - context = copy.deepcopy(context) if context is not None else [] - sizes_to_specialize: List[int] = context + we need to know the cudagraph sizes.""" + if self.cudagraph_capture_sizes is None: self.capture_sizes = sizes_to_specialize else: @@ -2400,18 +2368,35 @@ def init_during_runtime(self): logger.info(("cudagraph sizes specified by model runner" " %s is overridden by config %s"), sizes_to_specialize, self.cudagraph_capture_sizes) - if self.inductor_specialize_for_cudagraph_no_more_than is not None: - assert self.inductor_compile_sizes is None, ( - "inductor_compile_sizes should be None when " - "inductor_specialize_for_cudagraph_no_more_than is not None") - self.compile_sizes = [ - x for x in self.capture_sizes - if x <= self.inductor_specialize_for_cudagraph_no_more_than - ] - else: - if self.inductor_compile_sizes is None: - self.inductor_compile_sizes = [] - self.compile_sizes = self.inductor_compile_sizes + + if self.candidate_compile_sizes is None: + self.candidate_compile_sizes = [] + self.compile_sizes = [ + x for x 
in self.candidate_compile_sizes if x in self.capture_sizes + ] + ignored_sizes = [ + x for x in self.candidate_compile_sizes + if x not in self.capture_sizes + ] + if ignored_sizes: + logger.warning(("candidate_compile_sizes %s are ignored " + "because they are not cudagraph capture sizes."), + ignored_sizes) + + # sort to make sure cudagraph capture sizes are in descending order + self.capture_sizes.sort(reverse=True) + + +_BATCH_SIZE_ALIGNMENT = 8 +# all the token sizes that **can** be captured by cudagraph. +# they can be arbitrarily large. +# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. +# the actual sizes to capture will be determined by the model, +# depending on the model's max_num_seqs. +# NOTE: get_graph_batch_size needs to be updated if this list is changed. +_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ + _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) +] @dataclass @@ -2439,6 +2424,42 @@ class VllmConfig: init=True) # type: ignore kv_transfer_config: KVTransferConfig = field(default=None, init=True) # type: ignore + instance_id: str = "" + + @staticmethod + def get_graph_batch_size(batch_size: int) -> int: + """Returns the padded batch size given actual batch size. + + Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, + 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... + """ + if batch_size <= 2: + return batch_size + elif batch_size <= 4: + return 4 + else: + return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // + _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) + + @staticmethod + def get_max_graph_batch_size(max_num_seqs: int) -> int: + """ + max_num_seqs: Maximum number of sequences in a batch. + _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. + + pad the max_num_seqs if necessary by calling get_graph_batch_size, + which will deal with some edge cases like 1, 2, 4. + + if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded + size. 
if not, it means the padded size is larger than the largest size + in _BATCH_SIZES_TO_CAPTURE, return the largest size in + _BATCH_SIZES_TO_CAPTURE. + """ + padded_size = VllmConfig.get_graph_batch_size(max_num_seqs) + if padded_size in _BATCH_SIZES_TO_CAPTURE: + return padded_size + assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] + return _BATCH_SIZES_TO_CAPTURE[-1] @staticmethod def _get_quantization_config( @@ -2468,7 +2489,15 @@ def _get_quantization_config( return quant_config return None - def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig": + def with_hf_config( + self, + hf_config: PretrainedConfig, + architectures: Optional[list[str]] = None, + ) -> "VllmConfig": + if architectures is not None: + hf_config = copy.deepcopy(hf_config) + hf_config.architectures = architectures + model_config = copy.deepcopy(self.model_config) model_config.hf_config = hf_config @@ -2519,10 +2548,33 @@ def __post_init__(self): self.compilation_config.custom_ops = ["none"] self.compilation_config.use_cudagraph = True self.compilation_config.use_inductor = True + self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_reshape = False self.compilation_config.level = CompilationLevel.PIECEWISE + if not envs.VLLM_USE_V1: + max_batchsize_to_capture = 0 + if self.scheduler_config is not None and \ + self.model_config is not None and \ + not self.model_config.enforce_eager: + max_batchsize_to_capture = \ + self.get_max_graph_batch_size( + self.scheduler_config.max_num_seqs) + batch_size_capture_list = [ + size for size in _BATCH_SIZES_TO_CAPTURE + if size <= max_batchsize_to_capture + ] + else: + batch_size_capture_list = [] + if self.model_config is not None and \ + not self.model_config.enforce_eager: + batch_size_capture_list = [1, 2, 4 + ] + [i for i in range(8, 513, 8)] + + self.compilation_config.init_with_cudagraph_sizes( + batch_size_capture_list) + if 
self.cache_config is not None and \ self.cache_config.cpu_offload_gb > 0 and \ self.compilation_config.level != CompilationLevel.NO_COMPILATION: @@ -2539,46 +2591,44 @@ def __post_init__(self): current_platform.check_and_update_config(self) + if not self.instance_id: + self.instance_id = random_uuid()[:5] + def __str__(self): - return ("model=%r, speculative_config=%r, tokenizer=%r, " - "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, tokenizer_revision=%s, " - "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " - "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " - "pipeline_parallel_size=%d, " - "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " - "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, observability_config=%r, " - "seed=%d, served_model_name=%s, " - "num_scheduler_steps=%d, enable_prefix_caching=%s, " - "use_async_output_proc=%s, mm_processor_kwargs=%s") % \ - (self.model_config.model, self.speculative_config, - self.model_config.tokenizer, - self.model_config.skip_tokenizer_init, - self.model_config.tokenizer_mode, - self.model_config.revision, - self.model_config.override_neuron_config, - self.model_config.tokenizer_revision, - self.model_config.trust_remote_code, - self.model_config.dtype, - self.model_config.max_model_len, - self.load_config.download_dir, - self.load_config.load_format, - self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size, - self.parallel_config.disable_custom_all_reduce, - self.model_config.quantization, - self.model_config.enforce_eager, - self.cache_config.cache_dtype, - self.model_config.quantization_param_path, - self.device_config.device, self.decoding_config, - self.observability_config, self.model_config.seed, - self.model_config.served_model_name, - self.scheduler_config.num_scheduler_steps, - self.cache_config.enable_prefix_caching, - self.model_config.use_async_output_proc, 
- self.model_config.mm_processor_kwargs) + return ( + f"model={self.model_config.model!r}," + f" speculative_config={self.speculative_config!r}," + f" tokenizer={self.model_config.tokenizer!r}, " + f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}," + f" tokenizer_mode={self.model_config.tokenizer_mode}, " + f"revision={self.model_config.revision}, " + f"override_neuron_config={self.model_config.override_neuron_config}," + f" tokenizer_revision={self.model_config.tokenizer_revision}, " + f"trust_remote_code={self.model_config.trust_remote_code}, " + f"dtype={self.model_config.dtype}, " + f"max_seq_len={self.model_config.max_model_len}," + f" download_dir={self.load_config.download_dir!r}, " + f"load_format={self.load_config.load_format}, " + f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}," + f" pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa + f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa + f"quantization={self.model_config.quantization}, " + f"enforce_eager={self.model_config.enforce_eager}, " + f"kv_cache_dtype={self.cache_config.cache_dtype}, " + f"quantization_param_path={self.model_config.quantization_param_path}," + f" device_config={self.device_config.device}, " + f"decoding_config={self.decoding_config!r}, " + f"observability_config={self.observability_config!r}, " + f"seed={self.model_config.seed}, " + f"served_model_name={self.model_config.served_model_name}, " + f"num_scheduler_steps={self.scheduler_config.num_scheduler_steps}, " + f"multi_step_stream_outputs={self.scheduler_config.multi_step_stream_outputs}, " # noqa + f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " + f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa + f"use_async_output_proc={self.model_config.use_async_output_proc}, " + f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, " + 
f"pooler_config={self.model_config.pooler_config!r}," + f" compilation_config={self.compilation_config!r}") _current_vllm_config: Optional[VllmConfig] = None diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index d4e3f81747038..a6800f93f167b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -197,6 +197,25 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None): ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) + def broadcast(self, tensor: torch.Tensor, src: int, stream=None): + if self.disabled: + return + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = self.stream + if src == self.rank: + sendbuff = buffer_type(tensor.data_ptr()) + # NCCL requires the sender also to have a receive buffer + recvbuff = buffer_type(tensor.data_ptr()) + else: + sendbuff = buffer_type() + recvbuff = buffer_type(tensor.data_ptr()) + self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + self.comm, cudaStream_t(stream.cuda_stream)) + @contextmanager def change_state(self, enable: Optional[bool] = None, diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index ff88f72470b27..7dea61b6a09f1 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -189,6 +189,15 @@ class NCCLLibrary: ncclComm_t, cudaStream_t ]), + # ncclResult_t ncclBroadcast( + # const void* sendbuff, void* recvbuff, size_t count, + # ncclDataType_t datatype, int root, ncclComm_t comm, + # cudaStream_t stream); + Function("ncclBroadcast", ncclResult_t, [ + buffer_type, buffer_type, ctypes.c_size_t, 
ncclDataType_t, + ctypes.c_int, ncclComm_t, cudaStream_t + ]), + # be cautious! this is a collective call, it will block until all # processes in the communicator have called this function. # because Python object destruction can happen in random order, @@ -312,6 +321,13 @@ def ncclRecv(self, recvbuff: buffer_type, count: int, datatype: int, self.NCCL_CHECK(self._funcs["ncclRecv"](recvbuff, count, datatype, src, comm, stream)) + def ncclBroadcast(self, sendbuff: buffer_type, recvbuff: buffer_type, + count: int, datatype: int, root: int, comm: ncclComm_t, + stream: cudaStream_t) -> None: + self.NCCL_CHECK(self._funcs["ncclBroadcast"](sendbuff, recvbuff, count, + datatype, root, comm, + stream)) + def ncclCommDestroy(self, comm: ncclComm_t) -> None: self.NCCL_CHECK(self._funcs["ncclCommDestroy"](comm)) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4aa0eebd976c9..3db069ec64ee4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -168,7 +168,7 @@ class EngineArgs: scheduler_delay_factor: float = 0.0 enable_chunked_prefill: Optional[bool] = None - guided_decoding_backend: str = 'outlines' + guided_decoding_backend: str = 'xgrammar' # Speculative decoding configuration. 
speculative_model: Optional[str] = None speculative_model_quantization: Optional[str] = None @@ -209,12 +209,9 @@ def __post_init__(self): # support `EngineArgs(compilation_config={...})` # without having to manually construct a # CompilationConfig object - if isinstance(self.compilation_config, (int)): + if isinstance(self.compilation_config, (int, dict)): self.compilation_config = CompilationConfig.from_cli( str(self.compilation_config)) - elif isinstance(self.compilation_config, (dict)): - self.compilation_config = CompilationConfig.from_cli( - json.dumps(self.compilation_config)) # Setup plugins from vllm.plugins import load_general_plugins @@ -364,11 +361,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--guided-decoding-backend', type=str, - default='outlines', - choices=['outlines', 'lm-format-enforcer'], + default='xgrammar', + choices=['outlines', 'lm-format-enforcer', 'xgrammar'], help='Which engine will be used for guided decoding' ' (JSON schema / regex etc) by default. Currently support ' - 'https://github.com/outlines-dev/outlines and ' + 'https://github.com/outlines-dev/outlines,' + 'https://github.com/mlc-ai/xgrammar, and ' 'https://github.com/noamgat/lm-format-enforcer.' ' Can be overridden per request via guided_decoding_backend' ' parameter.') @@ -432,6 +430,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'capping to sliding window size') parser.add_argument('--use-v2-block-manager', action='store_true', + default=True, help='[DEPRECATED] block manager v1 has been ' 'removed and SelfAttnBlockSpaceManager (i.e. ' 'block manager v2) is now the default. ' @@ -1051,9 +1050,12 @@ def create_engine_config(self, # long context (> 32K) models. This is to avoid OOM errors in the # initial memory profiling phase. - # Chunked prefill is currently disabled for multimodal models by - # default. 
- if use_long_context and not model_config.is_multimodal_model: + # For multimodal models, chunked prefill is disabled by default in + # V0, but enabled by design in V1 + if model_config.is_multimodal_model: + self.enable_chunked_prefill = bool(envs.VLLM_USE_V1) + + elif use_long_context: is_gpu = device_config.device_type == "cuda" use_sliding_window = (model_config.get_sliding_window() is not None) @@ -1110,7 +1112,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: @@ -1242,12 +1244,9 @@ def _override_v1_engine_config(self, engine_config: VllmConfig) -> None: Override the EngineConfig's configs based on the usage context for V1. """ assert envs.VLLM_USE_V1, "V1 is not enabled" - # TODO (ywang96): Enable APC by default when VLM supports it. if engine_config.model_config.is_multimodal_model: - logger.warning( - "Prefix caching is currently not supported for multimodal " - "models and has been disabled.") - engine_config.cache_config.enable_prefix_caching = False + # TODO (ywang96): Enable APC by default when VLM supports it. 
+ assert not engine_config.cache_config.enable_prefix_caching @dataclass diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 7b1bb7b05708d..60dccd7a0812c 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1,4 +1,5 @@ import asyncio +import copy import time import weakref from functools import partial @@ -6,6 +7,8 @@ List, Mapping, Optional, Set, Tuple, Type, Union, overload) from weakref import ReferenceType +from typing_extensions import deprecated + import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig) @@ -422,7 +425,8 @@ async def get_tokenizer_async(self, return await ( self.get_tokenizer_group().get_lora_tokenizer_async(lora_request)) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") async def add_request_async( self, request_id: str, @@ -504,7 +508,8 @@ async def add_request_async( sampling_params=params, tokenizer=await self.get_tokenizer_async(lora_request), default_guided_backend=self.decoding_config. - guided_decoding_backend) + guided_decoding_backend, + model_config=self.model_config) self._add_processed_request( request_id=request_id, @@ -525,22 +530,30 @@ async def check_health_async(self) -> None: async def build_guided_decoding_logits_processor_async( sampling_params: SamplingParams, tokenizer: AnyTokenizer, - default_guided_backend: str) -> SamplingParams: + default_guided_backend: str, + model_config: ModelConfig) -> SamplingParams: """Constructs logits processors based on the guided_decoding, logits_bias, and allowed_token_ids fields in sampling_params. Deletes those fields and adds the constructed logits processors to the logits_processors field. 
Modifies sampling params in-place and returns the modified sampling params.""" - if (guided_decoding := sampling_params.guided_decoding) is None: + if sampling_params.guided_decoding is None: return sampling_params + # Defensively copy sampling params since guided decoding logits + # processors can have different state for each request + sampling_params = copy.copy(sampling_params) + guided_decoding = sampling_params.guided_decoding + logger.debug("Building guided decoding logits processor. " "Params: %s", guided_decoding) guided_decoding.backend = guided_decoding.backend or default_guided_backend processor = await get_guided_decoding_logits_processor( - guided_params=guided_decoding, tokenizer=tokenizer) + guided_params=guided_decoding, + tokenizer=tokenizer, + model_config=model_config) if processor: if sampling_params.logits_processors is None: @@ -894,7 +907,8 @@ async def run_engine_loop(engine_ref: ReferenceType): # This method does not need to be async, but kept that way # for backwards compatibility. 
- @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def add_request( self, request_id: str, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d2d50b3238daa..336b61f4daec4 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1,3 +1,4 @@ +import copy import time from collections import Counter as collectionsCounter from collections import deque @@ -10,7 +11,7 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.envs as envs from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -246,60 +247,12 @@ def __init__( ) logger.info( - "Initializing an LLM engine (v%s) with config: " - "model=%r, speculative_config=%r, tokenizer=%r, " - "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, tokenizer_revision=%s, " - "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " - "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " - "pipeline_parallel_size=%d, " - "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " - "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, observability_config=%r, " - "seed=%d, served_model_name=%s, " - "num_scheduler_steps=%d, chunked_prefill_enabled=%s " - "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " - "use_async_output_proc=%s, use_cached_outputs=%s, " - "mm_processor_kwargs=%s, pooler_config=%r," - "compilation_config=%r", + "Initializing an LLM engine (v%s) with config: %r," + "use_cached_outputs=%s, ", VLLM_VERSION, - self.model_config.model, - self.speculative_config, - self.model_config.tokenizer, - self.model_config.skip_tokenizer_init, - self.model_config.tokenizer_mode, - self.model_config.revision, - self.model_config.override_neuron_config, - self.model_config.tokenizer_revision, - self.model_config.trust_remote_code, - 
self.model_config.dtype, - self.model_config.max_model_len, - self.load_config.download_dir, - self.load_config.load_format, - self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size, - self.parallel_config.disable_custom_all_reduce, - self.model_config.quantization, - self.model_config.enforce_eager, - self.cache_config.cache_dtype, - self.model_config.quantization_param_path, - self.device_config.device, - self.decoding_config, - self.observability_config, - self.model_config.seed, - self.model_config.served_model_name, - self.scheduler_config.num_scheduler_steps, - self.scheduler_config.chunked_prefill_enabled, - self.scheduler_config.multi_step_stream_outputs, - self.cache_config.enable_prefix_caching, - self.model_config.use_async_output_proc, + vllm_config, use_cached_outputs, - self.model_config.mm_processor_kwargs, - self.model_config.pooler_config, - vllm_config.compilation_config, ) - # TODO(woosuk): Print more configs in debug mode. self.log_stats = log_stats self.use_cached_outputs = use_cached_outputs @@ -472,6 +425,7 @@ def _initialize_kv_caches(self) -> None: The workers will determine the number of blocks in both the GPU cache and the swap CPU cache. 
""" + start = time.time() num_gpu_blocks, num_cpu_blocks = ( self.model_executor.determine_num_available_blocks()) @@ -487,6 +441,9 @@ def _initialize_kv_caches(self) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) + elapsed = time.time() - start + logger.info(("init engine (profile, create kv cache, " + "warmup model) took %.2f seconds"), elapsed) @classmethod def _get_executor_cls(cls, @@ -619,7 +576,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup: model_config=self.model_config, scheduler_config=self.scheduler_config, parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) + lora_config=self.lora_config) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) @@ -719,7 +676,8 @@ def _add_processed_request( def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def add_request( self, request_id: str, @@ -1023,9 +981,9 @@ def _update_num_computed_tokens_for_multi_step_prefill( This function updates num_computed_tokens for prompt sequences when Multi-Step is enabled. - seq_group: SequenceGroup to update the num_computed_tokens for. + seq_group: SequenceGroup to update the num_computed_tokens for. seq_group_meta: Metadata of the given SequenceGroup. - is_first_step_output: Optional[bool] - + is_first_step_output: Optional[bool] - When available, is_first_step_output indicates if the appended output token is the output of the first-step in multi-step. 
A value of None indicates that outputs from all steps in @@ -2035,7 +1993,11 @@ def _build_logits_processors( logits_processors = [] - if (guided_decoding := sampling_params.guided_decoding) is not None: + if sampling_params.guided_decoding is not None: + # Defensively copy sampling params since guided decoding logits + # processors can have different state for each request + sampling_params = copy.copy(sampling_params) + guided_decoding = sampling_params.guided_decoding logger.debug( "Building guided decoding logits processor in " @@ -2046,7 +2008,9 @@ def _build_logits_processors( self.decoding_config.guided_decoding_backend processor = get_local_guided_decoding_logits_processor( - guided_params=guided_decoding, tokenizer=tokenizer) + guided_params=guided_decoding, + tokenizer=tokenizer, + model_config=self.model_config) if processor: logits_processors.append(processor) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 4869557ba9b44..a5ae21c3966a7 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -599,9 +599,9 @@ def _log_prometheus(self, stats: Stats) -> None: stats.time_queue_requests) self._log_histogram(self.metrics.histogram_inference_time_request, stats.time_inference_requests) - self._log_histogram(self.metrics.histogram_decode_time_request, - stats.time_prefill_requests) self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_prefill_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, stats.time_decode_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request, stats.time_in_queue_requests) diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 34c161e9395ae..7020012e8bb86 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -2,6 +2,8 @@ from enum import Enum from typing import List, Mapping, Optional, Union, overload +from typing_extensions import deprecated + from 
vllm import PoolingParams from vllm.inputs import PromptType from vllm.lora.request import LoRARequest @@ -32,7 +34,8 @@ class RPCProcessRequest: prompt_adapter_request: Optional[PromptAdapterRequest] = None priority: int = 0 - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def __init__( self, *, diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index d26728e8c6e67..7e4f81b2cf8e2 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -9,6 +9,7 @@ import psutil import zmq import zmq.asyncio +from typing_extensions import deprecated from zmq import Frame # type: ignore[attr-defined] from zmq.asyncio import Socket @@ -93,8 +94,7 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, model_config=self.model_config, scheduler_config=engine_config.scheduler_config, parallel_config=engine_config.parallel_config, - enable_lora=bool(engine_config.lora_config), - ) + lora_config=engine_config.lora_config) self.input_preprocessor = InputPreprocessor(self.model_config, self.tokenizer) @@ -414,7 +414,8 @@ def errored(self) -> bool: def dead_error(self) -> BaseException: return ENGINE_DEAD_ERROR(self._errored_with) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def generate( self, *, @@ -472,8 +473,8 @@ def generate( trace_headers: OpenTelemetry trace headers. prompt_adapter_request: Prompt Adapter request to use for generation, if any. - priority: Priority of the request (lower means earlier handling). - Any priority other than 0 will lead to an error if the + priority: Priority of the request (lower means earlier handling). + Any priority other than 0 will lead to an error if the scheduling policy is not "priority". 
""" if inputs is not None: @@ -485,7 +486,8 @@ def generate( lora_request, trace_headers, prompt_adapter_request, priority) - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def encode( self, *, @@ -586,6 +588,7 @@ async def _process_request( default_guided_backend=(self.decoding_config.guided_decoding_backend if self.decoding_config else DecodingConfig.guided_decoding_backend), + model_config=self.model_config ) # 1) Create output queue for this requests. diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 7a6ebb430541f..a9b638ed02a1e 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. 
" diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a25c401b4ea10..8de30ccd18a11 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,11 +1,11 @@ import itertools -import json import warnings from contextlib import contextmanager from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, Union, cast, overload) from tqdm import tqdm +from typing_extensions import deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -185,12 +185,9 @@ def __init__( kwargs["disable_log_stats"] = True if compilation_config is not None: - if isinstance(compilation_config, (int)): + if isinstance(compilation_config, (int, dict)): compilation_config_instance = CompilationConfig.from_cli( str(compilation_config)) - elif isinstance(compilation_config, (dict)): - compilation_config_instance = CompilationConfig.from_cli( - json.dumps(compilation_config)) else: compilation_config_instance = compilation_config else: @@ -256,6 +253,7 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None: tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer) @overload # LEGACY: single (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: str, @@ -268,6 +266,7 @@ def generate( ... @overload # LEGACY: multi (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: List[str], @@ -280,6 +279,7 @@ def generate( ... @overload # LEGACY: single (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: Optional[str] = None, @@ -293,6 +293,7 @@ def generate( ... @overload # LEGACY: multi (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: Optional[List[str]] = None, @@ -306,6 +307,7 @@ def generate( ... 
@overload # LEGACY: single or multi token ids [pos-only] + @deprecated("'prompt_token_ids' will become part of 'prompts") def generate( self, prompts: None, @@ -671,6 +673,7 @@ def chat( ) @overload # LEGACY: single (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: str, @@ -683,6 +686,7 @@ def encode( ... @overload # LEGACY: multi (prompt + optional token ids) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: List[str], @@ -695,6 +699,7 @@ def encode( ... @overload # LEGACY: single (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: Optional[str] = None, @@ -708,6 +713,7 @@ def encode( ... @overload # LEGACY: multi (token ids + optional prompt) + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: Optional[List[str]] = None, @@ -721,6 +727,7 @@ def encode( ... @overload # LEGACY: single or multi token ids [pos-only] + @deprecated("'prompt_token_ids' will become part of 'prompts") def encode( self, prompts: None, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6bc31ef83ded4..c7bc30040279c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -175,8 +175,8 @@ async def build_async_engine_client_from_engine_args( # Select random path for IPC. ipc_path = get_open_zmq_ipc_path() - logger.info("Multiprocessing frontend to use %s for IPC Path.", - ipc_path) + logger.debug("Multiprocessing frontend to use %s for IPC Path.", + ipc_path) # Start RPCServer in separate process (holds the LLMEngine). 
# the current process might have CUDA context, @@ -249,8 +249,8 @@ def mount_metrics(app: FastAPI): prometheus_multiproc_dir_path = os.getenv("PROMETHEUS_MULTIPROC_DIR", None) if prometheus_multiproc_dir_path is not None: - logger.info("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", - prometheus_multiproc_dir_path) + logger.debug("vLLM to use %s as PROMETHEUS_MULTIPROC_DIR", + prometheus_multiproc_dir_path) registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) diff --git a/vllm/envs.py b/vllm/envs.py index 3e458f9b6f25f..46369e69c35f3 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -8,7 +8,6 @@ VLLM_RPC_BASE_PATH: str = tempfile.gettempdir() VLLM_USE_MODELSCOPE: bool = False VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 - VLLM_INSTANCE_ID: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None VLLM_ROCM_PREFER_TORCH: bool = False @@ -125,7 +124,8 @@ def get_default_config_root(): # If set, vllm will use precompiled binaries (*.so) "VLLM_USE_PRECOMPILED": - lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")), + lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool( + os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")), # CMake build type # If not set, defaults to "Debug" or "RelWithDebInfo" @@ -186,11 +186,6 @@ def get_default_config_root(): "VLLM_USE_MODELSCOPE": lambda: os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true", - # Instance id represents an instance of the VLLM. All processes in the same - # instance should have the same instance id. 
- "VLLM_INSTANCE_ID": - lambda: os.environ.get("VLLM_INSTANCE_ID", None), - # Interval in seconds to log a warning message when the ring buffer is full "VLLM_RINGBUFFER_WARNING_INTERVAL": lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")), diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 336f9bc8efb20..2816b5c5c1f88 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -10,8 +10,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (get_distributed_init_method, get_open_port, - get_vllm_instance_id, make_async) +from vllm.utils import get_distributed_init_method, get_open_port, make_async from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -23,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" @@ -31,9 +30,6 @@ def _init_executor(self) -> None: # Environment variables for CPU executor # - # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers - os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() - # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index a6c05a71d2b6f..c450209f0eb91 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -16,7 +16,7 @@ from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import (_run_task_with_lock, 
cuda_device_count_stateless, cuda_is_initialized, get_distributed_init_method, - get_open_port, get_vllm_instance_id, make_async, + get_open_port, make_async, update_environment_variables) if HAS_TRITON: @@ -37,9 +37,6 @@ def _init_executor(self) -> None: world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size - # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers - os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() - # Disable torch async compiling which won't work with daemonic processes os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 6542b18ae70b1..4263fb27265f6 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -15,8 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, get_vllm_instance_id, - make_async) + get_ip, get_open_port, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -189,8 +188,14 @@ def sort_by_driver_then_worker_ip(worker): self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", - use_dummy_driver=True) + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. 
+ continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore node_workers = defaultdict(list) # node id -> list of worker ranks node_gpus = defaultdict(list) # node id -> list of gpu ids @@ -220,14 +225,10 @@ def sort_by_driver_then_worker_ip(worker): " environment variable, make sure it is unique for" " each node.") - VLLM_INSTANCE_ID = get_vllm_instance_id() - # Set environment variables for the driver and workers. all_args_to_update_environment_variables = [({ "CUDA_VISIBLE_DEVICES": ",".join(map(str, node_gpus[node_id])), - "VLLM_INSTANCE_ID": - VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), **({ @@ -334,7 +335,6 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, - use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, ) -> Any: @@ -394,18 +394,10 @@ def _run_workers( driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = [ - self.driver_worker.execute_method(method, *driver_args, - **driver_kwargs) - ] - else: - assert self.driver_dummy_worker is not None - driver_worker_output = [ - ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) - ] + driver_worker_output = [ + self.driver_worker.execute_method(method, *driver_args, + **driver_kwargs) + ] # Get the results of the ray workers. 
if self.workers: diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py index a74328e5aa272..f3025cb537ab8 100644 --- a/vllm/executor/ray_hpu_executor.py +++ b/vllm/executor/ray_hpu_executor.py @@ -15,8 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, get_vllm_instance_id, - make_async) + get_ip, get_open_port, make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -164,9 +163,14 @@ def sort_by_driver_then_worker_ip(worker): # node will be placed first. self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", - use_dummy_driver=True) + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore node_workers = defaultdict(list) # node id -> list of worker ranks node_gpus = defaultdict(list) # node id -> list of gpu ids @@ -196,12 +200,8 @@ def sort_by_driver_then_worker_ip(worker): "environment variable, make sure it is unique for" " each node.") - VLLM_INSTANCE_ID = get_vllm_instance_id() - # Set environment variables for the driver and workers. 
all_args_to_update_environment_variables = [({ - "VLLM_INSTANCE_ID": - VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), }, ) for (node_id, _) in worker_node_and_gpu_ids] @@ -301,7 +301,6 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, - use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, ) -> Any: @@ -361,18 +360,10 @@ def _run_workers( driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = [ - self.driver_worker.execute_method(method, *driver_args, - **driver_kwargs) - ] - else: - assert self.driver_dummy_worker is not None - driver_worker_output = [ - ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) - ] + driver_worker_output = [ + self.driver_worker.execute_method(method, *driver_args, + **driver_kwargs) + ] # Get the results of the ray workers. if self.workers: diff --git a/vllm/executor/ray_tpu_executor.py b/vllm/executor/ray_tpu_executor.py index c227b5e283c68..5118c13934f0d 100644 --- a/vllm/executor/ray_tpu_executor.py +++ b/vllm/executor/ray_tpu_executor.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, - get_vllm_instance_id, make_async) + make_async) if ray is not None: from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -137,19 +137,21 @@ def sort_by_driver_then_worker_ip(worker): self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) # Get the set of TPU IDs used on each node. 
- worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", - use_dummy_driver=True) + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore node_workers = defaultdict(list) for i, (node_id, _) in enumerate(worker_node_and_gpu_ids): node_workers[node_id].append(i) - VLLM_INSTANCE_ID = get_vllm_instance_id() - # Set environment variables for the driver and workers. all_args_to_update_environment_variables = [({ - "VLLM_INSTANCE_ID": - VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), }, ) for _ in worker_node_and_gpu_ids] @@ -203,7 +205,6 @@ def _run_workers( async_run_remote_workers_only: bool = False, all_args: Optional[List[Tuple[Any, ...]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None, - use_dummy_driver: bool = False, max_concurrent_workers: Optional[int] = None, use_ray_compiled_dag: bool = False, **kwargs, @@ -245,14 +246,8 @@ def _run_workers( driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] # Start the driver worker after all the ray workers. - if not use_dummy_driver: - driver_worker_output = self.driver_worker.execute_method( - method, *driver_args, **driver_kwargs) - else: - assert self.driver_dummy_worker is not None - driver_worker_output = ray.get( - self.driver_dummy_worker.execute_method.remote( - method, *driver_args, **driver_kwargs)) + driver_worker_output = self.driver_worker.execute_method( + method, *driver_args, **driver_kwargs) # Get the results of the ray workers. 
if self.workers: ray_worker_outputs = ray.get(ray_worker_outputs) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index 2b1cdc09b0a9f..d2086f5fef26c 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -1,11 +1,13 @@ import asyncio from typing import List, Optional +import ray + import vllm.envs as envs from vllm.executor.ray_gpu_executor import RayGPUExecutor, RayGPUExecutorAsync from vllm.executor.xpu_executor import XPUExecutor from vllm.logger import init_logger -from vllm.utils import get_vllm_instance_id, make_async +from vllm.utils import make_async logger = init_logger(__name__) @@ -14,15 +16,16 @@ class RayXPUExecutor(RayGPUExecutor, XPUExecutor): def _get_env_vars_to_be_updated(self): # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", - use_dummy_driver=True) - - VLLM_INSTANCE_ID = get_vllm_instance_id() + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote())) # type: ignore # Set environment variables for the driver and workers. all_args_to_update_environment_variables = [({ - "VLLM_INSTANCE_ID": - VLLM_INSTANCE_ID, "VLLM_TRACE_FUNCTION": str(envs.VLLM_TRACE_FUNCTION), }, ) for (_, _) in worker_node_and_gpu_ids] diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 54fbd7a321a6f..d4402e77a3886 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -38,34 +38,3 @@ "InputProcessingContext", "InputRegistry", ] - - -def __getattr__(name: str): - import warnings - - if name == "PromptInput": - msg = ("PromptInput has been renamed to PromptType. 
" - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PromptType - - if name == "LLMInputs": - msg = ("LLMInputs has been renamed to DecoderOnlyInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return DecoderOnlyInputs - - if name == "EncoderDecoderLLMInputs": - msg = ( - "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return EncoderDecoderInputs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index fb7dbbebd7b90..85aaaa776907f 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -7,7 +7,8 @@ from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict) from vllm.multimodal.inputs import MultiModalInputsV2 @@ -150,6 +151,12 @@ class TokenInputs(TypedDict): if the model supports it. """ + multi_modal_inputs: NotRequired["MultiModalKwargs"] + """ + Optional multi-modal inputs to pass to the model, + if the model supports it. + """ + multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] """ Placeholder ranges for the multi-modal data. 
@@ -169,6 +176,7 @@ def token_inputs( token_type_ids: Optional[List[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, + multi_modal_inputs: Optional["MultiModalKwargs"] = None, multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: @@ -181,6 +189,8 @@ def token_inputs( inputs["token_type_ids"] = token_type_ids if multi_modal_data is not None: inputs["multi_modal_data"] = multi_modal_data + if multi_modal_inputs is not None: + inputs["multi_modal_inputs"] = multi_modal_inputs if multi_modal_placeholders is not None: inputs["multi_modal_placeholders"] = multi_modal_placeholders if mm_processor_kwargs is not None: @@ -273,6 +283,18 @@ def multi_modal_data(self) -> "MultiModalDataDict": assert_never(inputs) + @cached_property + def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_inputs", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + @cached_property def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": inputs = self.inputs @@ -358,34 +380,3 @@ def to_enc_dec_tuple_list( return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] - - -def __getattr__(name: str): - import warnings - - if name == "PromptInput": - msg = ("PromptInput has been renamed to PromptType. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PromptType - - if name == "LLMInputs": - msg = ("LLMInputs has been renamed to DecoderOnlyInputs. 
" - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return DecoderOnlyInputs - - if name == "EncoderDecoderLLMInputs": - msg = ( - "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return EncoderDecoderInputs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 85ab4355cc2e4..0dfed3b7e61bf 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -69,12 +69,12 @@ class InputProcessingContext(InputContext): tokenizer: AnyTokenizer """The tokenizer used to tokenize the inputs.""" - def get_hf_processor(self) -> ProcessorMixin: + def get_hf_processor(self, **kwargs) -> ProcessorMixin: return cached_get_processor( self.model_config.tokenizer, tokenizer=self.tokenizer, # Override the tokenizer with ours trust_remote_code=self.model_config.trust_remote_code, - ) + **kwargs) N = TypeVar("N", bound=Type[nn.Module]) @@ -232,19 +232,35 @@ def dummy_data_for_profiling( """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture - - model_cls, _ = get_model_architecture(model_config) - if is_encoder_data: - dummy_factory = self._get_dummy_encoder_data_factory(model_cls) + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.utils import cached_get_tokenizer + + if mm_registry.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = mm_registry.create_processor(model_config, tokenizer) + + mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) + mm_max_tokens = mm_registry.get_max_tokens_by_modality( + model_config) + + dummy_data = processor.get_dummy_data(seq_len, mm_counts, + mm_max_tokens) else: - 
dummy_factory = self._get_dummy_data_factory(model_cls) - mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) - mm_processor_kwargs = get_allowed_kwarg_only_overrides( - dummy_factory, overrides=model_config.mm_processor_kwargs) + model_cls, _ = get_model_architecture(model_config) + if is_encoder_data: + dummy_factory = self._get_dummy_encoder_data_factory(model_cls) + else: + dummy_factory = self._get_dummy_data_factory(model_cls) + mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) + mm_processor_kwargs = get_allowed_kwarg_only_overrides( + dummy_factory, overrides=model_config.mm_processor_kwargs) - dummy_data = dummy_factory(InputContext(model_config), seq_len, - _MultiModalCounts(mm_counts), - **mm_processor_kwargs) + dummy_data = dummy_factory(InputContext(model_config), seq_len, + _MultiModalCounts(mm_counts), + **mm_processor_kwargs) # Having more tokens is over-conservative but otherwise fine num_tokens = dummy_data.seq_data.prompt_token_ids @@ -257,7 +273,9 @@ def dummy_data_for_profiling( raise AssertionError( f"Expected at least {seq_len} dummy tokens for profiling, " f"but found {len(num_tokens)} tokens instead.") - if dummy_data.multi_modal_data is not None: + + if (dummy_data.multi_modal_data is not None and + not isinstance(dummy_data.multi_modal_data, MultiModalKwargs)): for k, v in dummy_data.multi_modal_data.items(): num_items = len(v) if isinstance(v, list) else 1 num_expected = mm_counts[k] diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index f5c2eced9d2bb..545ec21ca74c1 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -1,5 +1,5 @@ # pylint: disable=unused-argument -from typing import TYPE_CHECKING, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -32,6 +32,44 @@ def dec(*args, **kwargs): return dec +def _mcp_apply(x, bias, layer: ColumnParallelLinearWithLoRA): + 
""" + For `ColumnParallelLinearWithLoRA` or classes that inherit from + `ColumnParallelLinearWithLoRA`, they share the same `apply` logic. + """ + assert (layer.n_slices == len(layer.lora_a_stacked) == len( + layer.lora_b_stacked) == len(layer.output_slices)) + if layer.lora_bias_stacked is not None: + assert layer.n_slices == len(layer.lora_bias_stacked) + + output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) + + x = x.view(-1, x.shape[-1]) + output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape + + # Since communication is needed, the buffer is directly initialized as a + # tensor rather than a tuple of tensor. + buffers = torch.zeros( + (layer.n_slices, x.shape[0], layer.lora_a_stacked[0].shape[2]), + dtype=torch.float32, + device=x.device, + ) + + layer.punica_wrapper.add_shrink(buffers, x, layer.lora_a_stacked, 1.0) + buffers = tensor_model_parallel_all_gather(buffers) + layer.punica_wrapper.add_expand(output, + buffers, + layer.lora_b_stacked, + layer.lora_bias_stacked, + layer.output_slices, + offset_start=0, + add_input=True) + + output = output.view(*out_orig_shape) + # now have column partitioned and packed output + return output + + # these layers are based on the tensor parallelism strategy given in # Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023, # https://arxiv.org/abs/2311.03285. @@ -51,40 +89,15 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA): # gather operation. 
def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device, - ) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - add_input=True) - # now have column partitioned output - - if self.bias_stacked is not None: - self.bias_stacked = self.bias_stacked.view( - -1, self.bias_stacked.shape[-1]) - self.bias_stacked = self.bias_stacked[ - self.punica_wrapper.token_lora_indices] - output += self.bias_stacked - - output = output.view(*out_orig_shape) - return output + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace @@ -105,59 +118,6 @@ def can_replace_layer( ) -def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): - """ - MergedColumnParallelLinearWithShardedLoRA and - MergedQKVParallelLinearWithShardedLora share the same - LoRa weight application method. - - The main difference is the step by shard_size for lora_b which can - vary for MergedQKVParallelLinearWithShardedLora but is constant for - MergedColumnParallelLinearWithShardedLoRA. 
- """ - # expecting 2 for column parallel and 3 for qkv - n = len(layer.lora_a_stacked) - output = layer.base_layer.quant_method.apply(layer.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape - buffers = torch.zeros( - (n, x.shape[0], layer.lora_a_stacked[0].shape[2]), - dtype=torch.float32, - device=x.device, - ) - for idx in range(n): - layer.punica_wrapper.add_shrink(buffers[idx], x, - layer.lora_a_stacked[idx], 1.0) - - buffers = tensor_model_parallel_all_gather(buffers) - left_offset = 0 - for idx in range(n): - shard_size = layer.lora_b_stacked[idx].shape[2] - - if layer.bias_stacked is not None: - bias = layer.bias_stacked[idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[layer.punica_wrapper.token_lora_indices] - bias[layer.punica_wrapper.token_lora_indices == -1] = 0 - output[:, left_offset:left_offset + shard_size] += bias - - layer.punica_wrapper.add_expand_slice( - output, - buffers[idx], - layer.lora_b_stacked[idx], - left_offset, - shard_size, - add_input=True, - ) - left_offset += shard_size - - output = output.view(*out_orig_shape) - # now have column partitioned and packed output - return output - - class MergedColumnParallelLinearWithShardedLoRA( MergedColumnParallelLinearWithLoRA): """ @@ -181,8 +141,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -214,30 +175,15 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora): def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() - shard_size = self.lora_a_stacked.shape[2] + shard_size = self.lora_a_stacked[0].shape[2] start_idx = tp_rank * shard_size lora_a = lora_a[:, start_idx:start_idx + shard_size] return lora_a - def 
apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - - x = x.view(-1, x.shape[-1]) - output, out_orig_shape = output.view(-1, - output.shape[-1]), output.shape - buffer = torch.zeros((x.shape[0], self.lora_a_stacked.shape[2]), - dtype=torch.float32, - device=x.device) - self.punica_wrapper.add_shrink(buffer, x, self.lora_a_stacked, 1.0) - buffer = tensor_model_parallel_all_gather(buffer) - self.punica_wrapper.add_expand(output, - buffer, - self.lora_b_stacked, - add_input=True) - # now have column partitioned output - output = output.view(*out_orig_shape) - return output + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return _mcp_apply(x, bias, self) @classmethod @_fully_sharded_can_replace @@ -278,8 +224,9 @@ def slice_lora_a( ] return lora_a - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: return _mcp_apply(x, bias, self) @classmethod @@ -312,7 +259,7 @@ class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA): """ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: - shard_size = self.lora_b_stacked.shape[2] + shard_size = self.lora_b_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = lora_b[:, start_idx:end_idx] @@ -321,20 +268,24 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias - shard_size = self.bias_stacked.shape[2] + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + shard_size = self.lora_bias_stacked[0].shape[2] start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size bias = bias[start_idx:end_idx] return bias - def apply(self, x: torch.Tensor) -> 
torch.Tensor: + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) x = x.view(-1, x.shape[-1]) output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape buffer = torch.zeros( - (x.shape[0], self.lora_a_stacked.shape[2]), + (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]), dtype=torch.float32, device=x.device, ) @@ -348,18 +299,18 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # remains is a standard all_reduce. User should be aware though that # the output is not the same as a normal row_parallel, it should be # reduced before being used - shard_size = self.lora_b_stacked.shape[2] - start_idx = self.tp_rank * shard_size - - if self.bias_stacked is not None: - bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) - bias = bias[self.punica_wrapper.token_lora_indices] - bias[self.punica_wrapper.token_lora_indices == -1] = 0 - output += bias - - self.punica_wrapper.add_expand_slice(output, buffer, - self.lora_b_stacked, start_idx, - shard_size) + # NOTE offset are based on the rank. 
+ shard_size = self.lora_b_stacked[0].shape[2] + offset_start = self.tp_rank * shard_size + self.punica_wrapper.add_expand( + output, + buffer, + self.lora_b_stacked, + self.lora_bias_stacked, + self.output_slices, + offset_start=offset_start, + add_input=True, + ) output = output.view(*out_orig_shape) return output diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 3701988ff692f..38cb846578d5c 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -1,7 +1,7 @@ # pylint: disable=unused-argument import math from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union, cast import torch import torch.nn as nn @@ -17,12 +17,14 @@ tensor_model_parallel_all_reduce, tensor_model_parallel_gather) from vllm.distributed.utils import divide -from vllm.lora.punica import PunicaWrapper +# yapf: disable from vllm.model_executor.layers.linear import (ColumnParallelLinear, + LinearBase, MergedColumnParallelLinear, QKVParallelLinear, ReplicatedLinear, RowParallelLinear) +# yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.rotary_embedding import ( LinearScalingRotaryEmbedding, RotaryEmbedding) @@ -30,7 +32,7 @@ VocabParallelEmbedding) if TYPE_CHECKING: - pass + from vllm.lora.punica_wrapper import PunicaWrapperBase def _get_lora_device(base_layer: nn.Module) -> torch.device: @@ -67,63 +69,6 @@ def dec(*args, **kwargs): return dec -def apply_bias( - indices: torch.Tensor, - output: torch.Tensor, - bias_stacked: torch.Tensor, -): - """Applies bias to output - - Input shapes: - bias_stacked: (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, output_dim) - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) - bias_stacked = bias_stacked[indices] - 
bias_stacked[indices == -1] = 0 - output += bias_stacked - - return output.view_as(org_output) - - -def apply_bias_packed_nslice( - indices: torch.Tensor, - output: torch.Tensor, - output_slices: Tuple[int, ...], - bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], -): - """Applies bias to output - - Input shapes: - bias_stacked: 3 element tuple of (num_loras, output_dim) - indices: (batch_size) - output: (batch_size, q_slice_size + 2*kv_slice_size) - output_slices: n-1 element tuple of (slice_size...), - where n is number of slices - """ - org_output = output - output = output.view(-1, output.shape[-1]) - indices = indices.view(-1) - - offset_left = 0 - for slice_idx, slice in enumerate(output_slices): - bias = bias_stacked[slice_idx] - if bias is not None: - bias = bias.view(-1, bias.shape[-1]) - bias = bias[indices] - bias[indices == -1] = 0 - output[:, offset_left:offset_left + slice] += bias - - offset_left += slice - - return output.view_as(org_output) - - @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -169,9 +114,9 @@ def set_lora( def set_mapping( self, - punica_wrapper: PunicaWrapper, + punica_wrapper, ): - self.punica_wrapper: PunicaWrapper = punica_wrapper + self.punica_wrapper: PunicaWrapperBase = punica_wrapper @classmethod def can_replace_layer( @@ -306,12 +251,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: full_lora_a_embeddings.shape[1], -1, ) - - # Embedding layer only need expand op - self.punica_wrapper.add_expand(full_output, - full_lora_a_embeddings, - self.lora_b_stacked, - add_input=True) + self.punica_wrapper.add_lora_embedding(full_output, + full_lora_a_embeddings, + self.lora_b_stacked, + add_input=True) return full_output.view_as(full_output_org) @classmethod @@ -325,14 +268,19 @@ def can_replace_layer( return type(source_layer) is VocabParallelEmbedding -class ReplicatedLinearWithLoRA(BaseLayerWithLoRA): +class BaseLinearLayerWithLoRA(BaseLayerWithLoRA): - def __init__(self, base_layer: 
ReplicatedLinear) -> None: + def __init__(self, base_layer: LinearBase): super().__init__() self.base_layer = base_layer self.input_size = self.base_layer.input_size - self.output_size = self.base_layer.output_size self.device = _get_lora_device(self.base_layer) + self.lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]] = None + + self.output_slices: Tuple[int, ...] + self.tp_size: int + self.output_size: int + self.n_slices: int def create_lora_weights( self, @@ -341,39 +289,64 @@ def create_lora_weights( model_config: Optional[PretrainedConfig] = None, ) -> None: self.lora_config = lora_config - lora_a_output_size = lora_config.max_lora_rank - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( + # + if isinstance(self.base_layer, ReplicatedLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, ColumnParallelLinear): + lora_a_out_size = (lora_config.max_lora_rank if + not lora_config.fully_sharded_loras else divide( + lora_config.max_lora_rank, self.tp_size)) + lora_b_out_size = self.output_size + + elif isinstance(self.base_layer, RowParallelLinear): + lora_a_out_size = lora_config.max_lora_rank + lora_b_out_size = (self.output_size if + not lora_config.fully_sharded_loras else divide( + self.output_size, self.tp_size)) + else: + raise NotImplementedError + + self.lora_a_stacked = tuple( + torch.zeros( max_loras, 1, - self.output_size, + lora_a_out_size, + self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) - else: - self.bias_stacked = None + ) for _ in range(self.n_slices)) + self.lora_b_stacked = tuple( + torch.zeros( + max_loras, + 1, + 
lora_b_out_size, + lora_config.max_lora_rank, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + if lora_config.bias_enabled: + lora_bias_out_size = lora_b_out_size + self.lora_bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + lora_bias_out_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(self.n_slices)) + self.output_slices = (self.lora_b_stacked[0].shape[2], ) def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + for s_index in range(self.n_slices): + self.lora_a_stacked[s_index][index] = 0 + self.lora_b_stacked[s_index][index] = 0 + if self.lora_config.bias_enabled: + # Make mypy happy + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + self.lora_bias_stacked[s_index][index] = 0 def set_lora( self, @@ -381,35 +354,56 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): - self.reset_lora(index) + # Except for QKVParallelLinearWithLora and + # MergedColumnParallelLinearWithLoRA, all other linear LoRA layers + # store weights in a tuple of size 1. These two layers will + # override this function. 
+ assert (len(self.lora_a_stacked) == len(self.lora_b_stacked) == + self.n_slices == 1) - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: + self.reset_lora(index) + if self.tp_size > 1: + lora_a = self.slice_lora_a(lora_a) + lora_b = self.slice_lora_b(lora_b) + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + + self.lora_a_stacked[0][index, + 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( + lora_a.T, non_blocking=True) + self.lora_b_stacked[0][index, + 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( + lora_b.T, non_blocking=True) + if lora_bias is not None: + + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + assert len(self.lora_bias_stacked) + self.lora_bias_stacked[0][index, 0, :lora_bias.shape[0]].copy_( + lora_bias.T, non_blocking=True) + + def apply(self, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) + self.punica_wrapper.add_lora_linear(output, x, self.lora_a_stacked, + self.lora_b_stacked, + self.lora_bias_stacked, 1.0, + self.output_slices) return output + +class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA): + + def __init__(self, base_layer: ReplicatedLinear) -> None: + super().__init__(base_layer, ) + # To ensure interface compatibility, set to 1 always. 
+ self.tp_size = 1 + self.output_size = self.base_layer.output_size + self.n_slices = 1 + def forward(self, input_): """Forward of ReplicatedLinearWithLoRA @@ -442,73 +436,26 @@ def can_replace_layer( return type(source_layer) is ReplicatedLinear -class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA): +class ColumnParallelLinearWithLoRA(BaseLinearLayerWithLoRA): """ LoRA on top of ColumnParallelLinear layer. - LoRA B is sliced for tensor parallelism. + There are two types for the `base_layer`: + 1. ColumnParallelLinear, e.g.`dense_h_to_4h` in `FalconForCausalLM`. + 2. MergedColumnParallelLinear, e.g.`gate_up_proj` in `Phi3ForCausalLM`. """ def __init__(self, base_layer: ColumnParallelLinear) -> None: - super().__init__() + super().__init__(base_layer) # The base_layer type is ColumnParallelLinear or # MergedColumnParallelLinear, their weight sharding logic is # inconsistent when TP is greater than 1. self.is_merged_col_linear = type( base_layer) is MergedColumnParallelLinear - - self.base_layer = base_layer self.tp_size = get_tensor_model_parallel_world_size() - self.input_size = self.base_layer.input_size self.output_size = self.base_layer.output_size_per_partition - self.device = _get_lora_device(self.base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config - self.tp_size = get_tensor_model_parallel_world_size() - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - self.lora_a_stacked = torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - self.lora_b_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - 
self.bias_stacked = torch.zeros( - max_loras, - 1, - self.output_size, - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - - self.output_dim = self.lora_b_stacked.shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + # There is only one LoRA layer + self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -547,46 +494,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) - return output - def forward(self, input_): """Forward of ColumnParallelLinear @@ -634,8 +541,20 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA): Both slices must have the same size. 
""" - def __init__(self, base_layer: MergedColumnParallelLinear) -> None: + def __init__( + self, base_layer: Union[MergedColumnParallelLinear, + QKVParallelLinear]) -> None: super().__init__(base_layer) + # There are two LoRA layers + self.tp_size = get_tensor_model_parallel_world_size() + self.tp_rank = get_tensor_model_parallel_rank() + # the output_sizes in MergedColumnParallelLinear is not sharded by tp + # we need to divide it by the tp_size to get correct slices size + output_sizes = self.base_layer.output_sizes + self.output_slices = tuple( + divide(output_size, self.tp_size) for output_size in output_sizes) + self.n_slices = len(self.output_slices) + self.output_ids = (self.tp_rank, ) * self.n_slices def create_lora_weights( self, @@ -643,16 +562,11 @@ def create_lora_weights( lora_config: LoRAConfig, model_config: Optional[PretrainedConfig] = None, ) -> None: + """ + The main reason for overriding this function is to enhance code + maintainability. + """ self.lora_config = lora_config - n_slices = 2 - if not (len(self.base_layer.output_sizes) == n_slices - and self.base_layer.output_sizes[0] - == self.base_layer.output_sizes[1]): - raise ValueError( - "LoRAColumnParallelLinear2Slice requires 2 slices with " - "the same size.") - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() lora_a_output_size_per_partition = ( lora_config.max_lora_rank if not lora_config.fully_sharded_loras @@ -666,38 +580,25 @@ def create_lora_weights( self.input_size, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for _ in range(self.n_slices)) self.lora_b_stacked = tuple( torch.zeros( max_loras, 1, - self.output_size // 2, + output_size, lora_config.max_lora_rank, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) + ) for output_size in self.output_slices) if lora_config.bias_enabled: - self.bias_stacked = tuple( + self.lora_bias_stacked = tuple( torch.zeros( 
max_loras, 1, - self.output_size // 2, + output_size, dtype=lora_config.lora_dtype, device=self.device, - ) for _ in range(n_slices)) - else: - self.bias_stacked = None - - self.output_dim = self.lora_b_stacked[0].shape[2] - - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_b_stacked[1][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 + ) for output_size in self.output_slices) def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -707,27 +608,21 @@ def slice_lora_a( def slice_lora_b( self, lora_b: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - #NOTE: lora_b contains 2 subloras, and each sublora could be None. - shard_size = self.output_dim - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - lora_b = [ - lora_b[0][:, start_idx:end_idx] if lora_b[0] is not None else None, - lora_b[1][:, start_idx:end_idx] if lora_b[1] is not None else None, - ] + for i, (shard_id, shard_size) in enumerate( + zip(self.output_ids, self.output_slices)): + if (lora_b_i := lora_b[i]) is not None: + lora_b[i] = lora_b_i[:, shard_size * shard_id:shard_size * + (shard_id + 1)] return lora_b def slice_bias( self, bias: List[Union[torch.Tensor, None]]) -> List[Union[torch.Tensor, None]]: - # NOTE : each bias could be None. 
- shard_size = self.output_dim - start_idx = self.tp_rank * shard_size - end_idx = (self.tp_rank + 1) * shard_size - bias = [ - bias[0][start_idx:end_idx] if bias[0] is not None else None, - bias[1][start_idx:end_idx] if bias[1] is not None else None - ] + for i, (shard_id, shard_size) in enumerate( + zip(self.output_ids, self.output_slices)): + if (bias_i := bias[i]) is not None: + bias[i] = bias_i[shard_size * shard_id:shard_size * + (shard_id + 1)] return bias def set_lora( @@ -736,54 +631,35 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, + lora_bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - if lora_a[0] is not None: - self.lora_a_stacked[0][ - index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( - lora_a[0].T, non_blocking=True) - self.lora_b_stacked[0][ - index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( - lora_b[0].T, non_blocking=True) - if bias is not None and bias[0] is not None: - self.bias_stacked[0][index, - 0, :bias[0].shape[0]].copy_(bias[0].T, - non_blocking=True) - if lora_a[1] is not None: - self.lora_a_stacked[1][ - index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( - lora_a[1].T, non_blocking=True) - self.lora_b_stacked[1][ - index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( - lora_b[1].T, non_blocking=True) - if bias is not None and bias[1] is not None: - self.bias_stacked[1][index, - 0, :bias[1].shape[0]].copy_(bias[1].T, - non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - 
(self.output_dim, self.output_dim), - self.bias_stacked, - ) - self.punica_wrapper.add_lora_packed_nslice( - output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, - (self.output_dim, self.output_dim)) - return output + if lora_bias is not None: + lora_bias = self.slice_bias(lora_bias) + + for i in range(self.n_slices): + if (lora_a_i := lora_a[i]) is not None: + self.lora_a_stacked[i][ + index, 0, :lora_a_i.shape[1], :lora_a_i.shape[0]].copy_( + lora_a_i.T, non_blocking=True) + if (lora_b_i := lora_b[i]) is not None: + self.lora_b_stacked[i][ + index, 0, :lora_b_i.shape[1], :lora_b_i.shape[0]].copy_( + lora_b_i.T, non_blocking=True) + + if lora_bias is not None: + self.lora_bias_stacked = cast(Tuple[torch.Tensor, ...], + self.lora_bias_stacked) + for i in range(self.n_slices): + if (lora_bias_i := lora_bias[i]) is not None: + self.lora_bias_stacked[i][index, + 0, :lora_bias_i.shape[0]].copy_( + lora_bias_i.T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -813,7 +689,6 @@ class QKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) - self.tp_size = get_tensor_model_parallel_world_size() self.q_proj_total_size = (self.base_layer.total_num_heads * self.base_layer.head_size) self.q_proj_shard_size = (self.base_layer.num_heads * @@ -822,6 +697,8 @@ def __init__(self, base_layer: QKVParallelLinear) -> None: self.base_layer.head_size) self.kv_proj_total_size = (self.base_layer.total_num_kv_heads * self.base_layer.head_size) + # There is only one LoRA layer + self.n_slices = 1 def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: tp_rank = get_tensor_model_parallel_rank() @@ -856,32 +733,6 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = torch.cat([bias_q, bias_k, bias_v], dim=1) return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: 
Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - @classmethod @_not_fully_sharded_can_replace def can_replace_layer(cls, source_layer: nn.Module, @@ -891,8 +742,8 @@ def can_replace_layer(cls, source_layer: nn.Module, packed_modules_list) == 1 -class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): - """ColumnParallelLinear layer that is composed of 3 sublayers (slices) +class MergedQKVParallelLinearWithLora(MergedColumnParallelLinearWithLoRA): + """MergedColumnParallelLinear layer that is composed of 3 sublayers (slices) packed together in qkv proj fashion (q_proj + k_proj + v_proj -> qkv_proj). @@ -904,16 +755,11 @@ class MergedQKVParallelLinearWithLora(ColumnParallelLinearWithLoRA): def __init__(self, base_layer: QKVParallelLinear) -> None: super().__init__(base_layer) - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config + # There are three LoRA layer. 
+ self.n_slices = len(self.base_layer.output_sizes) self.tp_size = get_tensor_model_parallel_world_size() self.tp_rank = get_tensor_model_parallel_rank() + self.q_proj_shard_size = (self.base_layer.num_heads * self.base_layer.head_size) self.kv_proj_shard_size = (self.base_layer.num_kv_heads * @@ -921,227 +767,28 @@ def create_lora_weights( self.q_shard_id = self.tp_rank self.kv_shard_id = self.tp_rank // self.base_layer.num_kv_head_replicas - lora_a_output_size_per_partition = ( - lora_config.max_lora_rank if not lora_config.fully_sharded_loras - else divide(lora_config.max_lora_rank, self.tp_size)) - # q, k, v - self.lora_a_stacked = ( - torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - lora_a_output_size_per_partition, - self.input_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - ) - self.lora_b_stacked = ( - torch.zeros( - max_loras, - 1, - self.q_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - lora_config.max_lora_rank, - dtype=lora_config.lora_dtype, - device=self.device, - ), - ) - if lora_config.bias_enabled: - self.bias_stacked = ( - torch.zeros( - max_loras, - 1, - self.q_proj_shard_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - torch.zeros( - max_loras, - 1, - self.kv_proj_shard_size, - dtype=lora_config.lora_dtype, - device=self.device, - ), - ) - else: - 
self.bias_stacked = None - self.output_slices = ( self.q_proj_shard_size, self.kv_proj_shard_size, self.kv_proj_shard_size, ) - self.packed_indices: Optional[torch.Tensor] = None - self.standard_indices: Optional[torch.Tensor] = None - # lazily initialized. - self.indices: torch.Tensor - self.indices_len: List[int] - - def reset_lora(self, index: int): - self.lora_a_stacked[0][index] = 0 - self.lora_b_stacked[0][index] = 0 - self.lora_a_stacked[1][index] = 0 - self.lora_b_stacked[1][index] = 0 - self.lora_a_stacked[2][index] = 0 - self.lora_b_stacked[2][index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[0][index] = 0 - self.bias_stacked[1][index] = 0 - self.bias_stacked[2][index] = 0 - - def slice_lora_a( - self, lora_a: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: - return lora_a - - def slice_lora_b( - self, lora_b: List[Union[torch.Tensor, None]] - ) -> List[Union[torch.Tensor, None]]: - lora_b_q, lora_b_k, lora_b_v = None, None, None - if lora_b[0] is not None: - lora_b_q = lora_b[0][:, self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1), ] - if lora_b[1] is not None: - lora_b_k = lora_b[1][:, self.kv_proj_shard_size * - self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1), ] - if lora_b[2] is not None: - lora_b_v = lora_b[2][:, self.kv_proj_shard_size * - self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1), ] - lora_b = [lora_b_q, lora_b_k, lora_b_v] - return lora_b - - def slice_bias( - self, bias: List[Union[torch.Tensor, - None]]) -> List[Union[torch.Tensor, None]]: - bias_q, bias_k, bias_v = bias - if bias_q is not None: - bias_q = bias_q[self.q_proj_shard_size * - self.q_shard_id:self.q_proj_shard_size * - (self.q_shard_id + 1)] - if bias_k is not None: - bias_k = bias_k[self.kv_proj_shard_size * - self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] - if bias_v is not None: - bias_v = bias_v[self.kv_proj_shard_size * - 
self.kv_shard_id:self.kv_proj_shard_size * - (self.kv_shard_id + 1)] - bias = [bias_q, bias_k, bias_v] - return bias + self.output_ids = ( + self.q_shard_id, + self.kv_shard_id, + self.kv_shard_id, + ) - def set_lora( + def create_lora_weights( self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - if lora_b[0] is not None: - lora_b_q = lora_b[0] - self.lora_b_stacked[0][ - index, 0, :lora_b_q.shape[1], :lora_b_q.shape[0]].copy_( - lora_b_q.T, non_blocking=True) - if lora_b[1] is not None: - lora_b_k = lora_b[1] - self.lora_b_stacked[1][ - index, 0, :lora_b_k.shape[1], :lora_b_k.shape[0]].copy_( - lora_b_k.T, non_blocking=True) - if lora_b[2] is not None: - lora_b_v = lora_b[2] - self.lora_b_stacked[2][ - index, 0, :lora_b_v.shape[1], :lora_b_v.shape[0]].copy_( - lora_b_v.T, non_blocking=True) - - if lora_a[0] is not None: - self.lora_a_stacked[0][ - index, 0, :lora_a[0].shape[1], :lora_a[0].shape[0]].copy_( - lora_a[0].T, non_blocking=True) - if lora_a[1] is not None: - self.lora_a_stacked[1][ - index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( - lora_a[1].T, non_blocking=True) - if lora_a[2] is not None: - self.lora_a_stacked[2][ - index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( - lora_a[2].T, non_blocking=True) - - if bias is not None: - if bias[0] is not None: - self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( - bias[0].T, non_blocking=True) - if bias[1] is not None: - self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( - bias[1].T, non_blocking=True) - if bias[2] is not None: - self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( - bias[2].T, non_blocking=True) - - def apply(self, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: 
- output = self.base_layer.quant_method.apply(self.base_layer, x, bias) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias_packed_nslice( - self.indices, - output, - self.output_slices, - self.bias_stacked, - ) - self.punica_wrapper.add_lora_packed_nslice(output, x, - self.lora_a_stacked, - self.lora_b_stacked, 1.0, - self.output_slices) - return output + max_loras: int, + lora_config: LoRAConfig, + model_config: Optional[PretrainedConfig] = None, + ) -> None: + """ + The main reason for overloading this function is to handle inconsistent + weight dimensions in qkv lora. + """ + super().create_lora_weights(max_loras, lora_config, model_config) @classmethod @_not_fully_sharded_can_replace @@ -1156,76 +803,25 @@ def can_replace_layer( and len(packed_modules_list) == 3) -class RowParallelLinearWithLoRA(BaseLayerWithLoRA): +class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA): def __init__(self, base_layer: RowParallelLinear) -> None: - super().__init__() - self.base_layer = base_layer + super().__init__(base_layer) + + self.tp_size = get_tensor_model_parallel_world_size() + # reset input_size self.input_size = self.base_layer.input_size_per_partition self.output_size = self.base_layer.output_size - self.device = _get_lora_device(self.base_layer) - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - self.lora_config = lora_config self.tp_rank = get_tensor_model_parallel_rank() - self.lora_a_stacked = torch.zeros( - ( - max_loras, - 1, - lora_config.max_lora_rank, - self.input_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - tp_size = get_tensor_model_parallel_world_size() - lora_b_output_size_per_partition = ( - self.output_size if not lora_config.fully_sharded_loras else - divide(self.output_size, tp_size)) - - self.lora_b_stacked = torch.zeros( - ( - max_loras, - 1, - 
lora_b_output_size_per_partition, - lora_config.max_lora_rank, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - - if lora_config.bias_enabled: - self.bias_stacked = torch.zeros( - ( - max_loras, - 1, - self.output_size, - ), - dtype=lora_config.lora_dtype, - device=self.device, - ) - else: - self.bias_stacked = None - # Lazily initialized - self.indices: torch.Tensor - self.indices_len: List[int] - - def reset_lora(self, index: int): - self.lora_a_stacked[index] = 0 - self.lora_b_stacked[index] = 0 - if self.lora_config.bias_enabled: - self.bias_stacked[index] = 0 + # There is only one LoRA layer. + self.n_slices = 1 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: - tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.input_size - start_idx = tensor_model_parallel_rank * shard_size - end_idx = (tensor_model_parallel_rank + 1) * shard_size + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size lora_a = lora_a[start_idx:end_idx, :] return lora_a @@ -1235,46 +831,6 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - self.reset_lora(index) - - if self.base_layer.tp_size > 1: - lora_a = self.slice_lora_a(lora_a) - lora_b = self.slice_lora_b(lora_b) - if bias is not None: - bias = self.slice_bias(bias) - - self.lora_a_stacked[index, - 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( - lora_a.T, non_blocking=True) - self.lora_b_stacked[index, - 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( - lora_b.T, non_blocking=True) - if bias is not None: - self.bias_stacked[index, - 0, :bias.shape[0]].copy_(bias.T, - non_blocking=True) - - def apply(self, x: torch.Tensor) -> torch.Tensor: - output = 
self.base_layer.quant_method.apply(self.base_layer, x) - if self.bias_stacked is not None: - self.indices = self.punica_wrapper.token_lora_indices - output = apply_bias( - self.indices, - output, - self.bias_stacked, - ) - self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, - self.lora_b_stacked, 1.0) - return output - def forward(self, input_): """Forward of RowParallelLinear @@ -1292,10 +848,9 @@ def forward(self, input_): input_parallel = input_ else: # TODO: simplify code below - tp_rank = get_tensor_model_parallel_rank() splitted_input = split_tensor_along_last_dim( input_, num_partitions=self.base_layer.tp_size) - input_parallel = splitted_input[tp_rank].contiguous() + input_parallel = splitted_input[self.tp_rank].contiguous() # Matrix multiply. output_parallel = self.apply(input_parallel) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 2ffefe61427e3..49cd9f0c236ad 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -21,7 +21,7 @@ LinearScalingRotaryEmbeddingWithLora, LoRAMapping) from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights -from vllm.lora.punica import PunicaWrapper +from vllm.lora.punica_wrapper import get_punica_wrapper from vllm.lora.utils import (from_layer, from_layer_logits_processor, is_regex_target_modules, parse_fine_tuned_lora_name, replace_submodule) @@ -331,9 +331,9 @@ def __init__( self.lora_index_to_id: List[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size self.long_lora_context: Optional[LongContextLoRAContext] = None - self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, - max_batches=self.max_num_seqs, - device=self.device) + self.punica_wrapper = get_punica_wrapper(max_num_batched_tokens, + max_batches=self.max_num_seqs, + device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. 
self.scaling_factor_to_offset: Dict[float, int] = {} @@ -555,17 +555,17 @@ def create_dummy_lora( input_dim, output_dim, rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", embeddings_tensor_dim=embeddings_tensor_dim, bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, - module.lora_a_stacked.shape[-1], - module.lora_b_stacked.shape[-2], + module.lora_a_stacked[0].shape[-1], + module.lora_b_stacked[0].shape[-2], rank, - module.lora_a_stacked.dtype, + module.lora_a_stacked[0].dtype, "cpu", bias_enabled=bias_enabled, ) diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py deleted file mode 100644 index 082041f390750..0000000000000 --- a/vllm/lora/punica.py +++ /dev/null @@ -1,611 +0,0 @@ -""" -Based on: -Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). -Punica: Multi-Tenant LoRA Serving. -https://arxiv.org/abs/2310.18547 -""" - -from typing import TYPE_CHECKING, Callable, List, Optional, Tuple, Union - -import torch - -from vllm.triton_utils import HAS_TRITON - -if HAS_TRITON: - from vllm.lora.ops.bgmv_expand import bgmv_expand - from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice - from vllm.lora.ops.bgmv_shrink import bgmv_shrink - from vllm.lora.ops.sgmv_expand import sgmv_expand - from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice - from vllm.lora.ops.sgmv_shrink import sgmv_shrink - -if TYPE_CHECKING: - # avoid circuit import - from vllm.lora.layers import LoRAMapping - from vllm.lora.models import LongContextLoRAContext - - -def compute_meta( - token_lora_tensor: torch.Tensor -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: - """ - Get the information required for the sgmv kernel. With the features: - 1. If consecutive requests in the batch use the same LoRA, this function - will combine them into a single request, improving sgmv kernel inference - performance. - 2. 
At the beginning of each prefill stage inference, recalculations are - needed based on the input, but only once. - """ - - lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( - token_lora_tensor, return_counts=True) - cum_result = torch.cumsum(seq_length_tensor, dim=0) - b_seq_start_tensor = torch.zeros_like(seq_length_tensor) - b_seq_start_tensor[1:].copy_(cum_result[:-1]) - max_length = seq_length_tensor.max().item() - token_nums = seq_length_tensor.sum().item() - batch_size = lora_indices_tensor.size(0) - no_lora = False - # -1 means no lora should be applied. Use `no_lora` to determine whether - # the current step requires LoRA. If LoRA is not needed, the prefill stage - # does not need to launch the triton kernel, which can improve performance - if batch_size == 1 and lora_indices_tensor == -1: - no_lora = True - return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length, token_nums, no_lora) - - -# TODO see if this can be vectorized -def convert_mapping( - mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - device: torch.device, - long_lora_context: Optional["LongContextLoRAContext"] = None, -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], List[int]]: - """Converts LoRAMapping to index tensors. - - Args: - mapping: LoRAMapping mapping rows in a batch to LoRA ids. - lora_index_to_id: List mapping LoRA ids to LoRA indices. - max_loras: Maximum number of LoRAs. - vocab_size: Model vocab size. - extra_vocab_size: Extra vocab size each LoRA can have. - long_lora_context: Passed if there are long context lora in a batch. - - Returns: - A tuple of tensors: - base_indices: Tensor of shape [batch_size] mapping batch rows to - LoRA indices. - sampler_indices: Tensor of shape [batch_size] mapping requests to - LoRA indices for sampler. For generation, this will be the - same as base_indicies. 
For prefill, this will map requests - to LoRA indices. - sampler_indices_padded: Tensor of shape [batch_size] mapping - requests to LoRA indices for sampler with padding. - Same as sampler_indicies, but -1 is replaced with - max_loras. - embeddings_indices: Tensor of shape [2, batch_size] mapping - requests to embedding indices. First row is for embeddings - added by the LoRAs, second row is for the LoRA.lora_a - embeddings. - long_lora_indices: Tensor of shape [batch_size] mapping - requests to RoPE offsets and rot dims for long LoRAs. - None if long context lora doesn't exist. - indices_len: List of lengths of the above tensors. It contains - (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices). - """ - index_mapping_indices: List[int] = list(mapping.index_mapping).copy() - embedding_indices = index_mapping_indices.copy() - lora_indices = index_mapping_indices.copy() - long_lora_offsets: Optional[torch.Tensor] = None - if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=device, - dtype=torch.long) - prompt_mapping: List[int] = [ - lora_index_to_id.index(x) if x > 0 else -1 - for x in mapping.prompt_mapping - ] - lora_idx = None - for i in range(len(index_mapping_indices)): - # TODO index can be slow. 
optimize - lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) - if index_mapping_indices[i] > 0 else -1) - embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 - lora_indices[i] = lora_idx - if long_lora_context: - assert long_lora_offsets is not None - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset - - indices_list: List[Union[List[int], torch.Tensor]] = [ - index_mapping_indices, - lora_indices, - embedding_indices, - ] - if long_lora_context: - assert long_lora_offsets is not None - indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device=device) - prompt_mapping_tensor = torch.tensor(prompt_mapping, - dtype=torch.long, - device=device) - embeddings_indices = torch.stack([ - indices[2] * extra_vocab_size, - indices[2] * (vocab_size + extra_vocab_size), - ]) - embeddings_indices[embeddings_indices == -1] = max_loras - 1 - base_indices = indices[1] - sampler_indices = prompt_mapping_tensor - sampler_indices_padded = sampler_indices.clone() - sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 - sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( - sampler_indices_padded * len(sampler_indices_padded)) - long_lora_indices = None - long_lora_indices_len: Optional[int] = None - if long_lora_context: - long_lora_indices = indices[3] - long_lora_indices_len = long_lora_indices.shape[-1] - # Contain length of indices tensors. Used to index into each tensor. 
- indices_len = [ - base_indices.shape[-1], - sampler_indices.shape[-1], - sampler_indices_padded.shape[-1], - embeddings_indices.shape[-1], - ] - if long_lora_indices_len is not None: - indices_len.append(long_lora_indices_len) - else: - # If long_lora doesn't exist,append None - indices_len.append(None) - - return ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_indices, - indices_len, - ) - - -class PunicaWrapper: - """ - PunicaWrapper is designed to manage and provide metadata for the punica - kernel. The main function is to maintain the state information for - Multi-LoRA, and to provide the interface for the punica kernel. - """ - - def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: Union[torch.device, str]): - self._token_lora_indices = torch.empty(max_num_batched_tokens, - dtype=torch.long, - device=device) - self._sampler_indices = torch.empty(max_num_batched_tokens, - dtype=torch.long, - device=device) - self._sampler_indices_padded = torch.empty(max_num_batched_tokens, - dtype=torch.long, - device=device) - self._embeddings_indices = torch.empty(2, - max_num_batched_tokens, - dtype=torch.long, - device=device) - self._long_lora_indices = torch.empty(max_num_batched_tokens, - dtype=torch.long, - device=device) - - # 5 is the number of indicies tensors. 
- # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices - self.indices_len: List[Optional[int]] = [None] * 5 - # these attributes are the information required for sgmv kernel - self._seq_start_locs = torch.empty(max_batches, - dtype=torch.long, - device=device) - self._seq_lengths = torch.empty(max_batches, - dtype=torch.long, - device=device) - self._lora_indices_per_batch = torch.empty(max_batches, - dtype=torch.long, - device=device) - self.device: torch.device = device - self.max_length: int = 0 - self.token_nums: int = 0 - self.batch_size: int = -1 - self.is_prefill = False - self.no_lora = False - - def update_metadata( - self, - mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, - ): - - self._update_base_metadata(mapping, lora_index_to_id, max_loras, - vocab_size, extra_vocab_size, - long_lora_context) - if mapping.is_prefill: - # Update metadata required for prefill-related operators. 
- self._update_prefill_metada(self.token_lora_indices) - self.is_prefill = True - else: - self.is_prefill = False - - def _update_base_metadata( - self, - mapping: "LoRAMapping", - lora_index_to_id: List[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, - ): - ( - base_indices, - sampler_indices, - sampler_indices_padded, - embeddings_indices, - long_lora_offsets_tensor, - indices_len, - ) = convert_mapping( - mapping, - lora_index_to_id, - max_loras, - vocab_size, - extra_vocab_size, - self.device, - long_lora_context, - ) - self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) - self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) - self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( - sampler_indices_padded) - self._embeddings_indices[:embeddings_indices. - shape[0], :embeddings_indices.shape[1]].copy_( - embeddings_indices) - if long_lora_offsets_tensor is not None: - self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( - long_lora_offsets_tensor) - else: - self._long_lora_indices.zero_() - self.indices_len[:] = indices_len - - def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: - - (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, - batch_size, max_length, token_nums, - no_lora) = compute_meta(token_lora_tensor) - - self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( - b_seq_start_tensor) - self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) - self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( - lora_indices_tensor) - self.batch_size = batch_size - self.max_length = max_length - self.token_nums = token_nums - self.no_lora = no_lora - - @property - def prefill_metadata( - self - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: - """ - This property provides a convenient way to access the necessary - metadata for 
prefill-related kernel computations. - 1. seq_start_locs: Tensor of sequence start positions. - 2. seq_lengths: Tensor of sequence lengths. - 3. lora_indices_per_batch: Tensor of lora indices, and an index of - -1 means no lora should be applied. - 4. batch_size: Batch size after clustering identical lora indices. - 5. max_length: The maximum sequence length in the batch. - 6. token_nums: The token numbers in the batch. - """ - return (self._seq_start_locs[:self.batch_size], - self._seq_lengths[:self.batch_size], - self._lora_indices_per_batch[:self.batch_size], - self.batch_size, self.max_length, self.token_nums) - - @property - def token_lora_indices(self) -> torch.Tensor: - """ - This property provides the lora indices corresponding to each token - in the batch. An index of -1 means no lora should be applied. - """ - token_lora_len = self.indices_len[0] - return self._token_lora_indices[:token_lora_len] - - @property - def sampler_indices(self) -> torch.Tensor: - """ - This property is used to access the lora indices specifically for - LogitsProcessorWithLoRA. - """ - sampler_indices_len = self.indices_len[1] - return self._sampler_indices[:sampler_indices_len] - - @property - def sampler_indices_padded(self) -> torch.Tensor: - """ - This property provides access to padded sampler indices. - """ - indices_padded_len = self.indices_len[2] - return self._sampler_indices_padded[:indices_padded_len] - - @property - def embeddings_indices(self) -> torch.Tensor: - """ - This property provides access to the indices used for lora embeddings, - specifically for VocabParallelEmbeddingWithLoRA. - """ - embeddings_indices_len = self.indices_len[3] - return self._embeddings_indices[:, :embeddings_indices_len] - - @property - def long_lora_indices(self) -> torch.Tensor: - """ - This property provides access to the indices used for long context - lora, specifically for LinearScalingRotaryEmbeddingWithLora. 
- """ - long_lora_len = self.indices_len[4] - return self._long_lora_indices[:long_lora_len] - - def shrink_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - scale: float, - ): - #No LoRA request, so return directly - if self.no_lora: - return - sgmv_shrink( - x, - w_t_all, - y, - *self.prefill_metadata, - scale, - ) - - def shrink_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - scale: float, - ): - bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) - - def expand_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - add_input: bool, - ): - #No LoRA request, so return directly - if self.no_lora: - return - sgmv_expand( - x, - w_t_all, - y, - *self.prefill_metadata, - add_input, - ) - - def expand_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - add_input: bool, - ): - bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) - - def expand_slice_prefill( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool, - ): - #No LoRA request, so return directly - if self.no_lora: - return - sgmv_expand_slice( - x, - w_t_all, - y, - *self.prefill_metadata, - y_offset, - y_slice_size, - add_input, - ) - - def expand_slice_decode( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool, - ): - bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, - y_slice_size, add_input) - - def add_shrink( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - scale: float, - ): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'a. - When `is_prefill is` true, it indicates that it is currently the - prefill stage, and the `shrink_prefill` function should be called. 
- Otherwise, it is the decode stage, and the shrink_decode function - should be called. - """ - shrink_fun: Callable = (self.shrink_prefill - if self.is_prefill else self.shrink_decode) - shrink_fun(y, x, w_t_all, scale) - - def add_expand( - self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - add_input: bool = True, - ): - """ - Perform the ` y+=x@w_t_all` computation, which is suitable for the - GEMM of lora'b. - When `is_prefill` is true, it indicates that it is currently the - prefill stage, and the `expand_prefill` function should be called. - Otherwise, it is the decode stage, and the expand_decode function - should be called. - """ - - expand_fun: Callable = (self.expand_prefill - if self.is_prefill else self.expand_decode) - expand_fun(y, x, w_t_all, add_input) - - def add_expand_slice(self, - y: torch.Tensor, - x: torch.Tensor, - w_t_all: torch.Tensor, - y_offset: Optional[int], - y_slice_size: Optional[int], - add_input: bool = True): - """ - Similar to `add_expand` - """ - - expand_slice_fun: Callable = (self.expand_slice_prefill - if self.is_prefill else - self.expand_slice_decode) - expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) - - def add_lora(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale: float, - y_offset: Optional[int] = None, - y_slice_size: Optional[int] = None, - *, - buffer: Optional[torch.Tensor] = None) -> None: - """ - Semantics: - y[i] += ( - x[i].unsqueeze(0) - @ wa_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - @ wb_t_all[indices[i], layer_idx, :, :].transpose(-1, -2) - * scale - ).squeeze(0) - Args: - y (torch.Tensor): Output tensor. Will be changed in-place. - x (torch.Tensor): Input tensor - wa_t_all (torch.Tensor): lora_a's weight - wb_t_all (torch.Tensor): lora_b's weight - scale (float): Scaling factor. - y_offset (Optional[int], optional): Offset to apply to the starting - column of y. 
- y_slice_size (Optional[int], optional): Size of the y column slice. - buffer (Optional[torch.Tensor], optional): Defaults to None. - """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - - self.add_shrink(buffer, x, wa_t_all, scale) - if y_offset is None and y_slice_size is None: - self.add_expand(y, buffer, wb_t_all, add_input=True) - else: - self.add_expand_slice(y, - buffer, - wb_t_all, - y_offset, - y_slice_size, - add_input=True) - y = y.view_as(y_org) - - def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor, - lora_a_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - lora_b_stacked: Tuple[torch.Tensor, - torch.Tensor, - torch.Tensor], - scale: float, - output_slices: Tuple[int, ...]) -> None: - """ - Applies lora to each input. Similar to add_lora, This method is - used for layers that are composed of multiple sublayers - (slices) packed together. 
- """ - y_org = y - x = x.view(-1, x.shape[-1]) - y = y.view(-1, y.shape[-1]) - offset_left = 0 - # TODO fuse these kernels - for slice_idx in range(len(output_slices)): - self.add_lora(y, x, lora_a_stacked[slice_idx], - lora_b_stacked[slice_idx], scale, offset_left, - output_slices[slice_idx]) - offset_left += output_slices[slice_idx] - - y = y.view_as(y_org) - - def add_lora_logits(self, - y: torch.Tensor, - x: torch.Tensor, - wa_t_all: torch.Tensor, - wb_t_all: torch.Tensor, - scale, - *, - buffer: Optional[torch.Tensor] = None) -> None: - """ - LogitsProcessorWithLoRA always using bgmv - """ - y_org = y - y = y.view(-1, y.shape[-1]) - x = x.view(-1, x.shape[-1]) - r = wb_t_all.size(-1) - if buffer is None: - # We set the buffer to be float32 by default ,refer to: - # https://github.com/triton-lang/triton/issues/1387 - buffer = torch.zeros((x.size(0), r), - dtype=torch.float32, - device=x.device) - - bgmv_shrink(x, wa_t_all, buffer, self.sampler_indices, scale) - bgmv_expand(buffer, wb_t_all, y, self.sampler_indices, add_inputs=True) - y = y.view_as(y_org) diff --git a/vllm/lora/punica_wrapper/__init__.py b/vllm/lora/punica_wrapper/__init__.py new file mode 100644 index 0000000000000..48ada3926ea46 --- /dev/null +++ b/vllm/lora/punica_wrapper/__init__.py @@ -0,0 +1,7 @@ +from vllm.lora.punica_wrapper.punica_base import PunicaWrapperBase +from vllm.lora.punica_wrapper.punica_selector import get_punica_wrapper + +__all__ = [ + "PunicaWrapperBase", + "get_punica_wrapper", +] diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py new file mode 100644 index 0000000000000..0a5a84bdd8deb --- /dev/null +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -0,0 +1,480 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. 
+https://arxiv.org/abs/2310.18547 +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import torch + +from .utils import compute_meta, convert_mapping + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext + + +class PunicaWrapperABC(ABC): + """ + PunicaWrapper ABC. + """ + + @abstractmethod + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + **kwargs, + ) -> None: + """ + Update the lora-related metadata + """ + raise NotImplementedError + + @abstractmethod + def add_shrink( + self, + y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, + **kwargs, + ) -> None: + """ + Performs GEMM for multiple slices of lora_a. + """ + + raise NotImplementedError + + @abstractmethod + def add_expand( + self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + offset_start: int = 0, + add_input=True, + **kwargs, + ) -> None: + """ + Performs GEMM and bias addition for multiple slices of lora_b. + """ + raise NotImplementedError + + @abstractmethod + def add_lora_embedding( + self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_input: bool = True, + **kwargs, + ) -> None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA, + and this layer only requires the expand operation. 
+ """ + raise NotImplementedError + + @abstractmethod + def add_lora_linear(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, + **kwargs) -> None: + """ + Applicable to linear-related lora. + """ + + raise NotImplementedError + + @abstractmethod + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None, + **kwargs) -> None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + """ + raise NotImplementedError + + +class PunicaWrapperBase(PunicaWrapperABC): + """ + PunicaWrapperBase is designed to manage and provide metadata for the punica + kernel. The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica. + """ + + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: Union[torch.device, str], **kwargs): + self._token_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._sampler_indices_padded = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + self._embeddings_indices = torch.empty(2, + max_num_batched_tokens, + dtype=torch.long, + device=device) + self._long_lora_indices = torch.empty(max_num_batched_tokens, + dtype=torch.long, + device=device) + + # 5 is the number of indicies tensors. 
+ # base_indices, sampler_indices, sampler_indices_padded, + # embeddings_indices,long_lora_indices + self.indices_len: List[Optional[int]] = [None] * 5 + # these attributes are the information required for sgmv kernel + self._seq_start_locs = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._seq_lengths = torch.empty(max_batches, + dtype=torch.long, + device=device) + self._lora_indices_per_batch = torch.empty(max_batches, + dtype=torch.long, + device=device) + self.device: torch.device = device + self.max_length: int = 0 + self.token_nums: int = 0 + self.batch_size: int = -1 + self.is_prefill = False + self.no_lora = False + + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping( + mapping, + lora_index_to_id, + max_loras, + vocab_size, + extra_vocab_size, + self.device, + long_lora_context, + ) + self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self._embeddings_indices[:embeddings_indices. 
+ shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + if long_lora_offsets_tensor is not None: + self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( + long_lora_offsets_tensor) + else: + self._long_lora_indices.zero_() + self.indices_len[:] = indices_len + + def _update_prefill_metada(self, token_lora_tensor: torch.Tensor) -> None: + + (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length, token_nums, + no_lora) = compute_meta(token_lora_tensor) + + self._seq_start_locs[:b_seq_start_tensor.shape[0]].copy_( + b_seq_start_tensor) + self._seq_lengths[:seq_length_tensor.shape[0]].copy_(seq_length_tensor) + self._lora_indices_per_batch[:lora_indices_tensor.shape[0]].copy_( + lora_indices_tensor) + self.batch_size = batch_size + self.max_length = max_length + self.token_nums = token_nums + self.no_lora = no_lora + + def _apply_bias( + self, + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + lora_bias_stacked: Tuple[Optional[torch.Tensor], ...], + ): + """Applies bias to output + + Input shapes: + lora_bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = lora_bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + offset_left += slice + + return output.view_as(org_output) + + @property + def prefill_metadata( + self + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int]: + """ + This property provides a convenient way to access the necessary + metadata for prefill-related kernel 
computations. + 1. seq_start_locs: Tensor of sequence start positions. + 2. seq_lengths: Tensor of sequence lengths. + 3. lora_indices_per_batch: Tensor of lora indices, and an index of + -1 means no lora should be applied. + 4. batch_size: Batch size after clustering identical lora indices. + 5. max_length: The maximum sequence length in the batch. + 6. token_nums: The token numbers in the batch. + """ + return (self._seq_start_locs[:self.batch_size], + self._seq_lengths[:self.batch_size], + self._lora_indices_per_batch[:self.batch_size], + self.batch_size, self.max_length, self.token_nums) + + @property + def token_lora_indices(self) -> torch.Tensor: + """ + This property provides the lora indices corresponding to each token + in the batch. An index of -1 means no lora should be applied. + """ + token_lora_len = self.indices_len[0] + return self._token_lora_indices[:token_lora_len] + + @property + def sampler_indices(self) -> torch.Tensor: + """ + This property is used to access the lora indices specifically for + LogitsProcessorWithLoRA. + """ + sampler_indices_len = self.indices_len[1] + return self._sampler_indices[:sampler_indices_len] + + @property + def sampler_indices_padded(self) -> torch.Tensor: + """ + This property provides access to padded sampler indices. + """ + indices_padded_len = self.indices_len[2] + return self._sampler_indices_padded[:indices_padded_len] + + @property + def embeddings_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for lora embeddings, + specifically for VocabParallelEmbeddingWithLoRA. + """ + embeddings_indices_len = self.indices_len[3] + return self._embeddings_indices[:, :embeddings_indices_len] + + @property + def long_lora_indices(self) -> torch.Tensor: + """ + This property provides access to the indices used for long context + lora, specifically for LinearScalingRotaryEmbeddingWithLora. 
+ """ + long_lora_len = self.indices_len[4] + return self._long_lora_indices[:long_lora_len] + + def update_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + **kwargs): + + self._update_base_metadata(mapping, lora_index_to_id, max_loras, + vocab_size, extra_vocab_size, + long_lora_context) + if mapping.is_prefill: + # Update metadata required for prefill-related operators. + self._update_prefill_metada(self.token_lora_indices) + self.is_prefill = True + else: + self.is_prefill = False + + @abstractmethod + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, **kwargs) -> None: + """ + Performs GEMM for multiple slices of lora_a. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_expand(self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + offset_start: int = 0, + add_input=True, + **kwargs) -> None: + """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + lora_bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. 
+ x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_input (bool): Defaults to True. + + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_embedding(self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_input: bool = True, + **kwargs) -> None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + and this layer only requires the expand operation. + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_input (bool): Default to True. + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_linear(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, + **kwargs) -> None: + """ + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+lora_bias_stacked[i] + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. + scale (float): Scaling factor. + output_slices (Tuple[int, ...]): Every slice's size. 
+ buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. + """ + # TODO: implement it based on torch ops + raise NotImplementedError + + @abstractmethod + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None, + **kwargs) -> None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ + # TODO: implement it based on torch ops + raise NotImplementedError diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py new file mode 100644 index 0000000000000..b2af29de129ce --- /dev/null +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -0,0 +1,358 @@ +""" +Based on: +Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., & Krishnamurthy, A. (2023). +Punica: Multi-Tenant LoRA Serving. +https://arxiv.org/abs/2310.18547 +""" + +from typing import Callable, Optional, Tuple, Union, final + +import torch + +from vllm.triton_utils import HAS_TRITON + +if HAS_TRITON: + from vllm.lora.ops.bgmv_expand import bgmv_expand + from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice + from vllm.lora.ops.bgmv_shrink import bgmv_shrink + from vllm.lora.ops.sgmv_expand import sgmv_expand + from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice + from vllm.lora.ops.sgmv_shrink import sgmv_shrink + +from .punica_base import PunicaWrapperBase + + +@final +class PunicaWrapperGPU(PunicaWrapperBase): + """ + PunicaWrapperGPU is designed to manage and provide metadata for the punica + kernel. 
The main function is to maintain the state information for + Multi-LoRA, and to provide the interface for the punica triton kernel. + """ + + def __init__(self, max_num_batched_tokens: int, max_batches: int, + device: Union[torch.device, str], **kwargs): + PunicaWrapperBase.__init__(self, max_num_batched_tokens, max_batches, + device) + + def _shrink_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_shrink( + x, + w_t_all, + y, + *self.prefill_metadata, + scale, + ) + + def _shrink_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + scale: float, + ): + bgmv_shrink(x, w_t_all, y, self.token_lora_indices, scale) + + def _expand_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand( + x, + w_t_all, + y, + *self.prefill_metadata, + add_input, + ) + + def _expand_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + add_input: bool, + ): + bgmv_expand(x, w_t_all, y, self.token_lora_indices, add_input) + + def _expand_slice_prefill( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + #No LoRA request, so return directly + if self.no_lora: + return + sgmv_expand_slice( + x, + w_t_all, + y, + *self.prefill_metadata, + y_offset, + y_slice_size, + add_input, + ) + + def _expand_slice_decode( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + y_slice_size: Optional[int], + add_input: bool, + ): + bgmv_expand_slice(x, w_t_all, y, self.token_lora_indices, y_offset, + y_slice_size, add_input) + + def _apply_expand( + self, + y: torch.Tensor, + x: torch.Tensor, + w_t_all: torch.Tensor, + y_offset: Optional[int], + 
y_slice_size: Optional[int], + add_input: bool = True, + ): + """ + Perform the ` y[:,y_offset:y_offset+y_slice_size]+=x@w_t_all` + computation, which is suitable for the + GEMM of lora'b. + """ + + expand_slice_fun: Callable = (self._expand_slice_prefill + if self.is_prefill else + self._expand_slice_decode) + expand_slice_fun(y, x, w_t_all, y_offset, y_slice_size, add_input) + + def _apply_shrink(self, y: torch.Tensor, x: torch.Tensor, + w_t_all: torch.Tensor, scale: float): + """ + Perform the ` y+=x@w_t_all` computation, which is suitable for the + GEMM of lora'a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + shrink_fun: Callable = (self._shrink_prefill + if self.is_prefill else self._shrink_decode) + shrink_fun(y, x, w_t_all, scale) + y = y.view_as(y_org) + + def add_shrink(self, y: Union[Tuple[torch.Tensor, ...], torch.Tensor], + x: torch.Tensor, lora_a_stacked: Tuple[torch.Tensor, ...], + scale: float, **kwargs): + """ + Performs GEMM for multiple slices of lora_a. + When `is_prefill is` true, it indicates that it is currently the + prefill stage, and the `_shrink_prefill` function should be called. + Otherwise, it is the decode stage, and the _shrink_decode function + should be called. 
+ + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += (x @ lora_a_stacked[i]) * scale + + Args: + y (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Output tensors + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weights + scale (float): Scaling factor for the operation + """ + + x = x.view(-1, x.shape[-1]) + # TODO fuse these kernels + for slice_idx in range(len(lora_a_stacked)): + self._apply_shrink(y[slice_idx], x, lora_a_stacked[slice_idx], + scale) + + def add_expand(self, + y: torch.Tensor, + x: Union[Tuple[torch.Tensor, ...], torch.Tensor], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + output_slices: Tuple[int, ...], + offset_start: int = 0, + add_input=True, + **kwargs) -> None: + """ + Performs GEMM and bias addition for multiple slices of lora_b. + + Semantics: + for i in range(len(lora_b_stacked)): + slice = output_slices[i] + y[:, offset:offset+slice] += x[i] @ lora_b_stacked[i] + + lora_bias_stacked[i] + offset += slice + + Args: + y (torch.Tensor): Output tensor. + x (Union[Tuple[torch.Tensor, ...], torch.Tensor]): Input tensors + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): + bias's weight + output_slices (Tuple[int, ...]): Every slice's size + add_input (bool): Defaults to True. 
+ """ + y_org = y + y = y.view(-1, y.shape[-1]) + offset_left = offset_start + if lora_bias_stacked is not None: + self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) + for slice_idx in range(len(lora_b_stacked)): + self._apply_expand( + y, + x[slice_idx], + lora_b_stacked[slice_idx], + offset_left, + output_slices[slice_idx], + add_input=add_input, + ) + offset_left += output_slices[slice_idx] + y = y.view_as(y_org) + + def add_lora_embedding(self, + y: torch.Tensor, + x: torch.Tensor, + lora_b_stacked: torch.Tensor, + add_input: bool = True, + **kwargs) -> None: + """ + Applies lora specifically for VocabParallelEmbeddingWithLoRA. + + Semantics: + y += x @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_b_stacked (torch.Tensor): lora_b's weights. + add_input (bool): Default to True. + """ + + # Embedding layer only need expand op + expand_fun: Callable = (self._expand_prefill + if self.is_prefill else self._expand_decode) + expand_fun(y, x, lora_b_stacked, add_input) + + def add_lora_linear(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: Tuple[torch.Tensor, ...], + lora_b_stacked: Tuple[torch.Tensor, ...], + lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]], + scale: float, + output_slices: Tuple[int, ...], + *, + buffer: Optional[Tuple[torch.Tensor, ...]] = None, + **kwargs) -> None: + """ + Applicable to linear-related lora. + + Semantics: + for i in range(len(lora_a_stacked)): + y[i] += ( + x[i].unsqueeze(0) + @ lora_a_stacked[indices[i], layer_idx, :, :] + @ lora_b_stacked[indices[i], layer_idx, :, :] + * scale + ).squeeze(0)+lora_bias_stacked[i] + + Args: + y (torch.Tensor): Output tensor. Will be changed in-place. + x (torch.Tensor): Input tensor + lora_a_stacked (Tuple[torch.Tensor, ...]): lora_a's weight. + lora_b_stacked (Tuple[torch.Tensor, ...]): lora_b's weight. + lora_bias_stacked (Optional[Tuple[torch.Tensor, ...]]): lora's bias. 
+ scale (float): Scaling factor. + output_slices (Tuple[int, ...]): Every slice's size. + buffer (Optional[Tuple[torch.Tensor, ...]]): Defaults to None. + """ + + assert len(lora_a_stacked) == len(lora_b_stacked) == len(output_slices) + if lora_bias_stacked is not None: + assert len(lora_bias_stacked) == len(output_slices) + y = self._apply_bias(self.token_lora_indices, y, output_slices, + lora_bias_stacked) + + if buffer is None: + r = lora_b_stacked[0].size(-1) + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = tuple( + torch.zeros( + (x.size(0), r), dtype=torch.float32, device=x.device) + for _ in range(len(output_slices))) + self.add_shrink(buffer, x, lora_a_stacked, scale, **kwargs) + self.add_expand(y, + buffer, + lora_b_stacked, + None, + output_slices, + add_input=True, + **kwargs) + + def add_lora_logits(self, + y: torch.Tensor, + x: torch.Tensor, + lora_a_stacked: torch.Tensor, + lora_b_stacked: torch.Tensor, + scale, + *, + buffer: Optional[torch.Tensor] = None, + **kwargs) -> None: + """ + Applies lora specifically for LogitsProcessorWithLoRA. + + Semantics: + buffer = (x @ lora_a_stacked) * scale + y += buffer @ lora_b_stacked + + Args: + y (torch.Tensor): Output tensor. + x (torch.Tensor): Input tensor. + lora_a_stacked (torch.Tensor): lora_a's weights. + lora_b_stacked (torch.Tensor):lora_b's weights. + scale (float): Scaling factor. + buffer (Optional[torch.Tensor]):Default to None. + """ + y_org = y + y = y.view(-1, y.shape[-1]) + x = x.view(-1, x.shape[-1]) + r = lora_b_stacked.size(-1) + if buffer is None: + # We set the buffer to be float32 by default ,refer to: + # https://github.com/triton-lang/triton/issues/1387 + buffer = torch.zeros((x.size(0), r), + dtype=torch.float32, + device=x.device) + # LogitsProcessorWithLoRA always using bgmv. 
+ bgmv_shrink(x, lora_a_stacked, buffer, self.sampler_indices, scale) + bgmv_expand(buffer, + lora_b_stacked, + y, + self.sampler_indices, + add_inputs=True) + y = y.view_as(y_org) diff --git a/vllm/lora/punica_wrapper/punica_selector.py b/vllm/lora/punica_wrapper/punica_selector.py new file mode 100644 index 0000000000000..df6c1bdc7dd71 --- /dev/null +++ b/vllm/lora/punica_wrapper/punica_selector.py @@ -0,0 +1,14 @@ +from vllm.platforms import current_platform +from vllm.utils import print_info_once + +from .punica_base import PunicaWrapperBase + + +def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase: + if current_platform.is_cuda_alike(): + # Lazy import to avoid ImportError + from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU + print_info_once("Using PunicaWrapperGPU.") + return PunicaWrapperGPU(*args, **kwargs) + else: + raise NotImplementedError diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py new file mode 100644 index 0000000000000..7360c8c09e3ac --- /dev/null +++ b/vllm/lora/punica_wrapper/utils.py @@ -0,0 +1,159 @@ +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +import torch + +if TYPE_CHECKING: + # avoid circuit import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext + + +def compute_meta( + token_lora_tensor: torch.Tensor +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]: + """ + Get the information required for the sgmv kernel. With the features: + 1. If consecutive requests in the batch use the same LoRA, this function + will combine them into a single request, improving sgmv kernel inference + performance. + 2. At the beginning of each prefill stage inference, recalculations are + needed based on the input, but only once. 
+ """ + + lora_indices_tensor, seq_length_tensor = torch.unique_consecutive( + token_lora_tensor, return_counts=True) + cum_result = torch.cumsum(seq_length_tensor, dim=0) + b_seq_start_tensor = torch.zeros_like(seq_length_tensor) + b_seq_start_tensor[1:].copy_(cum_result[:-1]) + max_length = seq_length_tensor.max().item() + token_nums = seq_length_tensor.sum().item() + batch_size = lora_indices_tensor.size(0) + no_lora = False + # -1 means no lora should be applied. Use `no_lora` to determine whether + # the current step requires LoRA. If LoRA is not needed, the prefill stage + # does not need to launch the triton kernel, which can improve performance + if batch_size == 1 and lora_indices_tensor == -1: + no_lora = True + return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor, + batch_size, max_length, token_nums, no_lora) + + +# TODO see if this can be vectorized +def convert_mapping( + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + device: torch.device, + long_lora_context: Optional["LongContextLoRAContext"] = None, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, + Optional[torch.Tensor], List[int]]: + """Converts LoRAMapping to index tensors. + + Args: + mapping: LoRAMapping mapping rows in a batch to LoRA ids. + lora_index_to_id: List mapping LoRA ids to LoRA indices. + max_loras: Maximum number of LoRAs. + vocab_size: Model vocab size. + extra_vocab_size: Extra vocab size each LoRA can have. + long_lora_context: Passed if there are long context lora in a batch. + + Returns: + A tuple of tensors: + base_indices: Tensor of shape [batch_size] mapping batch rows to + LoRA indices. + sampler_indices: Tensor of shape [batch_size] mapping requests to + LoRA indices for sampler. For generation, this will be the + same as base_indicies. For prefill, this will map requests + to LoRA indices. 
+ sampler_indices_padded: Tensor of shape [batch_size] mapping + requests to LoRA indices for sampler with padding. + Same as sampler_indicies, but -1 is replaced with + max_loras. + embeddings_indices: Tensor of shape [2, batch_size] mapping + requests to embedding indices. First row is for embeddings + added by the LoRAs, second row is for the LoRA.lora_a + embeddings. + long_lora_indices: Tensor of shape [batch_size] mapping + requests to RoPE offsets and rot dims for long LoRAs. + None if long context lora doesn't exist. + indices_len: List of lengths of the above tensors. It contains + (base_indices, sampler_indices, sampler_indices_padded, + embeddings_indices, long_lora_indices). + """ + index_mapping_indices: List[int] = list(mapping.index_mapping).copy() + embedding_indices = index_mapping_indices.copy() + lora_indices = index_mapping_indices.copy() + long_lora_offsets: Optional[torch.Tensor] = None + if long_lora_context: + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device=device, + dtype=torch.long) + prompt_mapping: List[int] = [ + lora_index_to_id.index(x) if x > 0 else -1 + for x in mapping.prompt_mapping + ] + lora_idx = None + for i in range(len(index_mapping_indices)): + # TODO index can be slow. 
optimize + lora_idx = (lora_index_to_id.index(index_mapping_indices[i]) + if index_mapping_indices[i] > 0 else -1) + embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 + lora_indices[i] = lora_idx + if long_lora_context: + assert long_lora_offsets is not None + lora_offset: int = long_lora_context.offsets_by_lora_id.get( + index_mapping_indices[i], 0) + long_lora_offsets[i] = lora_offset + + indices_list: List[Union[List[int], torch.Tensor]] = [ + index_mapping_indices, + lora_indices, + embedding_indices, + ] + if long_lora_context: + assert long_lora_offsets is not None + indices_list.append(long_lora_offsets) + indices = torch.tensor(indices_list, dtype=torch.long, device=device) + prompt_mapping_tensor = torch.tensor(prompt_mapping, + dtype=torch.long, + device=device) + embeddings_indices = torch.stack([ + indices[2] * extra_vocab_size, + indices[2] * (vocab_size + extra_vocab_size), + ]) + embeddings_indices[embeddings_indices == -1] = max_loras - 1 + base_indices = indices[1] + sampler_indices = prompt_mapping_tensor + sampler_indices_padded = sampler_indices.clone() + sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 + sampler_indices_padded = torch.arange( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( + sampler_indices_padded * len(sampler_indices_padded)) + long_lora_indices = None + long_lora_indices_len: Optional[int] = None + if long_lora_context: + long_lora_indices = indices[3] + long_lora_indices_len = long_lora_indices.shape[-1] + # Contain length of indices tensors. Used to index into each tensor. 
+ indices_len = [ + base_indices.shape[-1], + sampler_indices.shape[-1], + sampler_indices_padded.shape[-1], + embeddings_indices.shape[-1], + ] + if long_lora_indices_len is not None: + indices_len.append(long_lora_indices_len) + else: + # If long_lora doesn't exist,append None + indices_len.append(None) + + return ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_indices, + indices_len, + ) diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index d7b67425fcbc0..e631aec928ec5 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,14 +1,96 @@ -from typing import Optional +from __future__ import annotations -from vllm.logits_process import LogitsProcessor -from vllm.sampling_params import GuidedDecodingParams +from typing import TYPE_CHECKING + +from vllm.logger import init_logger +from vllm.platforms import CpuArchEnum, current_platform + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + from vllm.config import ModelConfig + from vllm.logits_process import LogitsProcessor + from vllm.sampling_params import GuidedDecodingParams + +logger = init_logger(__name__) + + +def has_xgrammar_unsupported_json_features(schema: dict) -> bool: + """Check if JSON schema contains features unsupported by xgrammar.""" + + def check_object(obj: dict) -> bool: + if not isinstance(obj, dict): + return False + + # Check for pattern restrictions + if "pattern" in obj: + return True + + # Check for numeric ranges + if obj.get("type") in ("integer", "number") and any( + key in obj for key in [ + "minimum", "maximum", "exclusiveMinimum", + "exclusiveMaximum", "multipleOf" + ]): + return True + + # Recursively check all nested objects and arrays + for value in obj.values(): + if isinstance(value, dict): + if check_object(value): + return True + elif isinstance(value, list): + for item in value: + if 
isinstance(item, dict) and check_object(item): + return True + + return False + + return check_object(schema) + + +def maybe_backend_fallback( + guided_params: GuidedDecodingParams) -> GuidedDecodingParams: + # lm-format-enforce doesn't support grammar, fallback to xgrammar + if (guided_params.backend == "lm-format-enforcer" + and guided_params.grammar is not None): + logger.warning( + "lm-format-enforcer does not support grammar guided decoding. " + "Falling back to use xgrammar instead.") + guided_params.backend = "xgrammar" + + if guided_params.backend == "xgrammar": + # xgrammar only has x86 wheels for linux, fallback to outlines + if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: + logger.warning("xgrammar is only supported on x86 CPUs. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + + # xgrammar doesn't support regex or choice, fallback to outlines + if guided_params.regex is not None or guided_params.choice is not None: + logger.warning( + "xgrammar only supports json or grammar guided decoding. " + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + + # xgrammar doesn't support some JSON schema features + elif (guided_params.json is not None + and has_xgrammar_unsupported_json_features(guided_params.json)): + logger.warning( + "xgrammar does not support advanced JSON schema features like " + "patterns or numeric ranges. 
" + "Falling back to use outlines instead.") + guided_params.backend = "outlines" + + return guided_params async def get_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer) -> Optional[LogitsProcessor]: + guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer, + model_config: ModelConfig) -> LogitsProcessor | None: + guided_params = maybe_backend_fallback(guided_params) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend == 'outlines' or guided_params.grammar: + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_outlines_guided_decoding_logits_processor) @@ -19,17 +101,23 @@ async def get_guided_decoding_logits_processor( get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) + if guided_params.backend == 'xgrammar': + from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa + get_local_xgrammar_guided_decoding_logits_processor) + return get_local_xgrammar_guided_decoding_logits_processor( + guided_params, tokenizer, model_config) raise ValueError( f"Unknown guided decoding backend '{guided_params.backend}'. 
" - "Must be one of 'outlines, 'lm-format-enforcer'") + "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'") def get_local_guided_decoding_logits_processor( - guided_params: GuidedDecodingParams, - tokenizer) -> Optional[LogitsProcessor]: + guided_params: GuidedDecodingParams, tokenizer: PreTrainedTokenizer, + model_config: ModelConfig) -> LogitsProcessor | None: + guided_params = maybe_backend_fallback(guided_params) # CFG grammar not supported by LMFE, so we use outlines instead - if guided_params.backend == 'outlines' or guided_params.grammar: + if guided_params.backend == 'outlines': # NOTE: lazy import outlines to avoid https://github.com/vllm-project/vllm/issues/4193 from vllm.model_executor.guided_decoding.outlines_decoding import ( # noqa get_local_outlines_guided_decoding_logits_processor) @@ -40,7 +128,12 @@ def get_local_guided_decoding_logits_processor( get_local_lm_format_enforcer_guided_decoding_logits_processor) return get_local_lm_format_enforcer_guided_decoding_logits_processor( guided_params, tokenizer) + if guided_params.backend == 'xgrammar': + from vllm.model_executor.guided_decoding.xgrammar_decoding import ( # noqa + get_local_xgrammar_guided_decoding_logits_processor) + return get_local_xgrammar_guided_decoding_logits_processor( + guided_params, tokenizer, model_config) raise ValueError( f"Unknown guided decoding backend '{guided_params.backend}'. 
" - "Must be one of 'outlines, 'lm-format-enforcer'") + "Must be one of 'outlines, 'lm-format-enforcer', 'xgrammar'") diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py new file mode 100644 index 0000000000000..b59a2269d2cd5 --- /dev/null +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -0,0 +1,266 @@ +# noqa: UP007 +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Any, NamedTuple + +import torch +from transformers import PreTrainedTokenizerFast + +try: + import xgrammar as xgr + from xgrammar.base import _core as xgr_core +except ImportError: + pass + +from vllm.model_executor.guided_decoding.xgrammar_utils import ( + convert_lark_to_gbnf, grammar_is_likely_lark) + +if TYPE_CHECKING: + from transformers import PreTrainedTokenizer + + from vllm.config import ModelConfig + from vllm.sampling_params import GuidedDecodingParams + + +# TODO: passing batch size to max threads here +def get_local_xgrammar_guided_decoding_logits_processor( + guided_params: GuidedDecodingParams, + tokenizer: PreTrainedTokenizer, + model_config: ModelConfig, + max_threads: int = 8): + config = GrammarConfig.from_guided_params(guided_params=guided_params, + model_config=model_config, + tokenizer=tokenizer, + max_threads=max_threads) + return XGrammarLogitsProcessor(config) + + +class TokenizerData(NamedTuple): + """Immutable container for cached tokenizer data.""" + encoded_vocab: list[str] + stop_token_ids: list[int] | None + backend_str: str + + +class TokenizerDataCache: + """Cache manager for tokenizer data to avoid repeated processing.""" + _cache: dict[int, TokenizerData] = {} + + @classmethod + def get_tokenizer_data(cls, + tokenizer: PreTrainedTokenizer) -> TokenizerData: + tokenizer_hash = hash(tokenizer) + + if tokenizer_hash not in cls._cache: + # Vendored from xgrammar logic since we cannot pickle the 
tokenizer + # https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 # noqa: E501 + try: + encoded_vocab = [ + token for token, _ in sorted(tokenizer.get_vocab().items(), + key=lambda x: x[1]) + ] + except AttributeError as e: + raise ValueError( + f"Cannot get the vocabulary of the tokenizer " + f"{type(tokenizer)}. The tokenizer should have a " + "get_vocab method.") from e + + stop_token_ids = None + backend_str = xgr.VocabType.RAW + if isinstance(tokenizer, PreTrainedTokenizerFast): + backend_str = tokenizer.backend_tokenizer.to_str() + if stop_token_ids is None and hasattr( + tokenizer, + "eos_token_id") and tokenizer.eos_token_id is not None: + stop_token_ids = [tokenizer.eos_token_id] + + cls._cache[tokenizer_hash] = TokenizerData( + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str) + + return cls._cache[tokenizer_hash] + + +class GrammarCompilerCache: + """ + Cache for GrammarCompiler instances based on tokenizer. + + This cache reduces the overhead of creating new compiler instances when + using the same tokenizer configuration. 
+ """ + _cache: dict[str, xgr.GrammarCompiler] = {} + + @classmethod + def get_compiler(cls, config: GrammarConfig) -> xgr.GrammarCompiler: + cache_key = str(config.tokenizer_hash) + + if cache_key not in cls._cache: + assert config.encoded_vocab is not None + tokenizer_info = xgr.TokenizerInfo._create_from_handle( + xgr_core.TokenizerInfo.from_huggingface( + config.encoded_vocab, config.backend_str, + config.vocab_size, config.stop_token_ids)) + cls._cache[cache_key] = xgr.GrammarCompiler( + tokenizer_info, max_threads=config.max_threads) + + return cls._cache[cache_key] + + +@dataclass +class GrammarConfig: + """Serializable configuration for grammar compilation""" + tokenizer_hash: int + vocab_size: int + json_str: str | None = None + grammar_str: str | None = None + json_object: bool | None = None + max_threads: int = 8 + # Only populated if tokenizer_hash not in cache + encoded_vocab: list[str] | None = None + stop_token_ids: list[int] | None = None + backend_str: str | None = None + + @classmethod + def from_guided_params(cls, + guided_params: GuidedDecodingParams, + model_config: ModelConfig, + tokenizer: PreTrainedTokenizer, + max_threads: int = 8) -> GrammarConfig: + + tokenizer_hash = hash(tokenizer) + # Only get tokenizer data if not already cached + if tokenizer_hash in TokenizerDataCache._cache: + encoded_vocab = None + stop_token_ids = None + backend_str = None + else: + tokenizer_data = TokenizerDataCache.get_tokenizer_data(tokenizer) + encoded_vocab = tokenizer_data.encoded_vocab + stop_token_ids = tokenizer_data.stop_token_ids + backend_str = tokenizer_data.backend_str + + if guided_params.json: + if not isinstance(guided_params.json, str): + json_str = json.dumps(guided_params.json) + else: + json_str = guided_params.json + return cls(json_str=json_str, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + 
max_threads=max_threads) + elif guided_params.grammar: + # XGrammar only supports GBNF grammars, so we must convert Lark + if grammar_is_likely_lark(guided_params.grammar): + try: + grammar_str = convert_lark_to_gbnf(guided_params.grammar) + except ValueError as e: + raise ValueError( + "Failed to convert the grammar from Lark to GBNF. " + "Please either use GBNF grammar directly or specify" + " --guided-decoding-backend=outlines.\n" + f"Conversion error: {str(e)}") from e + else: + grammar_str = guided_params.grammar + return cls(grammar_str=grammar_str, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + max_threads=max_threads) + elif guided_params.json_object: + return cls(json_object=True, + vocab_size=model_config.hf_config.vocab_size, + encoded_vocab=encoded_vocab, + stop_token_ids=stop_token_ids, + backend_str=backend_str, + tokenizer_hash=tokenizer_hash, + max_threads=max_threads) + else: + raise ValueError( + "Currently only support JSON and EBNF grammar mode for xgrammar" + ) + + +@dataclass +class XGrammarLogitsProcessor: + """Wrapper class to support pickle protocol""" + config: GrammarConfig + + ctx: xgr.CompiledGrammar | None = None + token_bitmask: torch.Tensor = None # type: ignore[assignment] + matchers: list[xgr.GrammarMatcher] = field(default_factory=list) + batch_size: int = field(default=1) + prefilled: bool = field(default=False) + + def __getstate__(self) -> dict[str, Any]: + return {'config': self.config} + + def __setstate__(self, state: dict[str, Any]): + self.config = state['config'] + + self.ctx = None + self.matchers = [] + self.batch_size = 1 + self.token_bitmask = None # type: ignore[assignment] + self.prefilled = False + + def _ensure_ctx(self): + """Lazily initialize the processor in the worker process""" + if self.ctx is None: + compiler = GrammarCompilerCache.get_compiler(self.config) + if self.config.json_str 
is not None: + self.ctx = compiler.compile_json_schema(self.config.json_str) + elif self.config.grammar_str is not None: + self.ctx = compiler.compile_grammar(self.config.grammar_str) + elif self.config.json_object: + self.ctx = compiler.compile_builtin_json_grammar() + else: + raise ValueError( + "Invalid configuration for xgrammar logits processor") + + def __call__(self, input_ids: list[int], + scores: torch.Tensor) -> torch.Tensor: + if self.ctx is None: + self._ensure_ctx() + + if len(self.matchers) == 0: + self.matchers = [ + xgr.GrammarMatcher(self.ctx) for _ in range(self.batch_size) + ] + self.token_bitmask = xgr.allocate_token_bitmask( + self.batch_size, self.config.vocab_size) + + if not self.prefilled: + # Have not sampled a token yet + self.prefilled = True + else: + for i, matcher in enumerate(self.matchers): + if not matcher.is_terminated(): + sampled_token = input_ids[-1] + assert self.matchers[i].accept_token(sampled_token) + + for i, matcher in enumerate(self.matchers): + if not matcher.is_terminated(): + # @ubospica: ideally, fill_next_token_bitmask should be + # parallelized with model decoding + # See https://github.com/vllm-project/vllm/pull/10785/files#r1864278303 + matcher.fill_next_token_bitmask(self.token_bitmask, i) + + # token_bitmask is a CPU tensor for use with accept_token and + # fill_next_token_bitmask so we move it to the device of scores + device_type = scores.device.type + if device_type != "cuda": + scores = scores.to("cpu") + xgr.apply_token_bitmask_inplace(scores, + self.token_bitmask.to(scores.device)) + if device_type != "cuda": + scores = scores.to(device_type) + + return scores diff --git a/vllm/model_executor/guided_decoding/xgrammar_utils.py b/vllm/model_executor/guided_decoding/xgrammar_utils.py new file mode 100644 index 0000000000000..12b42245f4e3d --- /dev/null +++ b/vllm/model_executor/guided_decoding/xgrammar_utils.py @@ -0,0 +1,162 @@ +import re + + +def grammar_is_likely_lark(grammar_str: str) -> bool: + """ + 
Check if grammar appears to use Lark syntax. + + Args: + grammar_str: Input grammar string + + Returns: + bool: True if grammar appears to be in Lark format, False otherwise + + Examples: + >>> grammar_is_likely_lark("rule: 'abc'") + True + >>> grammar_is_likely_lark("rule ::= 'abc'") + False + """ + if not grammar_str or not isinstance(grammar_str, str): + return False + + for line in grammar_str.split('\n'): + # Remove both comment styles + line = re.sub(r'(#|//).*$', '', line).strip() + if not line: + continue + + # Look for Lark-style rule definitions + if ':' in line and '::=' not in line: + return True + + # Look for Lark-specific features + if any(pattern in line for pattern in ['?start:', '|', '~']): + return True + + return False + + +def convert_lark_to_gbnf(grammar_str: str) -> str: + """ + Convert a Lark grammar string to GBNF format. + + GBNF reference: + https://github.com/ggerganov/llama.cpp/blob/master/grammars/README.md + Lark grammar reference: + https://lark-parser.readthedocs.io/en/latest/grammar.html + + Args: + grammar_str: Input grammar in Lark format + + Returns: + str: Converted grammar in GBNF format + + Examples: + >>> print(convert_lark_to_gbnf("rule: 'hello'")) + root ::= rule + rule ::= "hello" + """ + if not isinstance(grammar_str, str): + raise ValueError(f"Grammar must be a string, got {type(grammar_str)}") + if not grammar_str.strip(): + raise ValueError("Grammar string cannot be empty") + + defined_rules = set() + referenced_rules = set() + output_lines = [] + + def clean_line(line: str) -> str: + """Remove comments and whitespace from line.""" + return re.sub(r'(#|//).*$', '', line).strip() + + def check_quotes(text: str, rule_name: str, line_num: int) -> None: + """Validate quote matching in text.""" + if text.count("'") % 2 != 0 or text.count('"') % 2 != 0: + raise ValueError( + f"Mismatched quotes in {rule_name} on line {line_num}") + + def extract_references(text: str) -> set: + """Extract rule references from text.""" + # 
Remove quoted strings and special characters + text = re.sub(r'"[^"]*"', '', text) + text = re.sub(r'[+*?()|\[\]{}]', ' ', text) + return set(re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', text)) + + # First pass: Find root rule and validate rule definitions + lines = [clean_line(line) for line in grammar_str.split('\n')] + first_rule = None + + for line_num, line in enumerate(lines, 1): + if not line or line.startswith('|'): + continue + + if ':' in line: + try: + name = line.split(':', 1)[0].strip().strip('?') + defined_rules.add(name) + if first_rule is None: + first_rule = name + if name == 'start': + first_rule = 'start' + except IndexError as e: + raise ValueError(f"Invalid rule format on line {line_num}. " + "Expected 'rule_name: definition'") from e + + if not defined_rules: + raise ValueError("No valid rules found in grammar") + + # Add root rule + output_lines.append(f"root ::= {first_rule}") + + # Second pass: Process rule definitions and alternatives + current_rule = None + current_definition = [] + + for line_num, line in enumerate(lines, 1): + if not line: + continue + + try: + if ':' in line and not line.startswith('|'): + # Save previous rule if exists + if current_rule: + output_lines.append( + f"{current_rule} ::= {' | '.join(current_definition)}") + + # Process new rule + name, definition = line.split(':', 1) + current_rule = name.strip().strip('?') + + check_quotes(definition, f"rule '{current_rule}'", line_num) + definition = re.sub(r"'([^']*)'", r'"\1"', definition) + referenced_rules.update(extract_references(definition)) + current_definition = [definition.strip()] + + elif line.startswith('|'): + if not current_rule: + raise ValueError(f"Alternative '|' on line {line_num} " + "without a preceding rule definition") + + alt_def = line[1:].strip() + check_quotes(alt_def, f"alternative for rule '{current_rule}'", + line_num) + alt_def = re.sub(r"'([^']*)'", r'"\1"', alt_def) + referenced_rules.update(extract_references(alt_def)) + 
current_definition.append(alt_def) + + except ValueError as e: + raise ValueError(f"Error on line {line_num}: {str(e)}") from e + + # Add final rule if exists + if current_rule: + output_lines.append( + f"{current_rule} ::= {' | '.join(current_definition)}") + + # Validate all rules are defined + undefined_rules = referenced_rules - defined_rules - {'root'} + if undefined_rules: + raise ValueError("Referenced rules are not defined: " + f"{', '.join(sorted(undefined_rules))}") + + return '\n'.join(output_lines) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0e12bc5691538..fdc4c6305bd5e 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -6,7 +6,6 @@ import glob import inspect import itertools -import json import math import os import warnings @@ -18,7 +17,7 @@ import huggingface_hub import numpy as np import torch -from huggingface_hub import HfApi, hf_hub_download +from huggingface_hub import HfApi from torch import nn from transformers import AutoModelForCausalLM from transformers.utils import SAFE_WEIGHTS_INDEX_NAME @@ -102,12 +101,10 @@ def _initialize_model( vllm_config: VllmConfig, *, prefix: str = "", - architectures: Optional[list[str]] = None, ) -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config - model_class, _ = get_model_architecture(model_config, - architectures=architectures) + model_class, _ = get_model_architecture(model_config) signatures = inspect.signature(model_class.__init__) all_params = [param.name for param in signatures.parameters.values()] @@ -704,51 +701,9 @@ def __init__(self, load_config: LoadConfig): self.unsharded_weights_modules: List[str] = [] # Save the module names that are sharded by column. 
self.column_sharded_weights_modules: List[str] = [] - # we don't need to quantize the whole model, only the target modules - # that are specified in the adapter config file. If the adapter config - # file is not provided, we will quantize the default modules. - if (not load_config.model_loader_extra_config - or "qlora_adapter_name_or_path" - not in load_config.model_loader_extra_config): - self.target_modules = [] - return - - qlora_adapter = load_config.model_loader_extra_config[ - "qlora_adapter_name_or_path"] - - config_file_path = self._get_config_file(qlora_adapter) - - with open(config_file_path) as f: - config = json.load(f) - self.target_modules = config["target_modules"] - # TODO: target_modules could be either a list or a regex string. - # We need to handle both cases. - assert isinstance(self.target_modules, - list), "Unsupported target_modules: " - f"{self.target_modules}" - - def _get_config_file(self, qlora_adapter: str) -> str: - is_local = os.path.isdir(qlora_adapter) - config_file_path = None - if is_local: - for file in self.possible_config_file_names: - config_file_path = os.path.join(qlora_adapter, file) - if os.path.exists(config_file_path): - break - else: - hf_api = HfApi() - repo_files = hf_api.list_repo_files(repo_id=qlora_adapter) - for file in self.possible_config_file_names: - if file in repo_files: - config_file_path = hf_hub_download(repo_id=qlora_adapter, - filename=file) - break - - if not config_file_path: - raise ValueError( - f"Cannot find adapter config file in {qlora_adapter}") - - return config_file_path + # Store all module names (from transformers) that support + # BNB quantization. 
+ self.target_modules: List[str] = [] def _get_weight_files( self, @@ -1030,25 +985,16 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: inverse_stacked_mapping[packed] = [] inverse_stacked_mapping[packed].insert(idx, orig) - linear_module_lst = [] for name, module in model.named_modules(): if isinstance(module, (LinearBase, )): last_name = name.split(".")[-1] if sub_modules := inverse_stacked_mapping.get(last_name, []): # Map vllm's names to transformers' names. for sub_name in sub_modules: - linear_module_lst.append( + self.target_modules.append( name.replace(last_name, sub_name)) else: - linear_module_lst.append(name) - if self.target_modules: - # Update self.target_modules - self.target_modules = [ - qual_name for qual_name in linear_module_lst - if any(t in qual_name for t in self.target_modules) - ] - else: - self.target_modules = linear_module_lst + self.target_modules.append(name) assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" @@ -1120,7 +1066,14 @@ def _load_weights(self, model_config: ModelConfig, model_config.revision, pre_quant, load_8bit)) - model.load_weights(qweight_iterator) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights(qweight_iterator) + # Some models may have weights loading tracker unimplemented. + if loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError("Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") torch.cuda.empty_cache() @@ -1152,9 +1105,10 @@ def _load_weights(self, model_config: ModelConfig, shard_name, weight_name) break + # Models like Clip/Siglip may skip some layers in initialization, + # causing unused quant_param_name in state_dict. 
if quant_param_name not in param_dict: - raise ValueError( - f"Parameter {quant_param_name} not found in the model.") + continue if quant_param_name not in stacked_quant_state_dict: stacked_quant_state_dict[quant_param_name] = {} diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 864dd04e79921..cfb89e0f336bc 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -1,6 +1,6 @@ """Utilities for selecting and loading models.""" import contextlib -from typing import Optional, Tuple, Type +from typing import Tuple, Type import torch from torch import nn @@ -20,12 +20,8 @@ def set_default_torch_dtype(dtype: torch.dtype): def get_model_architecture( - model_config: ModelConfig, - *, - architectures: Optional[list[str]] = None, -) -> Tuple[Type[nn.Module], str]: - if architectures is None: - architectures = getattr(model_config.hf_config, "architectures", []) + model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: + architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. 
diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index fa6b95f5481ad..dd4b0c75cb84d 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -32,9 +32,8 @@ maybe_prefix, merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -451,7 +450,7 @@ def get_max_multimodal_tokens(ctx): def input_mapper_for_aria(ctx, data): - return MultiModalInputs(data) + return MultiModalKwargs(data) def input_processor(ctx, llm_inputs): diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 6af59697160a0..42a239cadac46 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,11 +4,10 @@ import torch import torch.nn as nn -import torch.nn.functional as F from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -22,8 +21,6 @@ repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -205,11 +202,8 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - # Detect attention implementation. 
- self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"BLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -220,41 +214,10 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() qkv_states, _ = self.qkv(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(bsz, tgt_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.projection(out) return attn_output, None diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index cd89519e95986..a5300dfd986f3 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -5,11 +5,10 @@ import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F from PIL import 
Image from transformers import CLIPVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -25,8 +24,6 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -235,11 +232,8 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - # Detect attention implementation. - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"CLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, @@ -250,42 +244,10 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - - query_states = query_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(bsz, tgt_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - 
query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(bsz, tgt_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output, None diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 5ca26d53a17e7..0398f0943a70a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -473,10 +473,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index b2557f67edbda..5db9786ea05b7 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -8,6 +8,7 @@ from torch import nn from torch.nn import LayerNorm +from vllm.attention.layer import MultiHeadAttention from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -78,6 +79,8 @@ def __init__( quant_config=quant_config, ) + self.attn = MultiHeadAttention(self.num_heads_per_rank, self.head_dim, + self.scale) self.output_dropout = torch.nn.Dropout(config.dropout_prob) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -105,24 +108,11 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: except ModuleNotFoundError: pass - B, L, 
_ = x.shape qkv, _ = self.query_key_value(x) # B, L, 3 * H * D q, k, v = qkv.chunk(3, dim=-1) - q = q.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - k = k.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - v = v.reshape(B, L, self.num_heads_per_rank, - self.head_dim).permute(0, 2, 1, 3) # B, H, L, D - - out = torch.nn.functional.scaled_dot_product_attention(q, - k, - v, - attn_mask=None, - dropout_p=0., - is_causal=False) - - output, _ = self.dense(out.transpose(1, 2).view(B, L, -1)) + + out = self.attn(q, k, v) + output, _ = self.dense(out) output = self.output_dropout(output) return output diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index bd2394e71c973..f9e0443b9a508 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -400,16 +400,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head.weight = self.model.embed_tokens.weight logit_scale = getattr(config, "logit_scale", 1.0) - if hasattr(config, "logits_scaling"): logit_scale /= config.logits_scaling + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=logit_scale) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 16192928beb1f..e430a158d869a 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -21,8 +21,8 @@ from torch import nn from transformers.models.idefics2.configuration_idefics2 import ( Idefics2Config, Idefics2VisionConfig) -from xformers import ops as xops +from vllm.attention.layer import 
MultiHeadAttention from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -141,35 +141,18 @@ def __init__( ) self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.is_causal = False + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: - batch_size, q_len, _ = hidden_states.size() qkv, _ = self.qkv_proj( hidden_states ) # batch_size, q_len, 3 * num_heads_per_partition * head_dim query_states, key_states, value_states = qkv.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - # see: https://facebookresearch.github.io/xformers/components/ops.html - out = xops.memory_efficient_attention_forward( - query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale, - ) - out = out.view(batch_size, q_len, -1) + out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 01a381381ccec..c3979eab905db 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -36,6 +36,11 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[T]: """ Returns multimodal embeddings generated from multimodal kwargs to be merged with text embeddings. 
+ + The output embeddings must be one of the following formats: + - A list or tuple of 2D tensors, where each tensor corresponds to + each input image. + - A single 3D tensor, with the batch dimension grouping the 2D tensors. """ ... diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index c4346fcb3bd2a..7ff68bd60e8ad 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -12,7 +12,7 @@ import torch.nn.functional as F from transformers import PretrainedConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -25,8 +25,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from .utils import get_vit_attn_backend - NORM2FN = { 'rms_norm': RMSNorm, 'layer_norm': nn.LayerNorm, @@ -183,10 +181,8 @@ def __init__( prefix=f"{prefix}.proj", ) - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"InternViT does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): if self.tp_size > 1: @@ -209,23 +205,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: if self.qk_normalization: q, k = self._apply_qk_norm(q, k) - q = q.view(B, N, self.num_heads_per_partition, self.head_dim) - k = k.view(B, N, self.num_heads_per_partition, self.head_dim) - v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(q, - k, - v, - 
scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - q, k, v = (x.transpose(1, 2) for x in (q, k, v)) - out = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(B, N, -1) + out = self.attn(q, k, v) out, _ = self.proj(out) return out diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 86aab38032450..42c769f79e202 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,7 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -52,12 +52,18 @@ class InternVLImagePixelInputs(TypedDict): Shape: `(batch_size * num_images * (1 + num_patches), num_channels, height, width)` """ + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + """ class InternVLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + data: NestedTensors + """ + A tensor of shape `(num_images, total_image_feature_size, hidden_size)` + or a list of tensors of shape `(total_image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. 
""" @@ -349,10 +355,32 @@ def input_processor( new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, num_patches) new_prompt_token_ids = tokenizer.encode(new_prompt) + img_context_token_id = tokenizer.encode(self.img_context_token, + add_special_tokens=False) + assert len(img_context_token_id) == 1, \ + (f"Invalid image token '{self.img_context_token}': A valid image " + f"token encodes to a single token ID, got {img_context_token_id}.") + img_context_token_id = img_context_token_id[0] + + # Get precise tracking of placeholder positions + token_idx = image_idx = 0 + placeholder_ranges = [] + while token_idx < len(new_prompt_token_ids): + if new_prompt_token_ids[token_idx] == img_context_token_id: + curr_image_featue_size = image_feature_sizes[image_idx] + placeholder_ranges.append( + PlaceholderRange(offset=token_idx, + length=curr_image_featue_size)) + image_idx += 1 + token_idx += curr_image_featue_size + else: + token_idx += 1 - return token_inputs(prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) + return token_inputs( + prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) def input_mapper( self, @@ -482,6 +510,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.mlp1 = self._init_mlp1(config) self.img_context_token_id = None + self.visual_token_mask = None self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -613,35 +642,54 @@ def _parse_and_validate_image_input( if not isinstance(pixel_values, (torch.Tensor, list)): raise ValueError("Incorrect type of pixel values. 
" f"Got type: {type(pixel_values)}") + + patches_per_image = [] + for request_pixel_values in pixel_values: + for image_pixel_values in request_pixel_values: + patches_per_image.append(image_pixel_values.shape[0]) # We need to flatten (B, N, P) to (B*N*P), # so we call flatten_bn twice. return InternVLImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( flatten_bn(flatten_bn(pixel_values), concat=True)), - ) + patches_per_image=patches_per_image) raise AssertionError("This line should be unreachable.") def _process_image_input( self, image_input: InternVLImageInputs, - ) -> torch.Tensor: + ) -> Tuple[torch.Tensor]: if image_input["type"] == "image_embeds": return image_input["data"] assert self.vision_model is not None + image_embeds = self.extract_feature(image_input["data"]) + patches_per_image = image_input["patches_per_image"] + if len(patches_per_image) == 1: + image_embeds = image_embeds.unsqueeze(0) + return image_embeds + + # NOTE: Image embeddings are split into separate tensors for each image + # by the size of each embedding. 
+ feature_size = image_embeds.shape[1] + image_embeds = image_embeds.view(-1, + self.config.text_config.hidden_size) + image_feature_sizes = [ + num_patches * feature_size for num_patches in patches_per_image + ] + image_embeds = image_embeds.split(image_feature_sizes) return image_embeds - def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: + def _set_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: if self.is_mono: - visual_token_mask = ( + self.visual_token_mask = ( input_ids == self.img_context_token_id).reshape(-1, 1) else: - visual_token_mask = None - return visual_token_mask + self.visual_token_mask = None def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -658,6 +706,7 @@ def get_input_embeddings( inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: assert self.img_context_token_id is not None + self._set_visual_token_mask(input_ids) inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, self.img_context_token_id) @@ -674,7 +723,6 @@ def forward( **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: - visual_token_mask = None if intermediate_tensors is not None: input_ids = None inputs_embeds = None @@ -695,15 +743,12 @@ def forward( "intermediate_tensors": intermediate_tensors, "inputs_embeds": inputs_embeds, } - if self.img_context_token_id is not None: - visual_token_mask = self._get_visual_token_mask(input_ids) - # We always overwrite it back to None after computing visual token - # mask so that this doesn't need to depend on encoder output - self.img_context_token_id = None - - if self.is_mono: - forward_kwargs.update({"visual_token_mask": visual_token_mask}) + # Only required if the model is mono-architecture + if self.visual_token_mask is not None: + forward_kwargs.update( + {"visual_token_mask": self.visual_token_mask}) + 
self.visual_token_mask = None hidden_states = self.language_model.model(**forward_kwargs) return hidden_states diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 099ca7e12b288..5d5e8ae1ee532 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import CacheConfig, VllmConfig +from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -25,8 +25,6 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, - _get_graph_batch_size) from .interfaces import HasInnerState, SupportsLoRA from .utils import maybe_prefix @@ -404,7 +402,7 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (_get_graph_batch_size( + max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index bc24a7d306afa..2208efdf10796 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -575,10 +575,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git 
a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index db7fa82ceb9b7..65c6bd07bfff0 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,37 +1,41 @@ from functools import cached_property +from types import MethodType from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) import torch import torch.nn as nn -from PIL import Image -from transformers import (CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, - PretrainedConfig, SiglipVisionConfig) +from PIL.Image import Image +from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, + PixtralVisionConfig, PretrainedConfig, + ProcessorMixin, SiglipVisionConfig) +from transformers.models.pixtral import PixtralProcessor from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.processing import (BaseMultiModalProcessor, + InputProcessingContext, + ModalityProcessingMetadata, + MultiModalProcessingMetadata, + PromptReplacement) from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, - dummy_seq_data_for_clip, get_max_clip_image_tokens, - input_processor_for_clip) + get_max_clip_image_tokens) from .interfaces import 
SupportsMultiModal, SupportsPP from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, - dummy_seq_data_for_pixtral_hf, - get_max_pixtral_hf_image_tokens, - input_processor_for_pixtral_hf) + get_max_pixtral_hf_image_tokens) from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip, get_max_siglip_image_tokens, - input_processor_for_siglip) + get_max_siglip_image_tokens) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -59,25 +63,32 @@ class LlavaImageEmbeddingInputs(TypedDict): LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs] -# TODO(xwjiang): Run benchmark and decide if TP. class LlavaMultiModalProjector(nn.Module): - def __init__(self, vision_hidden_size: int, text_hidden_size: int, - projector_hidden_act: str): + def __init__(self, + vision_hidden_size: int, + text_hidden_size: int, + projector_hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = ""): super().__init__() - self.linear_1 = nn.Linear(vision_hidden_size, - text_hidden_size, - bias=True) + self.linear_1 = ColumnParallelLinear(vision_hidden_size, + text_hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_1") self.act = get_act_fn(projector_hidden_act) - self.linear_2 = nn.Linear(text_hidden_size, - text_hidden_size, - bias=True) + self.linear_2 = RowParallelLinear(text_hidden_size, + text_hidden_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.linear_2") def forward(self, image_features: torch.Tensor) -> torch.Tensor: - hidden_states = self.linear_1(image_features) + hidden_states, _ = self.linear_1(image_features) hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) + hidden_states, _ = self.linear_2(hidden_states) return hidden_states @@ -104,102 +115,115 @@ def get_max_llava_image_tokens(ctx: InputContext): raise 
ValueError(f"Unexpected select feature strategy: {strategy}") -def dummy_data_for_llava(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): +def dummy_mm_kwargs_for_llava(ctx: InputProcessingContext, + mm_counts: Mapping[str, int]): hf_config = ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config num_images = mm_counts["image"] - image_feature_size = get_max_llava_image_tokens(ctx) - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - - mm_data = dummy_image_for_clip(vision_config, num_images) - return DummyData(seq_data, mm_data, ranges) + data = dummy_image_for_clip(vision_config, num_images) elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - - mm_data = dummy_image_for_siglip(vision_config, num_images) - return DummyData(seq_data, mm_data, ranges) + data = dummy_image_for_siglip(vision_config, num_images) elif isinstance(vision_config, PixtralVisionConfig): - seq_data, ranges = dummy_seq_data_for_pixtral_hf( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - - mm_data = dummy_image_for_pixtral_hf(vision_config, num_images) - return DummyData(seq_data, mm_data, ranges) + data = dummy_image_for_pixtral_hf(vision_config, num_images) + else: + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + hf_processor = ctx.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + hf_inputs = 
image_processor.preprocess(data['image'], return_tensors="pt") + is_pixtral = isinstance(hf_processor, PixtralProcessor) + return MultiModalKwargs( + **hf_inputs, + is_pixtral=torch.tensor(is_pixtral), + ) -def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - model_config = ctx.model_config +def create_metadata_for_llava( + ctx: InputProcessingContext) -> MultiModalProcessingMetadata: hf_config = ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config + image_token_id = hf_config.image_token_index + + def get_repl_count( + mm_items: list[Image], + hf_inputs: BatchFeature, + item_idx: int, + ) -> int: + return get_max_llava_image_tokens(ctx) + + return { + "image": + ModalityProcessingMetadata(prompt_repls=[ + PromptReplacement(target=[image_token_id], + repl_unit=[image_token_id], + repl_count=get_repl_count), + ]), + } - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - image_feature_size = get_max_llava_image_tokens(ctx) - elif is_list_of(image_data, Image.Image): - image_feature_size = [get_max_llava_image_tokens(ctx) - ] * len(image_data) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - 
image_feature_size_override=image_feature_size, - ) - elif isinstance(vision_config, PixtralVisionConfig): - # We ignore image_feature_size_override since we have non-uniform - # image sizes for Pixtral - return input_processor_for_pixtral_hf( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, +class LlavaProcessor(BaseMultiModalProcessor): + + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__( + ctx=ctx, + metadata=create_metadata_for_llava(ctx), ) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _patch_pixtral_processor(self, hf_processor: PixtralProcessor): + if getattr(hf_processor, "__is_patched__", False): + return # Already patched + + image_processor = hf_processor.image_processor # type: ignore + orig_preprocess = image_processor.preprocess + + def preprocess(__self, *args, **kwargs): + hf_inputs = orig_preprocess(*args, **kwargs) + hf_inputs["is_pixtral"] = torch.tensor(True) + return hf_inputs + + image_processor.preprocess = MethodType(preprocess, image_processor) + + hf_processor.__is_patched__ = True # type: ignore + + def _get_hf_processor(self) -> ProcessorMixin: + hf_processor = self.ctx.get_hf_processor() + + if isinstance(hf_processor, PixtralProcessor): + self._patch_pixtral_processor(hf_processor) + + return hf_processor + + def _get_dummy_mm_kwargs( + self, + mm_counts: Mapping[str, int], + ) -> MultiModalKwargs: + hf_config = self.ctx.get_hf_config(LlavaConfig) + vision_config = hf_config.vision_config + num_images = mm_counts["image"] + + if isinstance(vision_config, CLIPVisionConfig): + data = dummy_image_for_clip(vision_config, num_images) + elif isinstance(vision_config, SiglipVisionConfig): + data = dummy_image_for_siglip(vision_config, num_images) + elif isinstance(vision_config, PixtralVisionConfig): + data = dummy_image_for_pixtral_hf(vision_config, num_images) + else: + msg = f"Unsupported vision config: 
{type(vision_config)}" + raise NotImplementedError(msg) + + hf_processor = self._get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + hf_inputs = image_processor.preprocess(data['image'], + return_tensors="pt") + is_pixtral = isinstance(hf_processor, PixtralProcessor) + + return MultiModalKwargs( + **hf_inputs, + is_pixtral=torch.tensor(is_pixtral), + ) class LlavaLikeConfig(Protocol): @@ -282,10 +306,8 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava) +@MULTIMODAL_REGISTRY.register_processor(LlavaProcessor) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -325,7 +347,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, - projector_hidden_act=config.projector_hidden_act) + projector_hidden_act=config.projector_hidden_act, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "multi_modal_projector")) self.language_model = init_vllm_registered_model( vllm_config=vllm_config, @@ -356,38 +380,10 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: return data - def _validate_image_sizes(self, images: List[torch.Tensor], - sizes: List[torch.Tensor]) -> List[torch.Tensor]: - if not isinstance(sizes, list): - sizes = [sizes] - - total_images = sum(size.numel() // 2 for size in sizes) - if total_images != len(images): - raise ValueError("Mismatch in number of images. 
" - f"Expected {total_images}, got {len(images)}") - img_idx = 0 - for size in sizes: - # Flatten the size tensor to a list of (height, width) pairs - size = size.view(-1, 2).tolist() - for expected_h, expected_w in size: - if img_idx >= len(images): - raise ValueError("Ran out of images before sizes. " - f"{img_idx} >= {len(images)}") - img = images[img_idx] - if img.shape[-2:] != (expected_h, expected_w): - raise ValueError( - "Image size mismatch. Expected " - f"{(expected_h, expected_w)}, got {img.shape[-2:]}") - if img.shape[-3] != 3: - raise ValueError("Image channel mismatch. Expected 3, " - f"got {img.shape[-3]}") - img_idx += 1 - return images - def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) - image_sizes = kwargs.pop("image_sizes", None) + is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False])) image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None and image_embeds is None: @@ -398,9 +394,8 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. 
" f"Got type: {type(pixel_values)}") - # Case for models like PixtralHF that have dynamic image sizes - # so we need to produce a list of tensors - if image_sizes is not None: + assert isinstance(is_pixtral, torch.Tensor) + if is_pixtral.any(): images = pixel_values def flatten_to_3d_tensors(item): @@ -423,7 +418,7 @@ def flatten_to_3d_tensors(item): return LlavaImagePixelInputs( type="pixel_values", - data=self._validate_image_sizes(images, image_sizes), + data=images, ) return LlavaImagePixelInputs( @@ -591,3 +586,28 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + +class MantisProcessor(LlavaProcessor): + + def _get_hf_processor(self) -> ProcessorMixin: + try: + from mantis.models.mllava import MLlavaProcessor + except ModuleNotFoundError as exc: + raise ModuleNotFoundError( + "You need to `pip install " + "git+https://github.com/TIGER-AI-Lab/Mantis.git` " + "to use this model") from exc + + processor = MLlavaProcessor.from_pretrained( + self.ctx.model_config.tokenizer) + assert isinstance(processor, ProcessorMixin) + return processor + + +# To use this model, please use +# `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(MantisProcessor) +class MantisForConditionalGeneration(LlavaForConditionalGeneration): + pass diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac0d265a961f0..b32032e411b0a 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -6,7 +6,7 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import _BATCH_SIZES_TO_CAPTURE, CacheConfig, VllmConfig from vllm.distributed import 
get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -23,8 +23,6 @@ MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, - _get_graph_batch_size) from .utils import maybe_prefix @@ -187,7 +185,7 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - max_batch_size = (_get_graph_batch_size( + max_batch_size = (VllmConfig.get_graph_batch_size( self.scheduler_config.max_num_seqs) if self.scheduler_config else max(_BATCH_SIZES_TO_CAPTURE) + 2) self.mamba_cache = MambaCacheManager( diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6254d26c7060d..5a0f202364f26 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -534,6 +534,16 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 98caa6857e211..a328b5a2aeea7 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -13,6 +13,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata +from vllm.attention.layer import MultiHeadAttention from vllm.compilation.decorators import support_torch_compile from 
vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, @@ -36,23 +37,25 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer -from vllm.platforms import _Backend from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) from vllm.transformers_utils.processor import get_processor from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, - is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) # TODO: hard-coded for now. Consider making it configurable. VIT_LAYERS = [-2, -9] NUM_PREFIX_TOKENS = 1 ADDITIONAL_VOCAB_SIZE = 128 +DEFAULT_IMAGE_PATCH_TOKEN_ID = 152066 +DEFAULT_IM_START_TOKEN_ID = 152067 +DEFAULT_IM_END_TOKEN_ID = 152064 +DEFAULT_IM_COL_TOKEN_ID = 152065 class MolmoImageInputs(TypedDict): @@ -76,6 +79,11 @@ class MolmoImageInputs(TypedDict): `(batch_size, num_crops, num_patch)` """ + image_start_end: Tuple[int, int] + """Starting and ending index of placeholder + tokens + """ + @dataclass class VisionBackboneConfig: @@ -188,13 +196,11 @@ def __init__( quant_config=quant_config, ) - # Detect attention implementation. 
- self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) - if self.attn_backend not in { - _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS - }: - raise RuntimeError( - f"Molmo does not support {self.attn_backend} backend now.") + self.scale = self.head_dim**-0.5 + self.attn = MultiHeadAttention(self.num_heads, + self.head_dim, + self.scale, + num_kv_heads=self.num_kv_heads) def forward(self, inputs_q: torch.Tensor, @@ -210,25 +216,8 @@ def forward(self, xq, _ = self.wq(inputs_q) xk, _ = self.wk(inputs_k) xv, _ = self.wv(inputs_v) - q_shape = xq.size()[:-1] + (self.num_heads, self.head_dim) - kv_shape = xk.size()[:-1] + (self.num_kv_heads, self.head_dim) - xq = xq.view(*q_shape) - xk = xk.view(*kv_shape) - xv = xv.view(*kv_shape) - - if self.attn_backend == _Backend.FLASH_ATTN: - from flash_attn import flash_attn_func - output = flash_attn_func(xq, xk, xv, dropout_p=0.0, causal=False) - elif self.attn_backend == _Backend.TORCH_SDPA: - xq, xk, xv = (rearrange(x, "b s h d -> b h s d") - for x in (xq, xk, xv)) - output = F.scaled_dot_product_attention(xq, xk, xv) - output = rearrange(output, "b h s d -> b s h d ") - elif self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - output = xops.memory_efficient_attention_forward(xq, xk, xv, p=0) - - output = rearrange(output, "b s h d -> b s (h d)").contiguous() + + output = self.attn(xq, xk, xv) output, _ = self.wo(output) return output @@ -938,6 +927,8 @@ def image_input_mapper_for_molmo( ctx: InputContext, data: object, ): + if isinstance(data, list): + data = data[0] return MultiModalKwargs(data) @@ -987,7 +978,22 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) - return DummyData(dummy_seqdata, {"image": dummy_imgdata}) + size = 0 + offset = -1 + for i in range(len(token_ids)): + if token_ids[i] in 
(DEFAULT_IMAGE_PATCH_TOKEN_ID, + DEFAULT_IM_START_TOKEN_ID, DEFAULT_IM_END_TOKEN_ID, + DEFAULT_IM_COL_TOKEN_ID): + if offset < 0: + offset = i + size += 1 + dummy_imgdata["image_start_end"] = (offset, offset + size) + return DummyData(seq_data=dummy_seqdata, + multi_modal_data={"image": dummy_imgdata}, + multi_modal_placeholders={ + "image": + [PlaceholderRange(offset=offset, length=size)] + }) def pad_images( @@ -1075,19 +1081,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): if image_masks is not None: image_data["image_masks"] = image_masks - image_data["seq_len"] = torch.tensor(len(out["input_ids"]), + new_prompt_token_ids = out["input_ids"].tolist() + image_data["seq_len"] = torch.tensor(len(new_prompt_token_ids), dtype=torch.long) multi_modal_data = dict(image=image_data) + size = 0 + offset = -1 + for i in range(len(new_prompt_token_ids)): + if new_prompt_token_ids[i] in (DEFAULT_IMAGE_PATCH_TOKEN_ID, + DEFAULT_IM_START_TOKEN_ID, + DEFAULT_IM_END_TOKEN_ID, + DEFAULT_IM_COL_TOKEN_ID): + if offset < 0: + offset = i + size += 1 + image_data["image_start_end"] = (offset, offset + size) prompt = inputs.get("prompt") if prompt is None: - prompt = tokenizer.decode(out["input_ids"]) + prompt = tokenizer.decode(new_prompt_token_ids) return token_inputs( - prompt_token_ids=out["input_ids"], + prompt_token_ids=new_prompt_token_ids, prompt=prompt, multi_modal_data=multi_modal_data, + multi_modal_placeholders={ + "image": [PlaceholderRange(offset=offset, length=size)] + }, ) @@ -1133,6 +1154,7 @@ def _parse_and_validate_image_input( ) -> Optional[MolmoImageInputs]: images = kwargs.pop("images", None) image_masks = kwargs.pop("image_masks", None) + image_start_end = kwargs.pop("image_start_end", None) if images is None: return None @@ -1150,6 +1172,7 @@ def _parse_and_validate_image_input( image_input_idx=image_input_idx, seq_len=seq_len, image_masks=image_masks, + image_start_end=image_start_end, ) def _process_image_input( @@ -1198,9 
+1221,16 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: # Note: In this original implementation from AI2, the final # vision_embeddings will be always be the same length - # of input embedddings, which is not very efficient. - # TODO(ywang96): see if this can be optimized. + # of input embeddings. vision_embeddings = torch.einsum('nd,nm->md', image_features, mat) + + # Split by the sizes of the input sequences. For each full embedding, + # extract the actual vision embeddings to be merged. + vision_embeddings = list(vision_embeddings.split(seq_len.tolist())) + for i in range(len(vision_embeddings)): + start, end = image_input['image_start_end'][i] + vision_embeddings[i] = vision_embeddings[i][start:end] + return vision_embeddings def get_input_embeddings( @@ -1210,7 +1240,11 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - inputs_embeds = inputs_embeds + multimodal_embeddings + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, [ + DEFAULT_IMAGE_PATCH_TOKEN_ID, DEFAULT_IM_START_TOKEN_ID, + DEFAULT_IM_END_TOKEN_ID, DEFAULT_IM_COL_TOKEN_ID + ]) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index c7b4c22b6896b..34cb9981c167b 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -435,9 +435,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index eef23029a2aca..3c7854ce388ab 100644 --- 
a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -12,22 +12,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import itertools -import re -from functools import cached_property, lru_cache -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from functools import cached_property +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from PIL import Image -from transformers import CLIPVisionConfig, PretrainedConfig +from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig, + ProcessorMixin) from vllm.attention import AttentionMetadata -from vllm.config import ModelConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -36,12 +32,18 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors, PlaceholderRange -from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token +from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.processing import (BaseMultiModalProcessor, + InputProcessingContext, + ModalityProcessingMetadata, + MultiModalDataDict, + MultiModalProcessingMetadata, + PromptReplacement) from vllm.sequence import IntermediateTensors 
from vllm.utils import is_list_of -from .clip import dummy_image_for_clip, dummy_seq_data_for_clip +from .clip import dummy_image_for_clip from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -303,231 +305,99 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57 -def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336): - target_height = int(np.ceil(height / padding_unit) * padding_unit) - top_padding = int((target_height - height) / 2) - bottom_padding = target_height - height - top_padding - padded_width = width - padded_height = height + top_padding + bottom_padding - return padded_width, padded_height - - -# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L90 -def _calc_hd_transform_size(*, width: int, height: int, hd_num: int): - transposed = False - if width < height: - width, height = height, width - transposed = True - - ratio = width / height - scale = 1 - while scale * np.ceil(scale / ratio) <= hd_num: - scale += 1 - scale -= 1 - - new_width = int(scale * 336) - new_height = int(new_width / ratio) - - padded_width, padded_height = _calc_padded_size(width=new_width, - height=new_height) - - if transposed: - padded_width, padded_height = padded_height, padded_width - - return padded_width, padded_height - - -# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181 -def get_phi3v_image_feature_size( - hf_config: Dict[str, Any], - *, - input_height: int, - input_width: int, - num_crops: int, -) -> int: - if num_crops is None: - num_crops = hf_config.get("num_crops", 16) - new_width, new_height = _calc_hd_transform_size(width=input_width, - height=input_height, - 
hd_num=num_crops) - - return (new_height // 336 * new_width // 336 + 1) * 144 + 1 \ - + (new_height // 336 + 1) * 12 - - def get_max_phi3v_image_tokens(ctx: InputContext, *, num_crops: Optional[int] = None): + mm_processor_kwargs = {} + if num_crops is not None: + mm_processor_kwargs["num_crops"] = num_crops - return get_phi3v_image_feature_size( - ctx.get_hf_image_processor_config(), - input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - num_crops=num_crops, + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs, + ) + + num_tokens = image_processor.calc_num_image_tokens_from_image_size( + width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, ) + return num_tokens -def dummy_data_for_phi3v(ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], - *, - num_crops: Optional[int] = None): +def dummy_mm_kwargs_for_phi3v(ctx: InputProcessingContext, + mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops) - - seq_data, ranges = dummy_seq_data_for_clip( - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - seq_len, - num_images, - image_token_id=_IMAGE_TOKEN_ID, - image_feature_size_override=image_feature_size, - ) - mm_data = dummy_image_for_clip( + data = dummy_image_for_clip( CLIP_VIT_LARGE_PATCH14_336_CONFIG, num_images, image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, ) - return DummyData(seq_data, mm_data, ranges) - + hf_processor = ctx.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + hf_inputs = image_processor.preprocess(data['image'], return_tensors="pt") -@lru_cache -def _get_image_placeholder_token_id_candidates( - model_config: ModelConfig, - idx: int, -) -> List[List[int]]: - assert idx > 0 + return 
MultiModalKwargs(**hf_inputs) - tokenizer = cached_get_tokenizer(model_config.tokenizer) - # This is used when the image token is at the start of the string - start_candidate = tokenizer.encode(f"<|image_{idx}|>", - add_special_tokens=False) +def create_metadata_for_phi3v( + ctx: InputProcessingContext) -> MultiModalProcessingMetadata: + return { + "image": + ModalityProcessingMetadata(prompt_repls=[ + PromptReplacement(target=[_IMAGE_TOKEN_ID], + repl_unit=[_IMAGE_TOKEN_ID], + repl_count=get_max_phi3v_image_tokens(ctx)), + ]), + } - # This is used when the image token is in the middle of the string - # We need to get the token for "<", not "▁<" - # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json - a_token_id, = tokenizer.encode("a", add_special_tokens=False) - a_token_id_, *middle_candidate = tokenizer.encode(f"a<|image_{idx}|>", - add_special_tokens=False) - assert a_token_id == a_token_id_ - return [start_candidate, middle_candidate] +class Phi3VProcessor(BaseMultiModalProcessor): + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__( + ctx=ctx, + metadata=create_metadata_for_phi3v(ctx), + ) -def input_processor_for_phi3v(ctx: InputContext, - inputs: DecoderOnlyInputs, - *, - num_crops: Optional[int] = None): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_image_processor_config() - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - w, h = image_data.size - image_feature_size = [ - get_phi3v_image_feature_size(hf_config, - input_width=w, - input_height=h, - num_crops=num_crops) - ] - image_data = [image_data] - elif is_list_of(image_data, Image.Image): - image_feature_size = [] - for image in image_data: - w, h = image.size - image_feature_size.append( - get_phi3v_image_feature_size(hf_config, - input_width=w, - 
input_height=h, - num_crops=num_crops)) - elif isinstance(image_data, torch.Tensor): - image_feature_size = [image_data.shape[0]] - image_data = [image_data] - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[0] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - prompt = inputs.get("prompt") - if prompt is None: - # for async server request, we assume prompt and its token_ids is always - # in correct format. And num_image_tags == len(image_data) always True. - image_idx = range(1, len(image_data) + 1) - new_prompt = None - else: - image_idx = sorted(map(int, re.findall(r"<\|image_(\d+)\|>+", prompt))) - if prompt.count("<|image|>") > 0: - logger.warning("Please follow the prompt format that is " - "documented on HuggingFace which does not involve " - "repeating <|image|> tokens.") - elif (num_image_tags := len(image_idx)) > 1: - assert num_image_tags == len( - image_data), "The count of image_placeholder not match image's" - new_prompt = prompt - - prompt_token_ids = inputs["prompt_token_ids"].copy() - - # masked placeholder with image token id - for idx in image_idx: - candidates = _get_image_placeholder_token_id_candidates(model_config, - idx=idx) - - for candidate in candidates: - for i in range(len(prompt_token_ids) - len(candidate) + 1): - if prompt_token_ids[i:i + len(candidate)] == candidate: - prompt_token_ids[i:i + - len(candidate)] = ([_IMAGE_TOKEN_ID] * - len(candidate)) - break - - # merge consecutive tag ids - merged_token_ids: List[int] = [] - for is_placeholder, token_ids in itertools.groupby( - prompt_token_ids, lambda x: x == _IMAGE_TOKEN_ID): - if is_placeholder: - merged_token_ids.append(_IMAGE_TOKEN_ID) - else: - merged_token_ids.extend(list(token_ids)) - - # TODO: Move this to utils or integrate with clip. 
- new_token_ids: List[int] = [] - placeholder_ranges: List[PlaceholderRange] = [] - placeholder_idx = 0 - while merged_token_ids: - token_id = merged_token_ids.pop(0) - if token_id == _IMAGE_TOKEN_ID: - replacement_ids = repeat_and_pad_token( - _IMAGE_TOKEN_ID, - repeat_count=image_feature_size[placeholder_idx], - ) - placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) - }) - new_token_ids.extend(replacement_ids) - placeholder_idx += 1 - else: - new_token_ids.append(token_id) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": placeholder_ranges}) + def _get_hf_processor( + self, + *, + num_crops: Optional[int] = None, + ) -> ProcessorMixin: + if num_crops is not None: + return self.ctx.get_hf_processor(num_crops=num_crops) + return self.ctx.get_hf_processor() + + def _apply_hf_processor( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._apply_hf_processor( + prompt, mm_data, mm_processor_kwargs) + # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids, + # which will cause OverflowError when decoding the prompt_ids. 
+ # Therefore, we need to do an early replacement here + token_ids = processed_outputs['input_ids'] + token_ids[token_ids < 0] = _IMAGE_TOKEN_ID + processed_outputs['input_ids'] = token_ids + return processed_outputs + + def _get_dummy_mm_kwargs( + self, + mm_counts: Mapping[str, int], + ) -> MultiModalKwargs: + return dummy_mm_kwargs_for_phi3v(self.ctx, mm_counts) -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v) -@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) +@MULTIMODAL_REGISTRY.register_processor(Phi3VProcessor) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 215727cadd954..c6786c363ab4a 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -48,6 +48,9 @@ except ImportError: USE_XFORMERS_OPS = False +PIXTRAL_IMAGE_BREAK_ID = 12 +PIXTRAL_IMAGE_END_ID = 13 + def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer = cached_get_tokenizer( @@ -68,7 +71,6 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, tokenizer_mode=ctx.model_config.tokenizer_mode) mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder - patch_size = mm_encoder.mm_config.image_patch_size image_token_id = mm_encoder.special_ids.img mm_config = ctx.model_config.multimodal_config @@ -78,8 +80,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, size = 256 image = Image.new("RGB", (size, size), color=0) - image_feature_size = (size**2) // (patch_size**2) - + encoding = tokenizer.instruct.mm_encoder(ImageChunk(image=image)) + image_feature_size = len(encoding.tokens) num_image_tokens = image_feature_size * num_images seq_data = SequenceData.from_prompt_token_counts( (image_token_id, 
num_image_tokens), @@ -101,14 +103,13 @@ def input_mapper_for_pixtral(ctx: InputContext, Args: ctx: Context of the loaded model. - data: data potentially containing image/image embeddings to be mapped - to pixel_values in .forward() for a visual QWenLMHeadModel model. + data: data potentially containing PIL images to be processed + and mapped to `images`. Returns: MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ - # Early exit if we have provided an image to a language only Qwen model model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode) @@ -116,35 +117,67 @@ def input_mapper_for_pixtral(ctx: InputContext, data_list = data if isinstance(data, list) else [data] images = [] + image_tokens_list = [] for image_data in data_list: image = ImageChunk(image=image_data) encoding = tokenizer.instruct.mm_encoder(image) image = torch.from_numpy(encoding.image).to(device="cuda", dtype=torch.float16) images.append(image) + image_tokens_list.append(encoding.tokens) - return MultiModalKwargs({"images": images}) + image_tokens = torch.tensor([ + token_id for image_tokens in image_tokens_list + for token_id in image_tokens + ]) + return MultiModalKwargs({"images": images, "image_tokens": image_tokens}) def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is not None and "image" in multi_modal_data: - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - tokenizer_mode=ctx.model_config.tokenizer_mode) - - mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder - image_token_id = mm_encoder.special_ids.img + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs - if image_token_id not in inputs['prompt_token_ids']: - raise ValueError( - f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via 
mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.") + prompt_token_ids = inputs.get("prompt_token_ids") + prompt = inputs.get("prompt") + tokenizer = cached_get_tokenizer( + ctx.model_config.tokenizer, + tokenizer_mode=ctx.model_config.tokenizer_mode) - return inputs + mm_encoder = tokenizer.mistral.instruct_tokenizer.mm_encoder + image_token_id = mm_encoder.special_ids.img + image_break_id = mm_encoder.special_ids.img_break + image_end_id = mm_encoder.special_ids.img_end + + if image_token_id not in inputs['prompt_token_ids']: + raise ValueError( + f"You've passed {inputs=} without {image_token_id=}" + " Make sure to process your input via mistral_common's" + " tokenizer or pass a chat completion request. For more" + " For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") + + # Get precise tracking of placeholder positions + placeholder_ranges = [] + curr_offset = -1 + curr_length = 0 + for i in range(len(prompt_token_ids)): + if prompt_token_ids[i] in (image_token_id, image_break_id): + if curr_offset < 0: + curr_offset = i + curr_length += 1 + elif prompt_token_ids[i] == image_end_id: + curr_length += 1 + placeholder_ranges.append( + PlaceholderRange(offset=curr_offset, length=curr_length)) + curr_offset = -1 + curr_length = 0 + else: + pass + return token_inputs(prompt=prompt, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_pixtral) @@ -192,11 +225,29 @@ def sampler(self): return get_sampler() def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: - image_input = self._parse_and_validate_image_input(**kwargs) + image_input, image_tokens = self._parse_and_validate_image_input( + **kwargs) if image_input is None: return None + vision_embeddings = 
self._process_image_input(image_input) - return vision_embeddings + + # NOTE: We patch the outputs of the vision encoder with embeddings + # from `[IMG_BREAK]` and `[IMG_END]` tokens. + image_embeds = self.language_model.get_input_embeddings(image_tokens) + image_token_mask = image_tokens == self.vision_args.image_token_id + image_embeds[image_token_mask] = vision_embeddings + + # NOTE: Image embeddings are split into separate tensors for each image + # by the indices of `[IMG_END]` token. + split_indices = torch.where( + image_tokens == PIXTRAL_IMAGE_END_ID)[0] + 1 + if len(split_indices) <= 1: + # Do not split, return as tensor of shape [1, fs, hs] + return image_embeds.unsqueeze(0) + + image_embeds = image_embeds.tensor_split(split_indices.cpu()) + return image_embeds def get_input_embeddings( self, @@ -206,8 +257,10 @@ def get_input_embeddings( inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, multimodal_embeddings, - self.vision_args.image_token_id) + input_ids, inputs_embeds, multimodal_embeddings, [ + self.vision_args.image_token_id, PIXTRAL_IMAGE_END_ID, + PIXTRAL_IMAGE_BREAK_ID + ]) return inputs_embeds def forward( @@ -245,10 +298,11 @@ def forward( def _parse_and_validate_image_input( self, images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], - torch.Tensor]] = None + torch.Tensor]] = None, + image_tokens: Optional[torch.Tensor] = None, ) -> Optional[List[torch.Tensor]]: if images is None: - return None + return None, None if isinstance(images, torch.Tensor): # if passed as batch take all images @@ -267,7 +321,16 @@ def _parse_and_validate_image_input( images = flatten_images - return images + if isinstance(image_tokens, torch.Tensor): + # image_tokens are batched + image_tokens = image_tokens.flatten() + elif isinstance(image_tokens, list): + # image_tokens are of different lengths thus passed as a list + 
image_tokens = torch.cat(image_tokens) + + assert image_tokens.dim() == 1 + + return images, image_tokens def _process_image_input(self, image_input: List[torch.Tensor]) -> torch.Tensor: diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 7d4cc4b69e614..3ce4eb5869f21 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -444,14 +444,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - if config.tie_word_embeddings: - self.lm_head = self.model.embed_tokens + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "lm_head")) else: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "lm_head")) + self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index a0605fee82aca..48a2d470414b9 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -19,7 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cached_property, lru_cache from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -34,12 +34,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors @@ -47,15 +42,11 @@ from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP -from .utils import merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) -_KEYS_TO_MODIFY_MAPPING = { - "language_model.lm_head": "lm_head", - "language_model.model": "language_model", -} - # # === Audio Inputs === # class Qwen2AudioInputs(TypedDict): @@ -281,25 +272,23 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config - self.language_model = Qwen2Model( - vllm_config=vllm_config.with_hf_config(config.text_config), - prefix=prefix) - self.unpadded_vocab_size = config.text_config.vocab_size - if config.text_config.tie_word_embeddings: - self.lm_head = self.language_model.embed_tokens - else: - self.lm_head = ParallelLMHead(config.text_config.vocab_size, - config.text_config.hidden_size, - 
quant_config=quant_config) - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.text_config.vocab_size, - logit_scale) - self.sampler = get_sampler() + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + hf_config=config.text_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler + + return get_sampler() + def _validate_and_reshape_mm_tensor(self, mm_input: Union[torch.Tensor, List[torch.Tensor]], @@ -414,72 +403,30 @@ def forward( multimodal_embeddings) input_ids = None - hidden_states = self.language_model(input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors, - inputs_embeds=inputs_embeds) + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + return self.language_model.sample(logits, sampling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", 
"q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if (self.config.text_config.tie_word_embeddings - and "lm_head.weight" in name): - continue - for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): - if key_to_modify in name: - name = name.replace(key_to_modify, new_key) - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name or 'audio' in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 27175dbae7483..cfc90cdab01e4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" -from functools import partial +from functools import cached_property, partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, Type, TypedDict, Union) @@ -40,7 +40,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.distributed import get_pp_group, parallel_state +from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) @@ -49,15 +49,12 @@ from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization.gptq import GPTQConfig from vllm.model_executor.layers.quantization.gptq_marlin import ( GPTQMarlinConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler -from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, @@ -69,9 +66,8 @@ from vllm.transformers_utils.processor import cached_get_processor from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (PPMissingLayer, get_vit_attn_backend, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, maybe_prefix) +from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, + init_vllm_registered_model, maybe_prefix) logger = 
init_logger(__name__) @@ -506,6 +502,8 @@ def __init__( mlp_ratio: float = vision_config.mlp_ratio self.spatial_merge_size = spatial_merge_size + self.num_heads = num_heads + self.embed_dim = embed_dim self.patch_embed = Qwen2VisionPatchEmbed( patch_size=patch_size, @@ -595,6 +593,53 @@ def forward( x = self.merger(x) return x + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith("qkv.weight"): + visual_num_heads = self.num_heads + visual_embed_dim = self.embed_dim + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size, + visual_embed_dim) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) + elif name.endswith("qkv.bias"): + visual_num_heads = self.num_heads + visual_embed_dim = self.embed_dim + head_size = visual_embed_dim // visual_num_heads + loaded_weight = loaded_weight.view(3, visual_num_heads, + head_size) + loaded_weight = loaded_weight.transpose(0, 1) + loaded_weight = loaded_weight.reshape(-1) + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + # === Vision input helpers === # @@ -1082,27 +1127,21 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = 
""): prefix=maybe_prefix(prefix, "visual"), ) - self.model = Qwen2Model(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) + self.language_model = init_vllm_registered_model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), + architectures=["Qwen2ForCausalLM"], + ) - if get_pp_group().is_last_rank: - if config.tie_word_embeddings: - self.lm_head = self.model.embed_tokens - else: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config, - prefix=maybe_prefix( - prefix, "lm_head")) - else: - self.lm_head = PPMissingLayer() + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) - self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = get_sampler() + @cached_property + def sampler(self): + if hasattr(self.language_model, "sampler"): + return self.language_model.sampler - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size)) + return get_sampler() def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): # GPTQ configs do not have a list of ignored modules, however AutoGPTQ @@ -1261,7 +1300,7 @@ def get_input_embeddings( multimodal_embeddings: Optional[List[Tuple[NestedTensors, str]]] = None, ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings(input_ids) + inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: for embeddings, modality in multimodal_embeddings: if modality == "image": @@ -1330,7 +1369,7 @@ def forward( multimodal_embeddings) input_ids = None - hidden_states = self.model( + hidden_states = self.language_model.model( input_ids=input_ids, positions=positions, kv_caches=kv_caches, @@ -1340,80 +1379,28 @@ def forward( ) return hidden_states - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: 
- logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + return self.language_model.sample(logits, sampling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "up_proj", 1), - ("gate_up_proj", "gate_proj", 0), - ] - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - if self.config.tie_word_embeddings and "lm_head.weight" in name: - continue - for (param_name, weight_name, shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - if "visual" in name and name.endswith("qkv.weight"): - visual_num_heads = self.config.vision_config.num_heads - visual_embed_dim = self.config.vision_config.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size, - visual_embed_dim) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif "visual" in name and name.endswith("qkv.bias"): - visual_num_heads = self.config.vision_config.num_heads - visual_embed_dim = self.config.vision_config.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1) - try: - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - except KeyError: - raise ValueError(f"Unexpected weight: {name}") from None - - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "lm_head.": "language_model.lm_head.", + "model.": "language_model.model.", + }) + + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index d8bd6d3603e84..36b39b6b773e2 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -153,6 +153,7 @@ "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 + "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"), diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index deaed0ba7e4ce..6fb9e2cc4584f 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -6,12 +6,11 @@ import numpy as np import torch -import torch.nn.functional as F from PIL import Image from torch import nn from transformers import SiglipVisionConfig -from vllm.attention.selector import _Backend +from vllm.attention.layer import MultiHeadAttention from vllm.config import ModelConfig from vllm.distributed import divide, 
get_tensor_model_parallel_world_size from vllm.inputs import DecoderOnlyInputs, token_inputs @@ -29,8 +28,6 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData -from .utils import get_vit_attn_backend - def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: # Since interpolation is applied, the image size need not be divisible @@ -291,52 +288,18 @@ def __init__( self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) - self.attn_backend = get_vit_attn_backend(support_fa=False) - if self.attn_backend not in {_Backend.TORCH_SDPA, _Backend.XFORMERS}: - raise RuntimeError( - f"SIGLIP does not support {self.attn_backend} backend now.") + self.attn = MultiHeadAttention(self.num_heads_per_partition, + self.head_dim, self.scale) def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: """Input shape: Batch x Time x Channel""" - batch_size, q_len, _ = hidden_states.size() - qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - key_states = key_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - value_states = value_states.view(batch_size, q_len, - self.num_heads_per_partition, - self.head_dim) - - if self.attn_backend == _Backend.XFORMERS: - from xformers import ops as xops - - out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - elif self.attn_backend == _Backend.TORCH_SDPA: - query_states, key_states, value_states = (x.transpose(1, 2) - for x in (query_states, - key_states, - value_states)) - out = F.scaled_dot_product_attention(query_states, - key_states, - value_states, - dropout_p=self.dropout, - scale=self.scale) - out = out.transpose(1, 2) - - out = out.view(batch_size, q_len, -1) 
+ out = self.attn(query_states, key_states, value_states) attn_output, _ = self.out_proj(out) return attn_output, None diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index f58710d215056..caae0b65d7d10 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -443,10 +443,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() + self.sampler = get_sampler() + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 7a1e1f9bf2be4..269b66806adf4 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -17,7 +17,7 @@ from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import _Backend, current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_pin_memory_available +from vllm.utils import is_pin_memory_available, print_warning_once logger = init_logger(__name__) @@ -251,12 +251,15 @@ def init_vllm_registered_model( """ from vllm.model_executor.model_loader.loader import _initialize_model + if hf_config is None and architectures is not None: + # So that the architectures field is overridden + hf_config = vllm_config.model_config.hf_config + if hf_config is not None: - vllm_config = vllm_config.with_hf_config(hf_config) + vllm_config = vllm_config.with_hf_config(hf_config, + architectures=architectures) - return _initialize_model(vllm_config=vllm_config, - prefix=prefix, - architectures=architectures) + return _initialize_model(vllm_config=vllm_config, prefix=prefix) @overload @@ -406,16 +409,42 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, 
multimodal_embeddings: NestedTensors, - placeholder_token_id: int, + placeholder_token_id: Union[int, List[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the positions in ``inputs_embeds`` corresponding to placeholder tokens in ``input_ids``. + + ``placeholder_token_id`` can be a list of token ids (e.g, token ids + of img_start, img_break, and img_end tokens) when needed: This means + the order of these tokens in the ``input_ids`` MUST MATCH the order of + their embeddings in ``multimodal_embeddings`` since we need to + slice-merge instead of individually scattering. + + For example, if input_ids is "TTTTTSIIIBIIIBIIIETTT", where + - T is text token + - S is image start token + - I is image embedding token + - B is image break token + - E is image end token. + + Then the image embeddings (that correspond to I's) from vision encoder + must be padded with embeddings of S, B, and E in the same order of + input_ids for a correct embedding merge. Note: This updates ``inputs_embeds`` in place. """ + if isinstance(placeholder_token_id, list): + placeholder_token_id = torch.tensor(placeholder_token_id, + device=input_ids.device) + return _merge_multimodal_embeddings( + inputs_embeds, + torch.isin(input_ids, placeholder_token_id), + multimodal_embeddings, + ) + return _merge_multimodal_embeddings( inputs_embeds, (input_ids == placeholder_token_id), @@ -592,7 +621,7 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if is_flash_attn_2_available(): selected_backend = _Backend.FLASH_ATTN else: - logger.warning( + print_warning_once( "Current `vllm-flash-attn` has a bug inside vision module, " "so we use xformers backend instead. 
You can run " "`pip install flash-attn` to use flash-attention backend.") diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 03a5f3a91f7a1..928c31a2f2843 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -27,18 +27,3 @@ "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] - - -def __getattr__(name: str): - import warnings - - if name == "MultiModalInputs": - msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " - "The original name will take another meaning in an upcoming " - "version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return MultiModalKwargs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index bbb8fb4bc1cd1..7dba94b885b6d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -226,16 +226,16 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture + from vllm.model_executor.models import supports_multimodal model_cls, _ = get_model_architecture(model_config) - if model_cls not in self._input_mappers: + if not supports_multimodal(model_cls): return 0 max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: - raise KeyError(f"No maximum number of multi-modal tokens is given " - f"for model class {model_cls.__name__} in {self}.") + return 0 if callable(max_mm_tokens): mm_processor_kwargs = get_allowed_kwarg_only_overrides( @@ -326,26 +326,47 @@ def from_seq_group( src_ranges = [] dest_ranges = [] """ - if (not seq_group.multi_modal_data - or not seq_group.multi_modal_placeholders): - return seq_group.multi_modal_data, {} + seq_mm_data = seq_group.multi_modal_data + seq_mm_placeholders = seq_group.multi_modal_placeholders + + if not seq_mm_data or not seq_mm_placeholders: + return seq_mm_data, {} + + # For merged processor, we directly use 
mm_kwargs as mm_data + if isinstance(seq_mm_data, MultiModalKwargs): + placeholder_maps = dict[str, MultiModalPlaceholderMap]() + + for modality, placeholders in seq_mm_placeholders.items(): + placeholder_map = MultiModalPlaceholderMap() + + if positions: + placeholder_map.append_items_from_seq_group( + positions, + # Dummy, since we don't care about intersecting items + [None] * len(placeholders), + placeholders, + ) + + placeholder_maps[modality] = placeholder_map + + return seq_mm_data, placeholder_maps - mm_data = {**seq_group.multi_modal_data} - placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( + mm_data = {**seq_mm_data} + placeholder_maps = defaultdict[str, MultiModalPlaceholderMap]( MultiModalPlaceholderMap) - for ( - modality, - placeholders, - ) in seq_group.multi_modal_placeholders.items(): + for modality, placeholders in seq_mm_placeholders.items(): mm_items = mm_data.pop(modality) if not isinstance(mm_items, list): mm_items = [mm_items] if positions: - intersecting_items = placeholder_maps[ - modality].append_items_from_seq_group( - positions, mm_items, placeholders) + intersecting_items = placeholder_maps[modality] \ + .append_items_from_seq_group( + positions, + mm_items, + placeholders, + ) if intersecting_items: mm_data[modality] = intersecting_items @@ -433,18 +454,3 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) - - -def __getattr__(name: str): - import warnings - - if name == "MultiModalInputs": - msg = ("MultiModalInputs has been renamed to MultiModalKwargs. 
" - "The original name will take another meaning in an upcoming " - "version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return MultiModalKwargs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 640c7c04b8817..229a8fbdf5831 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -96,7 +96,8 @@ class PlaceholderRange(TypedDict): """The length of the placeholder.""" -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor, + Tuple[torch.Tensor, ...]] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 28c8dda581982..922c83b6fd8a9 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -3,16 +3,17 @@ from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass from functools import lru_cache -from itertools import groupby -from typing import Any, Generic, NamedTuple, Optional, Protocol, TypeVar, Union +from typing import (Any, Dict, Generic, NamedTuple, Optional, Protocol, + TypeVar, Union, cast) -import numpy as np -from transformers import BatchFeature +import torch +from transformers import BatchFeature, ProcessorMixin from typing_extensions import TypeAlias, TypedDict -from vllm.inputs import InputProcessingContext +from vllm.inputs import DummyData, InputProcessingContext from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import flatten_2d_lists, full_groupby, is_list_of +from vllm.utils import (flatten_2d_lists, full_groupby, is_list_of, + resolve_mm_processor_kwargs) from .inputs import (AudioItem, ImageItem, MultiModalDataDict, MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, @@ -256,63 
+257,6 @@ def to_multi_format(data: MultiModalDataDict) -> dict[str, list[Any]]: return multi_data -class _TokenRun(NamedTuple): - token_id: int - - start_idx: int - length: int - - -def iter_token_runs(token_ids: list[int]) -> Iterable[_TokenRun]: - """ - Yield the starting index and length of each run of tokens that are the same. - """ - start_idx = 0 - - for token_id, it in groupby(token_ids): - length = sum(1 for _ in it) - yield _TokenRun(token_id=token_id, start_idx=start_idx, length=length) - - start_idx += length - - -class _PlaceholderInfo(NamedTuple): - modality: str - offset: int - length: int - - def to_range(self) -> PlaceholderRange: - return PlaceholderRange(offset=self.offset, length=self.length) - - -def iter_placeholders( - prompt_repls: Sequence[_BoundPromptReplacement[Any]], - token_ids: list[int], - *, - min_placeholder_count: int, -) -> Iterable[_PlaceholderInfo]: - """Yield each set of placeholder tokens found in :code:`token_ids`.""" - placeholder_ids_by_modality = { - modality: { - token_id - for prompt_repl in repls - for token_id in prompt_repl.repl_unit.token_ids - } - for modality, repls in full_groupby_modality(prompt_repls) - } - - for run_info in iter_token_runs(token_ids): - if run_info.length > min_placeholder_count: - for (modality, - placeholder_ids) in placeholder_ids_by_modality.items(): - if run_info.token_id in placeholder_ids: - yield _PlaceholderInfo( - modality=modality, - offset=run_info.start_idx, - length=run_info.length, - ) - - class _TokenMatch(NamedTuple): start_idx: int end_idx: int @@ -353,13 +297,9 @@ def start_idx(self) -> int: def end_idx(self) -> int: raise NotImplementedError + @property @abstractmethod - def get_repl( - self, - mm_items: list[_T], - hf_inputs: BatchFeature, - item_idx: int, - ) -> _S: + def repl_unit(self) -> _S: raise NotImplementedError def __repr__(self) -> str: @@ -380,15 +320,9 @@ def start_idx(self) -> int: def end_idx(self) -> int: return self.match.end_idx - def get_repl( - self, - 
mm_items: list[_T], - hf_inputs: BatchFeature, - item_idx: int, - ) -> list[int]: - prompt_repl = self.prompt_repl - count = prompt_repl.get_count(mm_items, hf_inputs, item_idx) - return prompt_repl.repl_unit.token_ids * count + @property + def repl_unit(self) -> list[int]: + return self.prompt_repl.repl_unit.token_ids @dataclass(repr=False) @@ -404,15 +338,26 @@ def start_idx(self) -> int: def end_idx(self) -> int: return self.match.end() - def get_repl( - self, - mm_items: list[_T], - hf_inputs: BatchFeature, - item_idx: int, - ) -> str: - prompt_repl = self.prompt_repl - count = prompt_repl.get_count(mm_items, hf_inputs, item_idx) - return prompt_repl.repl_unit.text * count + @property + def repl_unit(self) -> str: + return self.prompt_repl.repl_unit.text + + +class _PlaceholderInfo(NamedTuple): + modality: str + start_idx: int + unit: list[int] + unit_count: int + + @property + def length(self) -> int: + return len(self.unit) * self.unit_count + + def to_range(self) -> PlaceholderRange: + return PlaceholderRange( + offset=self.start_idx, + length=self.length, + ) def find_token_matches( @@ -447,15 +392,17 @@ def _resolve_matches( Resolve :code:`matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. 
""" - num_matches_by_idx = np.zeros(len(prompt), dtype=int) + seen_matches: list[Optional[_PromptReplacementMatch[_T, _S]]] \ + = [None] * len(prompt) + for match in matches: - num_matches_by_idx[match.start_idx:match.end_idx] += 1 + for idx in range(match.start_idx, match.end_idx): + if seen_matches[idx] is not None: + raise ValueError("Found overlapping matches " + f"({seen_matches[idx]} and {match}) " + f"at index={idx} of prompt={prompt}") - duplicate_matches_idxs, = np.nonzero(num_matches_by_idx > 1) - if len(duplicate_matches_idxs) > 0: - raise ValueError("Unable to find a unique replacement " - f"at indices={duplicate_matches_idxs} " - f"of prompt={prompt}") + seen_matches[idx] = match return sorted(matches, key=lambda x: x.start_idx) @@ -480,9 +427,12 @@ def _replace_matches( start_idx = match.start_idx end_idx = match.end_idx - repl_ids = match.get_repl(mm_items, hf_inputs, item_idx) + repl_unit = match.repl_unit + repl_info = match.prompt_repl + repl_count = repl_info.get_count(mm_items, hf_inputs, item_idx) - out_seqs.append(prompt[prev_end_idx:start_idx] + repl_ids) + out_seqs.append(prompt[prev_end_idx:start_idx] + + repl_unit * repl_count) prev_end_idx = end_idx next_idx_by_modality[modality] += 1 @@ -531,9 +481,59 @@ def replace_text_matches( return "".join(texts) -class MultiModalProcessor: +def _merge_placeholder_matches( + matches: Iterable[_PromptReplacementTokenMatch], +) -> Iterable[_PromptReplacementTokenMatch]: + current_match = None + + for match in sorted(matches, key=lambda x: x.start_idx): + if current_match is None: + current_match = match + elif (current_match.prompt_repl == match.prompt_repl + and current_match.end_idx == match.start_idx): + current_match = _PromptReplacementTokenMatch( + current_match.prompt_repl, + match=_TokenMatch(current_match.start_idx, match.end_idx), + ) + else: + yield current_match + current_match = match + + if current_match is not None: + yield current_match + + +def iter_placeholders( + prompt_repls: 
Sequence[_BoundPromptReplacement[Any]], + prompt: list[int], + *, + min_unit_count: int = 1, +) -> Iterable[_PlaceholderInfo]: + """Yield each set of placeholder tokens found in :code:`token_ids`.""" + if min_unit_count <= 0: + raise ValueError("`min_unit_count` must be a positive integer") + + matches = (_PromptReplacementTokenMatch(prompt_repl, match) + for prompt_repl in prompt_repls + if len(repl_unit := prompt_repl.repl_unit.token_ids) > 0 + for match in iter_token_matches(prompt, repl_unit)) + + for match in _merge_placeholder_matches(matches): + unit = match.repl_unit + placeholder = _PlaceholderInfo( + modality=match.modality, + start_idx=match.start_idx, + unit=unit, + unit_count=(match.end_idx - match.start_idx) // len(unit), + ) + + if placeholder.unit_count >= min_unit_count: + yield placeholder + + +class BaseMultiModalProcessor(ABC): """ - Helper class to process multi-modal inputs to be used in vLLM. + Abstract base class to process multi-modal inputs to be used in vLLM. """ def __init__( @@ -545,6 +545,18 @@ def __init__( self.ctx = ctx self.metadata = metadata + self.init_mm_processor_kwargs = (ctx.model_config.mm_processor_kwargs + or {}) + + def _get_hf_processor( + self, + **mm_processor_kwargs: Mapping[str, object], + ) -> ProcessorMixin: + # by default, we won't pass any kwargs to the processor initialization + return self.ctx.get_hf_processor() + + def _get_tokenizer(self) -> AnyTokenizer: + return self.ctx.tokenizer def __call__( self, @@ -562,13 +574,13 @@ def _find_placeholders( # To avoid false positives from multi-input when detecting # whether placeholder tokens have been inserted, in case # the target sequence is a subset of the replacement tokens - min_placeholder_count: int = 16, + min_unit_count: int = 16, ) -> list[_PlaceholderInfo]: return list( iter_placeholders( all_prompt_repls, new_token_ids, - min_placeholder_count=min_placeholder_count, + min_unit_count=min_unit_count, )) def _apply_hf_processor( @@ -577,19 +589,62 @@ def 
_apply_hf_processor( mm_data: MultiModalDataDict, mm_processor_kwargs: Mapping[str, object], ) -> BatchFeature: - hf_processor = self.ctx.get_hf_processor() - - return hf_processor( - text=prompt, # type: ignore - **mm_data, + # some mm_processor_kwargs may be used in processor initialization + # instead of processor call + processor_init_kwargs = { + **self.init_mm_processor_kwargs, **mm_processor_kwargs, + } + hf_processor = self._get_hf_processor(**processor_init_kwargs) + + processor_data = dict[str, Any]() + passthrough_data = dict[str, Any]() + for k, v in mm_data.items(): + # TODO: Make a separate modality for embedding inputs + # to avoid confusion + if k in ("image", "video", "audio"): + if isinstance(v, torch.Tensor) and v.ndim == 3: + # Pass through embedding inputs (single) + passthrough_data[f"{k}_embeds"] = [v] + elif is_list_of(v, torch.Tensor) and v[0].ndim == 2: + # Pass through embedding inputs (multi) + passthrough_data[f"{k}_embeds"] = v + else: + # Map keys to plural form, e.g.: image -> images + processor_data[f"{k}s"] = v + else: + processor_data[k] = v + + # filter mm_processor_kwargs used in processor call + mm_processor_kwargs = resolve_mm_processor_kwargs( + self.init_mm_processor_kwargs, + cast(Dict[str, Any], mm_processor_kwargs), + hf_processor, ) + try: + hf_inputs = hf_processor( + text=prompt, # type: ignore + **processor_data, + **mm_processor_kwargs, + return_tensors="pt", + ) + except Exception as exc: + data = dict(text=prompt, **processor_data) + + raise RuntimeError( + f"Failed to apply {type(hf_processor).__name__} " + f"on data={data} with kwargs={mm_processor_kwargs}") from exc + + hf_inputs.update(passthrough_data) + + return hf_inputs + def _bind_prompt_replacements( self, mm_data: MultiModalDataDict, ) -> list[_BoundPromptReplacement[Any]]: - tokenizer = self.ctx.tokenizer + tokenizer = self._get_tokenizer() return [ prompt_repl.bind(modality, tokenizer) @@ -604,7 +659,7 @@ def _apply_prompt_replacements( token_ids: 
list[int], prompt_repls: Sequence[_BoundPromptReplacement[Any]], ) -> tuple[list[int], str, list[_PlaceholderInfo]]: - tokenizer = self.ctx.tokenizer + tokenizer = self._get_tokenizer() mm_items = to_multi_format(mm_data) token_matches = find_token_matches(token_ids, prompt_repls) @@ -620,7 +675,7 @@ def _apply_prompt_replacements( # of the search text in the prompt, we instead perform string # replacement on the decoded token IDs, then encode them back. if all( - len(matches) >= len(mm_data[modality]) + len(matches) >= len(mm_items[modality]) for modality, matches in full_groupby_modality(token_matches) ): # yapf: disable token_ids = replace_token_matches( @@ -648,15 +703,6 @@ def _apply_prompt_replacements( placeholders = self._find_placeholders(matched_repls, token_ids) - # Sanity check - assert len(placeholders) == len(matched_repls), dict( - # Log this information for easier debugging - text=text, - token_ids=token_ids, - placeholders=placeholders, - matched_repls=matched_repls, - ) - return token_ids, text, placeholders def apply( @@ -678,7 +724,7 @@ def apply( 3. Extract information about the placeholder tokens from the processed token IDs. """ - tokenizer = self.ctx.tokenizer + tokenizer = self._get_tokenizer() hf_inputs = self._apply_hf_processor(prompt_text, mm_data, mm_processor_kwargs) @@ -717,3 +763,59 @@ def apply( mm_kwargs=mm_kwargs, mm_placeholders=mm_placeholders, ) + + @abstractmethod + def _get_dummy_mm_kwargs( + self, + mm_counts: Mapping[str, int], + ) -> MultiModalKwargs: + """ + Build the input that corresponds to `mm_max_tokens` in + :meth:`get_dummy_data`. 
+ """ + raise NotImplementedError + + def get_dummy_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_max_tokens: Mapping[str, int], + ) -> DummyData: + # Avoid circular import + from vllm.sequence import SequenceData + + tokenizer = self._get_tokenizer() + + mm_placeholders = dict[str, _PlaceholderInfo]() + offset = 0 + + for modality, max_tokens in mm_max_tokens.items(): + if max_tokens == 0: + continue + + metadata = self.metadata[modality] + repl = metadata.prompt_repls[0].bind(modality, tokenizer) + repl_token_ids = repl.repl_unit.token_ids + + placeholders = _PlaceholderInfo( + modality=modality, + start_idx=offset, + unit=repl_token_ids, + unit_count=max_tokens // len(repl_token_ids), + ) + + mm_placeholders[modality] = placeholders + offset += placeholders.length + + prompt_token_ids = flatten_2d_lists( + [p.unit * p.unit_count for p in mm_placeholders.values()]) + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) + + return DummyData( + seq_data=SequenceData.from_seqs(prompt_token_ids), + multi_modal_data=self._get_dummy_mm_kwargs(mm_counts), + multi_modal_placeholders={ + modality: [p.to_range()] + for modality, p in mm_placeholders.items() + }, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b73daee98bd80..6ab6c0fe2f12e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -15,7 +15,7 @@ from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import MultiModalProcessor +from .processing import BaseMultiModalProcessor from .video import VideoPlugin if TYPE_CHECKING: @@ -26,7 +26,7 @@ N = TypeVar("N", bound=Type[nn.Module]) MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], - MultiModalProcessor] + BaseMultiModalProcessor] """ Constructs a :class:`MultiModalProcessor` instance from the context. 
@@ -200,9 +200,12 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) - def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: + def get_max_tokens_by_modality( + self, + model_config: "ModelConfig", + ) -> Mapping[str, int]: """ - Get the maximum number of multi-modal tokens + Get the maximum number of tokens from each modality for profiling the memory usage of a model. See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. @@ -212,9 +215,23 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ limits_per_plugin = self._limits_by_model[model_config] - return sum((limits_per_plugin[key] * - plugin.get_max_multimodal_tokens(model_config)) - for key, plugin in self._plugins.items()) + return { + key: (limits_per_plugin[key] * + plugin.get_max_multimodal_tokens(model_config)) + for key, plugin in self._plugins.items() + } + + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: + """ + Get the maximum number of multi-modal tokens + for profiling the memory usage of a model. + + See :meth:`MultiModalPlugin.get_max_multimodal_tokens` for more details. + + Note: + This should be called after :meth:`init_mm_limits_per_prompt`. + """ + return sum(self.get_max_tokens_by_modality(model_config).values()) def init_mm_limits_per_prompt( self, @@ -270,7 +287,8 @@ def register_processor( factory: MultiModalProcessorFactory, ): """ - Register a multi-modal processor to a model class. + Register a multi-modal processor to a model class. The processor + is constructed lazily, hence a factory method should be passed. When the model receives multi-modal data, the provided function is invoked to transform the data into a dictionary of model inputs. 
@@ -307,7 +325,7 @@ def create_processor( self, model_config: "ModelConfig", tokenizer: AnyTokenizer, - ) -> MultiModalProcessor: + ) -> BaseMultiModalProcessor: """ Create a multi-modal processor for a specific model and tokenizer. """ diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index d4333b7519b47..c898ca4e6573e 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -535,11 +535,13 @@ def repeat_and_pad_placeholder_tokens( return new_prompt, new_token_ids, placeholder_ranges -def consecutive_placeholder_ranges(num_items: int, - item_size: int) -> List[PlaceholderRange]: +def consecutive_placeholder_ranges( + num_items: int, + item_size: int, + initial_offset: int = 0) -> List[PlaceholderRange]: """Returns a list of consecutive PlaceholderRanges of a fixed size""" return [ - PlaceholderRange(offset=i * item_size, length=item_size) - for i in range(num_items) + PlaceholderRange(offset=initial_offset + i * item_size, + length=item_size) for i in range(num_items) ] diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 7cb8ac4b0a1e0..419237c252ffd 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,5 +1,5 @@ from .interface import _Backend # noqa: F401 -from .interface import Platform, PlatformEnum, UnspecifiedPlatform +from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform current_platform: Platform @@ -120,4 +120,4 @@ def cuda_is_jetson() -> bool: else: current_platform = UnspecifiedPlatform() -__all__ = ['Platform', 'PlatformEnum', 'current_platform'] +__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index b5333fbd6f502..e5142b985d1f2 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import psutil import torch @@ -37,6 +37,10 @@ def get_default_attn_backend(cls, 
selected_backend: _Backend) -> _Backend: def get_device_total_memory(cls, device_id: int = 0) -> int: return psutil.virtual_memory().total + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + @classmethod def inference_mode(cls): return torch.no_grad() @@ -46,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 846a1869da228..edaf377b501df 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -4,7 +4,7 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, Callable, List, TypeVar +from typing import TYPE_CHECKING, Callable, List, Optional, TypeVar import pynvml import torch @@ -88,6 +88,16 @@ def get_device_name(cls, device_id: int = 0) -> str: def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + if enforce_eager: + logger.warning( + "To see benefits of async output processing, enable CUDA " + "graph. 
Since, enforce-eager is enabled, async output " + "processor cannot be used") + return False + return True + @classmethod def is_full_nvlink(cls, device_ids: List[int]) -> bool: raise NotImplementedError diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 10aaa6d54962c..7f22bee3eaa74 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import torch @@ -20,6 +20,10 @@ class HpuPlatform(Platform): def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: return _Backend.HPU_ATTN + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + @staticmethod def inference_mode(): return torch.no_grad() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index eac2b413f9271..db06d2c18e681 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,15 +1,20 @@ import enum +import platform import random from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union import numpy as np import torch +from vllm.logger import init_logger + if TYPE_CHECKING: from vllm.config import VllmConfig else: VllmConfig = None +logger = init_logger(__name__) + class _Backend(enum.Enum): FLASH_ATTN = enum.auto() @@ -37,6 +42,14 @@ class PlatformEnum(enum.Enum): UNSPECIFIED = enum.auto() +class CpuArchEnum(enum.Enum): + X86 = enum.auto() + ARM = enum.auto() + POWERPC = enum.auto() + OTHER = enum.auto() + UNKNOWN = enum.auto() + + class DeviceCapability(NamedTuple): major: int minor: int @@ -138,6 +151,13 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: """Get the total memory of a device in bytes.""" raise NotImplementedError + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + """ + Check if the current platform supports async output. 
+ """ + raise NotImplementedError + @classmethod def inference_mode(cls): """A device-specific wrapper of `torch.inference_mode`. @@ -184,6 +204,23 @@ def verify_quantization(cls, quant: str) -> None: f"{quant} quantization is currently not supported in " f"{cls.device_name}.") + @classmethod + def get_cpu_architecture(cls) -> CpuArchEnum: + """ + Determine the CPU architecture of the current system. + Returns CpuArchEnum indicating the architecture type. + """ + machine = platform.machine().lower() + + if machine in ("x86_64", "amd64", "i386", "i686"): + return CpuArchEnum.X86 + elif machine.startswith("arm") or machine.startswith("aarch"): + return CpuArchEnum.ARM + elif machine.startswith("ppc"): + return CpuArchEnum.POWERPC + + return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index 87655ea198303..1e5c4bddfa24f 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional from .interface import Platform, PlatformEnum @@ -18,6 +18,10 @@ class NeuronPlatform(Platform): def get_device_name(cls, device_id: int = 0) -> str: return "neuron" + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 29b61e955d9ab..e0f8e8b4b49fe 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import torch @@ -37,6 +37,10 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend: def get_device_name(self, device_id: int = 0) -> str: return "openvino" + 
@classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return False + @classmethod def inference_mode(self): return torch.inference_mode(mode=True) diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index d2f7cd40e25b2..a553956d6cd07 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,6 +1,6 @@ import os from functools import lru_cache, wraps -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional import torch from amdsmi import (AmdSmiException, amdsmi_get_gpu_board_info, @@ -139,6 +139,16 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + if enforce_eager: + logger.warning( + "To see benefits of async output processing, enable CUDA " + "graph. Since, enforce-eager is enabled, async output " + "processor cannot be used") + return False + return True + @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: parallel_config = vllm_config.parallel_config diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index b138f7e1c54c5..10d874349f36b 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import torch @@ -35,6 +35,10 @@ def get_device_name(cls, device_id: int = 0) -> str: def get_device_total_memory(cls, device_id: int = 0) -> int: raise NotImplementedError + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + @classmethod def inference_mode(cls): return torch.no_grad() diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py index 9665786f4c499..11dbd04d55671 100644 --- a/vllm/platforms/xpu.py +++ b/vllm/platforms/xpu.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing 
import TYPE_CHECKING, Optional import torch @@ -41,6 +41,10 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.xpu.get_device_properties(device_id) return device_props.total_memory + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + @staticmethod def inference_mode(): return torch.no_grad() diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 3c64726ca3344..17f604ea0e202 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -4,6 +4,7 @@ import torch import vllm.envs as envs +from vllm.platforms import current_platform logger = logging.getLogger(__name__) @@ -25,6 +26,23 @@ def load_general_plugins(): os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' # see https://github.com/vllm-project/vllm/issues/10619 torch._inductor.config.compile_threads = 1 + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa + os.environ['TORCH_COMPILE_DISABLE'] = 'True' + if current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + global plugins_loaded if plugins_loaded: return @@ -39,7 +57,7 @@ def load_general_plugins(): discovered_plugins = entry_points(group='vllm.general_plugins') if len(discovered_plugins) == 0: - logger.info("No plugins 
found.") + logger.debug("No plugins found.") return logger.info("Available plugins:") for plugin in discovered_plugins: diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 166a33c03451e..0310be0d9d886 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -305,8 +305,9 @@ def __post_init__(self) -> None: raise ValueError( f"best_of must be greater than or equal to n, " f"got n={self.n} and best_of={self.best_of}.") - self._real_n = self.n - self.n = self.best_of + if not self._real_n: + self._real_n = self.n + self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index d249b37c780e4..676ac5eb3609d 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -120,6 +120,9 @@ def sampler_output( indices_of_seq_with_bonus_tokens) model_outputs.append(model_output) + # move indices to device to avoid stream sync + indices_of_seq_with_bonus_tokens = torch.tensor( + indices_of_seq_with_bonus_tokens, device=self.device) filtered_model_outputs = self._filter_model_output( model_outputs, indices_of_seq_with_bonus_tokens) return filtered_model_outputs, True @@ -189,7 +192,7 @@ def _expand_execute_model_request( @staticmethod def _filter_model_output( expanded_batch_outputs: List[SamplerOutput], - output_indices_to_retain: List[int]) -> List[SamplerOutput]: + output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: """ Filters the model output to include only the specified sequence outputs. This method contracts the expanded batch output from the @@ -199,8 +202,8 @@ def _filter_model_output( Args: expanded_batch_output (List[SamplerOutput]): The expanded output batch from the model. - output_indices_to_retain (List[int]): Indices of the model outputs - to retain. + output_indices_to_retain (torch.Tensor): Indices of the model + outputs to retain. 
Returns: List[SamplerOutput]: A list containing the filtered model diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 53634f7b0b366..2689802161987 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -54,6 +54,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": speculative_config: SpeculativeConfig = vllm_config.speculative_config assert speculative_config is not None + if vllm_config.parallel_config.pipeline_parallel_size > 1: + raise NotImplementedError("Speculative decoding is currently " + "incompatible with pipeline parallelism") + draft_worker_kwargs = kwargs.copy() kwargs["model_runner_cls"] = TargetModelRunner @@ -104,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. 
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 6a114b513f382..c0b3d2585a962 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -1,7 +1,7 @@ from typing import Optional, Type -from vllm.config import (ModelConfig, ParallelConfig, SchedulerConfig, - TokenizerPoolConfig) +from vllm.config import (LoRAConfig, ModelConfig, ParallelConfig, + SchedulerConfig, TokenizerPoolConfig) from vllm.executor.ray_utils import ray from .base_tokenizer_group import AnyTokenizer, BaseTokenizerGroup @@ -16,10 +16,11 @@ def init_tokenizer_from_configs(model_config: ModelConfig, scheduler_config: SchedulerConfig, parallel_config: ParallelConfig, - enable_lora: bool): + lora_config: LoRAConfig): init_kwargs = dict(tokenizer_id=model_config.tokenizer, - enable_lora=enable_lora, + enable_lora=bool(lora_config), max_num_seqs=scheduler_config.max_num_seqs, + max_loras=lora_config.max_loras if lora_config else 0, max_input_length=None, tokenizer_mode=model_config.tokenizer_mode, trust_remote_code=model_config.trust_remote_code, diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index e516eeabaadef..761b07f34d2f9 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -21,8 +21,9 @@ def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int, self.enable_lora = enable_lora self.max_input_length = max_input_length self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config) + max_loras = tokenizer_config.get("max_loras", 0) self.lora_tokenizers = LRUCache[AnyTokenizer]( - capacity=max_num_seqs if enable_lora else 0) + capacity=max(max_loras, max_num_seqs) if enable_lora else 0) @classmethod def from_config(cls, tokenizer_pool_config: 
Optional[TokenizerPoolConfig], diff --git a/vllm/utils.py b/vllm/utils.py index 04e48bfb6b645..35940c72f5d9b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -24,9 +24,9 @@ from collections.abc import Iterable, Mapping from functools import lru_cache, partial, wraps from platform import uname -from typing import (Any, AsyncGenerator, Awaitable, Callable, Dict, Generic, - Hashable, List, Literal, Optional, OrderedDict, Set, Tuple, - Type, TypeVar, Union, overload) +from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, + Dict, Generic, Hashable, List, Literal, Optional, + OrderedDict, Set, Tuple, Type, TypeVar, Union, overload) from uuid import uuid4 import numpy as np @@ -43,11 +43,14 @@ from vllm.logger import enable_trace_function_call, init_logger from vllm.platforms import current_platform +if TYPE_CHECKING: + from vllm.config import VllmConfig + logger = init_logger(__name__) # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/serving/compatibility_matrix.rst +# Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ @@ -452,17 +455,6 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) -@lru_cache(maxsize=None) -def get_vllm_instance_id() -> str: - """ - If the environment variable VLLM_INSTANCE_ID is set, return it. - Otherwise, return a random UUID. - Instance id represents an instance of the VLLM. All processes in the same - instance should have the same instance id. 
- """ - return envs.VLLM_INSTANCE_ID or f"vllm-instance-{random_uuid()}" - - @lru_cache(maxsize=None) def in_wsl() -> bool: # Reference: https://github.com/microsoft/WSL/issues/4071 @@ -1114,7 +1106,7 @@ def find_nccl_library() -> str: return so_file -def enable_trace_function_call_for_thread() -> None: +def enable_trace_function_call_for_thread(vllm_config: "VllmConfig") -> None: """Set up function tracing for the current thread, if enabled via the VLLM_TRACE_FUNCTION environment variable """ @@ -1126,7 +1118,8 @@ def enable_trace_function_call_for_thread() -> None: filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" f"_thread_{threading.get_ident()}_" f"at_{datetime.datetime.now()}.log").replace(" ", "_") - log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(), + log_path = os.path.join(tmp_dir, "vllm", + f"vllm-instance-{vllm_config.instance_id}", filename) os.makedirs(os.path.dirname(log_path), exist_ok=True) enable_trace_function_call(log_path) @@ -1772,7 +1765,7 @@ def direct_register_custom_op( library object. If you want to bind the operator to a different library, make sure the library object is alive when the operator is used. """ - if is_in_doc_build(): + if is_in_doc_build() or not supports_custom_op(): return import torch.library if hasattr(torch.library, "infer_schema"): diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d37989055c2e5..251a103e60f06 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -138,14 +138,25 @@ def forward( # Profiling run. return output - num_actual_tokens = attn_metadata.num_actual_tokens + # IMPORTANT! + # NOTE(woosuk): With piece-wise CUDA graphs, this method is executed in + # eager-mode PyTorch. Thus, we need to be careful about any CPU overhead + # in this method. For example, `view` and `slice` (or `[:n]`) operations + # are surprisingly slow even in the case they do not invoke any GPU ops. 
+ # Minimize the PyTorch ops in this method as much as possible. + # Whenever making a change in this method, please benchmark the + # performance to make sure it does not introduce any overhead. + num_actual_tokens = attn_metadata.num_actual_tokens # Reshape the input keys and values and store them in the cache. - key_cache = kv_cache[0] - value_cache = kv_cache[1] + # NOTE(woosuk): Here, key and value are padded while slot_mapping is + # not padded. However, we don't need to do key[:num_actual_tokens] and + # value[:num_actual_tokens] because the reshape_and_cache_flash op uses + # the slot_mapping's shape to determine the number of actual tokens. + key_cache, value_cache = kv_cache.unbind(0) torch.ops._C_cache_ops.reshape_and_cache_flash( - key[:num_actual_tokens], - value[:num_actual_tokens], + key, + value, key_cache, value_cache, attn_metadata.slot_mapping, diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index f1f26f4e8d443..1203d35fc985f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -73,12 +73,12 @@ def __init__( # has the Transformer architecture (e.g., ViT). # FIXME(woosuk): Below are placeholder values. We need to calculate the # actual values from the configurations. - self.max_num_encoder_input_tokens = 2048 + self.max_num_encoder_input_tokens = 16384 # NOTE(woosuk): For the models without encoder (e.g., text-only models), # the encoder cache will not be initialized and used, regardless of # the cache size. This is because the memory space for the encoder cache # is preallocated in the profiling run. 
- self.encoder_cache_manager = EncoderCacheManager(cache_size=2048) + self.encoder_cache_manager = EncoderCacheManager(cache_size=16384) def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 967124fd850ea..3cf0e610ae7af 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,11 +1,11 @@ import enum from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from typing import List, Optional, Union import msgspec from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -35,9 +35,8 @@ class EngineCoreRequest: # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] - mm_data: Optional[MultiModalDataDict] + mm_inputs: Optional[List[MultiModalKwargs]] mm_placeholders: Optional[MultiModalPlaceholderDict] - mm_processor_kwargs: Optional[Dict[str, Any]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 7335c637f0f79..0bcccda2bf329 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -51,7 +51,7 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - enable_lora=bool(vllm_config.lora_config)) + lora_config=vllm_config.lora_config) self.tokenizer.ping() # Request streams (map of request_id -> AsyncStream). @@ -65,7 +65,12 @@ def __init__( input_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput). 
- self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_client( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 34f99dd30ef2e..751eb3b40a68d 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -67,6 +67,7 @@ def __init__( def _initialize_kv_caches(self, cache_config: CacheConfig) -> Tuple[int, int]: + start = time.time() num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( ) @@ -80,18 +81,14 @@ def _initialize_kv_caches(self, num_cpu_blocks = 0 self.model_executor.initialize_cache(num_gpu_blocks) + elapsed = time.time() - start + logger.info(("init engine (profile, create kv cache, " + "warmup model) took %.2f seconds"), elapsed) return num_gpu_blocks, num_cpu_blocks def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" - req = Request.from_engine_core_request(request) - # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may - # take 10-50 ms, which can cause a spike in the latency. We should - # consider moving this to a separate thread. 
- if req.mm_data: - req.mm_inputs = self.mm_input_mapper.process_inputs( - req.mm_data, req.mm_processor_kwargs) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index bd19d998a4adb..994e68669108e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,5 +1,7 @@ from typing import Dict, List, Mapping, Optional, Type, Union +from typing_extensions import TypeVar + from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase @@ -12,7 +14,8 @@ from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.transformers_utils.tokenizer_group import ( + BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import UsageContext from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer @@ -21,6 +24,8 @@ logger = init_logger(__name__) +_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) + class LLMEngine: """Legacy LLMEngine for backwards compatibility.""" @@ -46,7 +51,7 @@ def __init__( model_config=vllm_config.model_config, scheduler_config=vllm_config.scheduler_config, parallel_config=vllm_config.parallel_config, - enable_lora=bool(vllm_config.lora_config)) + lora_config=vllm_config.lora_config) self.tokenizer.ping() # Processor (convert Inputs --> EngineCoreRequests) @@ -169,5 +174,18 @@ def start_profile(self): def stop_profile(self): self.engine_core.profile(False) - def get_tokenizer_group(self, group_type): - pass + def get_tokenizer_group( + self, + group_type: Type[_G] = BaseTokenizerGroup, + ) -> _G: + tokenizer_group = self.tokenizer + + if tokenizer_group is None: + raise ValueError("Unable 
to get tokenizer because " + "skip_tokenizer_init is True") + if not isinstance(tokenizer_group, group_type): + raise TypeError("Invalid type of tokenizer group. " + f"Expected type: {group_type}, but " + f"found type: {type(tokenizer_group)}") + + return tokenizer_group diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index 594c973678235..7ad6882b04520 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -12,6 +12,7 @@ def __init__( model_config: ModelConfig, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): + self.model_config = model_config self.mm_registry = mm_registry self.multi_modal_input_mapper = mm_registry.create_input_mapper( model_config) @@ -32,7 +33,7 @@ def process_inputs( num_images = len(image_inputs) for i in range(num_images): mm_input = self.multi_modal_input_mapper( - {"image": [image_inputs[i]]}, + {"image": image_inputs[i]}, mm_processor_kwargs=mm_processor_kwargs, ) mm_inputs.append(mm_input) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5c1577190c75a..120fc64969552 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -7,13 +7,15 @@ from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + MultiModalRegistry) from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine.mm_input_mapper import MMInputMapper class Processor: @@ -39,6 +41,9 @@ def __init__( 
self.input_processor = input_registry.create_input_processor( model_config) + # Multi-modal (huggingface) input mapper + self.mm_input_mapper = MMInputMapper(model_config) + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. # This ideally should releases the GIL, so we should not block the # asyncio loop while this is running. @@ -96,6 +101,17 @@ def process_inputs( sampling_params.update_from_generation_config( self.generation_config_fields, eos_token_id) + # Preprocess multi-modal data + if len(decoder_inputs.multi_modal_data) == 0: + mm_inputs = None + elif isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs): + mm_inputs = [decoder_inputs.multi_modal_data] + else: + mm_inputs = self.mm_input_mapper.process_inputs( + decoder_inputs.multi_modal_data, + decoder_inputs.mm_processor_kwargs, + ) + # Make Request for Detokenizer. detokenizer_request = DetokenizerRequest( request_id, @@ -113,9 +129,8 @@ def process_inputs( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, - decoder_inputs.multi_modal_data, + mm_inputs, decoder_inputs.multi_modal_placeholders, - decoder_inputs.mm_processor_kwargs, sampling_params, eos_token_id, arrival_time, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 51fb4003e5fe0..6bc1e4d5c769f 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -45,9 +45,6 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 - # Raw multimodal data before the mm input mapper (e.g., PIL images). - self.mm_data = self.inputs.multi_modal_data - self.mm_processor_kwargs = self.inputs.mm_processor_kwargs mm_positions = self.inputs.multi_modal_placeholders if mm_positions: # FIXME(woosuk): Support other modalities. @@ -55,7 +52,10 @@ def __init__( else: self.mm_positions = [] # Output of the mm input mapper (e.g., image tensors). 
- self.mm_inputs: List[MultiModalKwargs] = [] + if self.inputs.multi_modal_inputs: + self.mm_inputs = self.inputs.multi_modal_inputs + else: + self.mm_inputs: List[MultiModalKwargs] = [] @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": @@ -64,9 +64,10 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": inputs=token_inputs( prompt_token_ids=request.prompt_token_ids, prompt=request.prompt, - multi_modal_data=request.mm_data, + multi_modal_data=None, + multi_modal_inputs=request.mm_inputs, multi_modal_placeholders=request.mm_placeholders, - mm_processor_kwargs=request.mm_processor_kwargs, + mm_processor_kwargs=None, ), sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, @@ -110,7 +111,7 @@ def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: - return len(self.mm_data) > 0 + return len(self.mm_inputs) > 0 @property def num_encoder_inputs(self) -> int: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py new file mode 100644 index 0000000000000..457784bb0287c --- /dev/null +++ b/vllm/v1/worker/gpu_input_batch.py @@ -0,0 +1,280 @@ +# Datastructures defining an input batch + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, Set + +import numpy as np +import torch + +from vllm.multimodal import MultiModalKwargs +from vllm.sampling_params import SamplingParams, SamplingType +from vllm.v1.sample.metadata import SamplingMetadata + +if TYPE_CHECKING: + from vllm.multimodal.inputs import PlaceholderRange + + +@dataclass +class CachedRequestState: + + req_id: str + prompt_token_ids: List[int] + prompt: Optional[str] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] + sampling_params: SamplingParams + generator: Optional[torch.Generator] + + block_ids: List[int] + num_computed_tokens: int + 
output_token_ids: List[int] + + @property + def num_tokens(self) -> int: + return len(self.prompt_token_ids) + len(self.output_token_ids) + + +class InputBatch: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + device: torch.device, + pin_memory: bool, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.device = device + self.pin_memory = pin_memory + + self.req_ids: List[Optional[str]] = [None] * max_num_reqs + self.req_id_to_index: Dict[str, int] = {} + + self.token_ids_cpu = np.empty((max_num_reqs, max_model_len), + dtype=np.int32) + self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + + # Attention-related. + self.block_table = torch.zeros((max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32) + self.block_table_cpu_tensor = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_cpu = self.block_table_cpu_tensor.numpy() + + # Sampling-related. 
+ self.temperature = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.temperature_cpu = self.temperature_cpu_tensor.numpy() + self.greedy_reqs: Set[str] = set() + self.random_reqs: Set[str] = set() + + self.top_p = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device=device) + self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.float32, + device="cpu", + pin_memory=pin_memory) + self.top_p_cpu = self.top_p_cpu_tensor.numpy() + self.top_p_reqs: Set[str] = set() + + self.top_k = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device=device) + self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), + dtype=torch.int32, + device="cpu", + pin_memory=pin_memory) + self.top_k_cpu = self.top_k_cpu_tensor.numpy() + self.top_k_reqs: Set[str] = set() + + # req_index -> generator + self.generators: Dict[int, torch.Generator] = {} + + self.num_logprobs: Dict[str, int] = {} + self.prompt_logprob_reqs: Set[str] = set() + + def add_request( + self, + request: "CachedRequestState", + req_index: Optional[int] = None, + ) -> None: + if req_index is None: + req_index = self.num_reqs + assert req_index < self.max_num_reqs + + req_id = request.req_id + self.req_ids[req_index] = req_id + self.req_id_to_index[req_id] = req_index + + # Copy the prompt token ids and output token ids. 
+ num_prompt_tokens = len(request.prompt_token_ids) + self.token_ids_cpu[ + req_index, :num_prompt_tokens] = request.prompt_token_ids + start_idx = num_prompt_tokens + end_idx = start_idx + len(request.output_token_ids) + self.token_ids_cpu[req_index, + start_idx:end_idx] = request.output_token_ids + + self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens + num_blocks = len(request.block_ids) + self.block_table_cpu[req_index, :num_blocks] = request.block_ids + + sampling_params = request.sampling_params + self.temperature_cpu[req_index] = sampling_params.temperature + if sampling_params.sampling_type == SamplingType.GREEDY: + self.greedy_reqs.add(req_id) + else: + self.random_reqs.add(req_id) + + self.top_p_cpu[req_index] = sampling_params.top_p + if sampling_params.top_p < 1: + self.top_p_reqs.add(req_id) + self.top_k_cpu[req_index] = sampling_params.top_k + if sampling_params.top_k > 0: + self.top_k_reqs.add(req_id) + + self.generators[req_index] = request.generator + + num_logprobs = sampling_params.logprobs + if num_logprobs is not None and num_logprobs > 0: + self.num_logprobs[req_id] = num_logprobs + if sampling_params.prompt_logprobs: + self.prompt_logprob_reqs.add(req_id) + + def remove_request(self, req_id: str) -> Optional[int]: + req_index = self.req_id_to_index.pop(req_id, None) + if req_index is None: + return None + self.req_ids[req_index] = None + + self.greedy_reqs.discard(req_id) + self.random_reqs.discard(req_id) + self.top_p_reqs.discard(req_id) + self.top_k_reqs.discard(req_id) + self.generators.pop(req_index, None) + self.num_logprobs.pop(req_id, None) + self.prompt_logprob_reqs.discard(req_id) + return req_index + + def clear(self) -> None: + self.req_ids = [None] * self.max_num_reqs + self.req_id_to_index.clear() + self.greedy_reqs.clear() + self.random_reqs.clear() + self.top_p_reqs.clear() + self.top_k_reqs.clear() + self.generators.clear() + self.num_logprobs.clear() + self.prompt_logprob_reqs.clear() + + def condense(self, 
empty_req_indices: List[int]) -> None: + if self.num_reqs == 0: + # The batched states are empty. + return + + # NOTE(woosuk): This function assumes that the empty_req_indices + # is sorted in descending order. + last_req_index = self.num_reqs + len(empty_req_indices) - 1 + while empty_req_indices: + # Find the largest non-empty index. + while last_req_index in empty_req_indices: + last_req_index -= 1 + + # Find the smallest empty index. + empty_index = empty_req_indices.pop() + if empty_index >= last_req_index: + break + + # Swap the states. + req_id = self.req_ids[last_req_index] + self.req_ids[empty_index] = req_id + self.req_ids[last_req_index] = None + self.req_id_to_index[req_id] = empty_index + + # TODO(woosuk): Optimize the copy of token_ids_cpu and + # block_table_cpu. + self.token_ids_cpu[empty_index] = self.token_ids_cpu[ + last_req_index] + self.num_computed_tokens_cpu[ + empty_index] = self.num_computed_tokens_cpu[last_req_index] + self.block_table_cpu[empty_index] = self.block_table_cpu[ + last_req_index] + self.temperature_cpu[empty_index] = self.temperature_cpu[ + last_req_index] + self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] + self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator + + # Decrement last_req_index since it is now empty. 
+ last_req_index -= 1 + + def make_sampling_metadata( + self, + skip_copy: bool = False, + ) -> SamplingMetadata: + if not skip_copy: + self.temperature[:self.num_reqs].copy_( + self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True) + self.top_p[:self.num_reqs].copy_( + self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) + self.top_k[:self.num_reqs].copy_( + self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) + return SamplingMetadata( + temperature=self.temperature[:self.num_reqs], + all_greedy=self.all_greedy, + all_random=self.all_random, + top_p=self.top_p[:self.num_reqs], + top_k=self.top_k[:self.num_reqs], + no_top_p=self.no_top_p, + no_top_k=self.no_top_k, + generators=self.generators, + max_num_logprobs=self.max_num_logprobs, + ) + + @property + def num_reqs(self) -> int: + return len(self.req_id_to_index) + + @property + def all_greedy(self) -> bool: + return len(self.random_reqs) == 0 + + @property + def all_random(self) -> bool: + return len(self.greedy_reqs) == 0 + + @property + def no_top_p(self) -> bool: + return len(self.top_p_reqs) == 0 + + @property + def no_top_k(self) -> bool: + return len(self.top_k_reqs) == 0 + + @property + def max_num_logprobs(self) -> int: + return max(self.num_logprobs.values()) if self.num_logprobs else 0 + + @property + def no_logprob(self) -> bool: + return len(self.num_logprobs) == 0 + + @property + def no_prompt_logprob(self) -> bool: + return len(self.prompt_logprob_reqs) == 0 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1fa47f553dfd6..c601aca13feaf 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,14 +1,12 @@ import gc import time -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple import numpy as np import torch import torch.distributed import torch.nn as nn -from 
vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context @@ -16,16 +14,16 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.multimodal import MultiModalKwargs -from vllm.sampling_params import SamplingParams, SamplingType +from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend, FlashAttentionMetadata) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch if TYPE_CHECKING: - from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -100,7 +98,11 @@ def __init__( == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. - self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + # The convention is different. + # self.cudagraph_batch_sizes sorts in ascending order. + # The batch sizes in the config are in descending order. + self.cudagraph_batch_sizes = list( + reversed(self.vllm_config.compilation_config.capture_sizes)) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) @@ -257,7 +259,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] # where M is the max_model_len. 
- token_indices = positions_np + req_indices * self.max_model_len + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) token_indices = torch.from_numpy(token_indices) input_ids = torch.empty((total_num_scheduled_tokens, ), dtype=torch.int32, @@ -270,9 +273,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): out=input_ids) # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. block_numbers = self.input_batch.block_table_cpu_tensor.flatten()[ - token_indices // self.block_size] - block_offsets = token_indices % self.block_size + req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size] + block_offsets = torch.from_numpy(positions_np % self.block_size) slot_mapping = torch.empty((total_num_scheduled_tokens, ), dtype=torch.int32, device="cpu", @@ -548,10 +557,9 @@ def profile_run(self) -> None: torch.tensor([], dtype=torch.float32, device=self.device) for _ in range(self.num_attn_layers) ] - with set_compile_context(self.cudagraph_batch_sizes): - # Trigger compilation for general shape. - hidden_states = self._dummy_run(self.model, self.max_num_tokens, - dummy_kv_caches) + # Trigger compilation for general shape. + hidden_states = self._dummy_run(self.model, self.max_num_tokens, + dummy_kv_caches) logits = self.model.compute_logits(hidden_states, None) logits = logits[:self.max_num_tokens] # TODO(woosuk): Consider the memory usage of the sampler. @@ -574,6 +582,9 @@ def capture_model(self) -> None: # can reuse the memory pool allocated for the large shapes. with graph_capture(): for num_tokens in reversed(self.cudagraph_batch_sizes): + for _ in range(self.vllm_config.compilation_config. 
+ cudagraph_num_of_warmups): + self._dummy_run(self.model, num_tokens, self.kv_caches) self._dummy_run(self.model, num_tokens, self.kv_caches) end_time = time.perf_counter() @@ -600,269 +611,3 @@ def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: if batch_size <= size: return size return None - - -@dataclass -class CachedRequestState: - - req_id: str - prompt_token_ids: List[int] - prompt: Optional[str] - mm_inputs: List[MultiModalKwargs] - mm_positions: List["PlaceholderRange"] - sampling_params: SamplingParams - generator: Optional[torch.Generator] - - block_ids: List[int] - num_computed_tokens: int - output_token_ids: List[int] - - @property - def num_tokens(self) -> int: - return len(self.prompt_token_ids) + len(self.output_token_ids) - - -class InputBatch: - - def __init__( - self, - max_num_reqs: int, - max_model_len: int, - max_num_blocks_per_req: int, - device: torch.device, - pin_memory: bool, - ): - self.max_num_reqs = max_num_reqs - self.max_model_len = max_model_len - self.max_num_blocks_per_req = max_num_blocks_per_req - self.device = device - self.pin_memory = pin_memory - - self.req_ids: List[Optional[str]] = [None] * max_num_reqs - self.req_id_to_index: Dict[str, int] = {} - - self.token_ids_cpu = np.empty((max_num_reqs, max_model_len), - dtype=np.int32) - self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) - - # Attention-related. - self.block_table = torch.zeros((max_num_reqs, max_num_blocks_per_req), - device=self.device, - dtype=torch.int32) - self.block_table_cpu_tensor = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device="cpu", - dtype=torch.int32, - pin_memory=pin_memory, - ) - self.block_table_cpu = self.block_table_cpu_tensor.numpy() - - # Sampling-related. 
- self.temperature = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device=device) - self.temperature_cpu_tensor = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device="cpu", - pin_memory=pin_memory) - self.temperature_cpu = self.temperature_cpu_tensor.numpy() - self.greedy_reqs: Set[str] = set() - self.random_reqs: Set[str] = set() - - self.top_p = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device=device) - self.top_p_cpu_tensor = torch.empty((max_num_reqs, ), - dtype=torch.float32, - device="cpu", - pin_memory=pin_memory) - self.top_p_cpu = self.top_p_cpu_tensor.numpy() - self.top_p_reqs: Set[str] = set() - - self.top_k = torch.empty((max_num_reqs, ), - dtype=torch.int32, - device=device) - self.top_k_cpu_tensor = torch.empty((max_num_reqs, ), - dtype=torch.int32, - device="cpu", - pin_memory=pin_memory) - self.top_k_cpu = self.top_k_cpu_tensor.numpy() - self.top_k_reqs: Set[str] = set() - - # req_index -> generator - self.generators: Dict[int, torch.Generator] = {} - - self.num_logprobs: Dict[str, int] = {} - self.prompt_logprob_reqs: Set[str] = set() - - def add_request( - self, - request: "CachedRequestState", - req_index: Optional[int] = None, - ) -> None: - if req_index is None: - req_index = self.num_reqs - assert req_index < self.max_num_reqs - - req_id = request.req_id - self.req_ids[req_index] = req_id - self.req_id_to_index[req_id] = req_index - - # Copy the prompt token ids and output token ids. 
- num_prompt_tokens = len(request.prompt_token_ids) - self.token_ids_cpu[ - req_index, :num_prompt_tokens] = request.prompt_token_ids - start_idx = num_prompt_tokens - end_idx = start_idx + len(request.output_token_ids) - self.token_ids_cpu[req_index, - start_idx:end_idx] = request.output_token_ids - - self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens - num_blocks = len(request.block_ids) - self.block_table_cpu[req_index, :num_blocks] = request.block_ids - - sampling_params = request.sampling_params - self.temperature_cpu[req_index] = sampling_params.temperature - if sampling_params.sampling_type == SamplingType.GREEDY: - self.greedy_reqs.add(req_id) - else: - self.random_reqs.add(req_id) - - self.top_p_cpu[req_index] = sampling_params.top_p - if sampling_params.top_p < 1: - self.top_p_reqs.add(req_id) - self.top_k_cpu[req_index] = sampling_params.top_k - if sampling_params.top_k > 0: - self.top_k_reqs.add(req_id) - - self.generators[req_index] = request.generator - - num_logprobs = sampling_params.logprobs - if num_logprobs is not None and num_logprobs > 0: - self.num_logprobs[req_id] = num_logprobs - if sampling_params.prompt_logprobs: - self.prompt_logprob_reqs.add(req_id) - - def remove_request(self, req_id: str) -> Optional[int]: - req_index = self.req_id_to_index.pop(req_id, None) - if req_index is None: - return None - self.req_ids[req_index] = None - - self.greedy_reqs.discard(req_id) - self.random_reqs.discard(req_id) - self.top_p_reqs.discard(req_id) - self.top_k_reqs.discard(req_id) - self.generators.pop(req_index, None) - self.num_logprobs.pop(req_id, None) - self.prompt_logprob_reqs.discard(req_id) - return req_index - - def clear(self) -> None: - self.req_ids = [None] * self.max_num_reqs - self.req_id_to_index.clear() - self.greedy_reqs.clear() - self.random_reqs.clear() - self.top_p_reqs.clear() - self.top_k_reqs.clear() - self.generators.clear() - self.num_logprobs.clear() - self.prompt_logprob_reqs.clear() - - def condense(self, 
empty_req_indices: List[int]) -> None: - if self.num_reqs == 0: - # The batched states are empty. - return - - # NOTE(woosuk): This function assumes that the empty_req_indices - # is sorted in descending order. - last_req_index = self.num_reqs + len(empty_req_indices) - 1 - while empty_req_indices: - # Find the largest non-empty index. - while last_req_index in empty_req_indices: - last_req_index -= 1 - - # Find the smallest empty index. - empty_index = empty_req_indices.pop() - if empty_index >= last_req_index: - break - - # Swap the states. - req_id = self.req_ids[last_req_index] - self.req_ids[empty_index] = req_id - self.req_ids[last_req_index] = None - self.req_id_to_index[req_id] = empty_index - - # TODO(woosuk): Optimize the copy of token_ids_cpu and - # block_table_cpu. - self.token_ids_cpu[empty_index] = self.token_ids_cpu[ - last_req_index] - self.num_computed_tokens_cpu[ - empty_index] = self.num_computed_tokens_cpu[last_req_index] - self.block_table_cpu[empty_index] = self.block_table_cpu[ - last_req_index] - self.temperature_cpu[empty_index] = self.temperature_cpu[ - last_req_index] - self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] - self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] - generator = self.generators.pop(last_req_index, None) - if generator is not None: - self.generators[empty_index] = generator - - # Decrement last_req_index since it is now empty. 
- last_req_index -= 1 - - def make_sampling_metadata( - self, - skip_copy: bool = False, - ) -> SamplingMetadata: - if not skip_copy: - self.temperature[:self.num_reqs].copy_( - self.temperature_cpu_tensor[:self.num_reqs], non_blocking=True) - self.top_p[:self.num_reqs].copy_( - self.top_p_cpu_tensor[:self.num_reqs], non_blocking=True) - self.top_k[:self.num_reqs].copy_( - self.top_k_cpu_tensor[:self.num_reqs], non_blocking=True) - return SamplingMetadata( - temperature=self.temperature[:self.num_reqs], - all_greedy=self.all_greedy, - all_random=self.all_random, - top_p=self.top_p[:self.num_reqs], - top_k=self.top_k[:self.num_reqs], - no_top_p=self.no_top_p, - no_top_k=self.no_top_k, - generators=self.generators, - max_num_logprobs=self.max_num_logprobs, - ) - - @property - def num_reqs(self) -> int: - return len(self.req_id_to_index) - - @property - def all_greedy(self) -> bool: - return len(self.random_reqs) == 0 - - @property - def all_random(self) -> bool: - return len(self.greedy_reqs) == 0 - - @property - def no_top_p(self) -> bool: - return len(self.top_p_reqs) == 0 - - @property - def no_top_k(self) -> bool: - return len(self.top_k_reqs) == 0 - - @property - def max_num_logprobs(self) -> int: - return max(self.num_logprobs.values()) if self.num_logprobs else 0 - - @property - def no_logprob(self) -> bool: - return len(self.num_logprobs) == 0 - - @property - def no_prompt_logprob(self) -> bool: - return len(self.prompt_logprob_reqs) == 0 diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index c2ebf853f41d8..cfd77d14bc697 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -25,8 +25,7 @@ from vllm.utils import STR_NOT_IMPL_ENC_DEC_BACKEND, make_tensor_with_pad from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUBuilder, - ModelInputForGPUWithSamplingMetadata, - _get_graph_batch_size) + ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base 
import ( _add_attn_metadata_broadcastable_dict, _add_sampling_metadata_broadcastable_dict) @@ -465,7 +464,8 @@ def _prepare_encoder_model_input_tensors( # We will be using CUDA graph replay for this decode. max_len_of_block_table = self.get_max_block_per_batch() batch_size = len(encoder_seq_lens) - graph_batch_size = _get_graph_batch_size(batch_size) + graph_batch_size = self.vllm_config.get_graph_batch_size( + batch_size) assert graph_batch_size >= batch_size cuda_graph_pad_size = graph_batch_size - batch_size # extend the cross_block_tables and encoder_seq_lens to match diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 741bdf6346c49..94ebf926bf5a6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -18,7 +18,6 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState -from vllm.compilation.compile_context import set_compile_context from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_kv_transfer_group, get_pp_group @@ -63,16 +62,7 @@ logger = init_logger(__name__) LORA_WARMUP_RANK = 8 -_BATCH_SIZE_ALIGNMENT = 8 -# all the token sizes that **can** be captured by cudagraph. -# they can be arbitrarily large. -# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. -# the actual sizes to capture will be determined by the model, -# depending on the model's max_num_seqs. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. 
-_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) -] + _NUM_WARMUP_ITERS = 2 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") @@ -763,7 +753,6 @@ def _use_captured_graph(self, max_decode_seq_len: int, max_encoder_seq_len: int = 0) -> bool: return (decode_only and not self.runner.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture and max_encoder_seq_len <= self.runner.max_seq_len_to_capture and batch_size <= self.runner.max_batchsize_to_capture) @@ -811,7 +800,7 @@ def _get_cuda_graph_pad_size(self, max_encoder_seq_len): return -1 - graph_batch_size = _get_graph_batch_size(batch_size) + graph_batch_size = VllmConfig.get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size return graph_batch_size - batch_size @@ -1023,7 +1012,7 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = _get_max_graph_batch_size( + self.max_batchsize_to_capture = VllmConfig.get_max_graph_batch_size( self.scheduler_config.max_num_seqs) self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ @@ -1333,14 +1322,7 @@ def profile_run(self) -> None: dtype=self.model_config.dtype, device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - if self.model_config.enforce_eager: - batch_size_capture_list = [] - with set_compile_context(batch_size_capture_list): - self.execute_model(model_input, kv_caches, intermediate_tensors) + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return @@ -1459,18 +1441,14 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: dtype=self.model_config.dtype, 
device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - with self.attn_state.graph_capture( max_batch_size), graph_capture() as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. for virtual_engine in range( self.parallel_config.pipeline_parallel_size): - for batch_size in reversed(batch_size_capture_list): + for batch_size in \ + self.vllm_config.compilation_config.capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, @@ -1805,6 +1783,9 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: kv_caches: vLLM's paged memory """ + if self.vllm_config.kv_transfer_config is None: + return False + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling @@ -1812,9 +1793,6 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: # check if the current run is prefill is_prefill_run = prefill_meta is not None - if self.vllm_config.kv_transfer_config is None: - return False - return self.vllm_config.kv_transfer_config.is_kv_consumer and ( not is_profile_run) and is_prefill_run @@ -1830,6 +1808,9 @@ def need_send_kv(self, model_input, kv_caches) -> bool: kv_caches: vLLM's paged memory """ + if self.vllm_config.kv_transfer_config is None: + return False + prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling @@ -1837,9 +1818,6 @@ def need_send_kv(self, model_input, kv_caches) -> bool: # check if the current run is prefill is_prefill_run = prefill_meta is not None - if self.vllm_config.kv_transfer_config is None: - return False - return self.vllm_config.kv_transfer_config.is_kv_producer and ( not is_profile_run) and is_prefill_run @@ -1994,37 +1972,3 @@ def forward( return self.output_buffers["hidden_states"] return self.output_buffers - - -def 
_get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - -def _get_max_graph_batch_size(max_num_seqs: int) -> int: - """ - max_num_seqs: Maximum number of sequences in a batch. - _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. - - pad the max_num_seqs if necessary by calling _get_graph_batch_size, - which will deal with some edge cases like 1, 2, 4. - - if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded size. - if not, it means the padded size is larger than the largest size in - _BATCH_SIZES_TO_CAPTURE, return the largest size in _BATCH_SIZES_TO_CAPTURE. - """ - padded_size = _get_graph_batch_size(max_num_seqs) - if padded_size in _BATCH_SIZES_TO_CAPTURE: - return padded_size - assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 3ee0fb4dc943e..3ca0d88a42183 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -817,7 +817,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 205f8a337ce6c..0bf522d5333ed 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -489,7 +489,7 @@ def 
model_profile_run(): block_size = cache_config.block_size seq_num_blocks = (seq_len + block_size - 1) // block_size - seq_data, dummy_multi_modal_data = input_registry \ + dummy_data = input_registry \ .dummy_data_for_profiling(model_config, seq_len, mm_registry) @@ -498,11 +498,11 @@ def model_profile_run(): seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=block_tables, lora_request=None, - multi_modal_data=dummy_multi_modal_data) + multi_modal_data=dummy_data.multi_modal_data) seqs.append(seq) self.model_runner.block_size = tmp_cache_config.block_size diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index f43635464ef00..5f71ec0c14df8 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # Reminder: Please update docs/source/usage/compatibility_matrix.rst # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7c0bc5a678956..6d00102e0a324 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -439,7 +439,7 @@ def init_worker(self, *args, **kwargs): Here we inject some common logic before initializing the worker. Arguments are passed to the worker class constructor. """ - enable_trace_function_call_for_thread() + enable_trace_function_call_for_thread(self.vllm_config) # see https://github.com/NVIDIA/nccl/issues/1234 os.environ['NCCL_CUMEM_ENABLE'] = '0'