diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index b2e910e1ba8a7..a67fc89d54e60 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do done lm_eval --model hf \ - --model_args pretrained=$MODEL,parallelize=True \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 4d32b49a4fac3..65be3c5d93b20 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do done lm_eval --model vllm \ - --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh index b4fdde6dab425..26f33b744289a 100644 --- a/.buildkite/lm-eval-harness/run-tests.sh +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do done # Parse list of configs. 
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index f90e464288cf1..7cf05610b9953 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -56,7 +56,7 @@ def read_markdown(file): if os.path.exists(file): - with open(file, "r") as f: + with open(file) as f: return f.read() + "\n" else: return f"{file} not found.\n" @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving): # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 6059588fe7277..052060c576300 100644 --- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -72,7 +72,7 @@ def main(args): # collect results for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: + with open(test_file) as f: results = results + json.loads(f.read()) # generate markdown table @@ -80,7 +80,7 @@ def main(args): md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - with open(args.description, "r") as f: + with open(args.description) as f: description = f.read() description = description.format( diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh index e9d7d6a8d760a..fb5063db86942 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -50,31 +50,30 @@ launch_trt_server() { git clone https://github.com/triton-inference-server/tensorrtllm_backend.git git lfs install cd tensorrtllm_backend - git checkout $trt_llm_version - tensorrtllm_backend_dir=$(pwd) + git checkout "$trt_llm_version" git submodule update --init --recursive # build trtllm engine cd /tensorrtllm_backend - cd ./tensorrt_llm/examples/${model_type} + cd "./tensorrt_llm/examples/${model_type}" python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype 
${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" trtllm-build \ - --checkpoint_dir ${trt_model_path} \ + --checkpoint_dir "${trt_model_path}" \ --use_fused_mlp \ --reduce_fusion disable \ --workers 8 \ - --gpt_attention_plugin ${model_dtype} \ - --gemm_plugin ${model_dtype} \ - --tp_size ${model_tp_size} \ - --max_batch_size ${max_batch_size} \ - --max_input_len ${max_input_len} \ - --max_seq_len ${max_seq_len} \ - --max_num_tokens ${max_num_tokens} \ - --output_dir ${trt_engine_path} + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" # handle triton protobuf files and launch triton server cd /tensorrtllm_backend @@ -82,15 +81,15 @@ launch_trt_server() { cp -r all_models/inflight_batcher_llm/* triton_model_repo/ cd triton_model_repo rm -rf ./tensorrt_llm/1/* - cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false - python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 - python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false - python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size - python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" cd /tensorrtllm_backend python3 scripts/launch_triton_server.py \ - --world_size=${model_tp_size} \ + --world_size="${model_tp_size}" \ --model_repo=/tensorrtllm_backend/triton_model_repo & } @@ -98,10 +97,7 @@ launch_trt_server() { launch_tgi_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 
'has("fp8")' >/dev/null; then @@ -129,10 +125,7 @@ launch_tgi_server() { launch_lmdeploy_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") server_command="lmdeploy serve api_server $model \ @@ -149,10 +142,7 @@ launch_sglang_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -185,10 +175,7 @@ launch_vllm_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -217,19 +204,19 @@ launch_vllm_server() { main() { - if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then launch_trt_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then launch_tgi_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then launch_lmdeploy_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then launch_sglang_server fi diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index c6a1bbdeb7d48..686f70dbece6c 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -16,10 +16,10 @@ main() { fi # initial annotation - description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" # download results - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" mkdir -p results/ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ ls @@ -30,15 +30,15 @@ main() { /workspace/buildkite-agent artifact upload "results.zip" # upload benchmarking scripts - cd $VLLM_SOURCE_CODE_LOC/ + cd "$VLLM_SOURCE_CODE_LOC/" zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # upload benchmarking pipeline /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < 
nightly-annotation.md @@ -75,4 +75,4 @@ main() { # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } -main "$@" \ No newline at end of file +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index dd8c15e0700eb..3f38cf5137535 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -12,7 +12,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" echo "GPU type is $gpu_type" } @@ -102,7 +102,7 @@ kill_gpu_processes() { pkill -f text-generation pkill -f lmdeploy - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done } @@ -119,8 +119,8 @@ wait_for_server() { ensure_installed() { # Ensure that the given command is installed by apt-get local cmd=$1 - if ! which $cmd >/dev/null; then - apt-get update && apt-get install -y $cmd + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" fi } @@ -173,13 +173,11 @@ run_serving_tests() { echo "Reuse previous server for test case $test_name" else kill_gpu_processes - bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ "$server_params" "$common_params" fi - wait_for_server - - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." else @@ -190,13 +188,13 @@ run_serving_tests() { # prepare tokenizer # this is required for lmdeploy. 
- cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" rm -rf /tokenizer_cache mkdir /tokenizer_cache python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$model" \ --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" # change model name for lmdeploy (it will not follow standard hf name) @@ -307,11 +305,11 @@ run_serving_tests() { prepare_dataset() { # download sharegpt dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" echo "" > sonnet_4x.txt for _ in {1..4} do @@ -339,17 +337,17 @@ main() { prepare_dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # run the test - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" # upload benchmark results to buildkite python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index a0b9a409b758d..d397b05cdff23 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -17,7 +17,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') echo "GPU type is $gpu_type" } @@ -93,7 +93,7 @@ kill_gpu_processes() { # wait until GPU memory usage smaller than 1GB - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done @@ -117,7 +117,7 @@ upload_to_buildkite() { fi # Use the determined command to annotate and upload artifacts - $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } @@ -150,7 +150,7 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." 
continue fi @@ -206,9 +206,9 @@ run_throughput_tests() { throughput_args=$(json2args "$throughput_params") # check if there is enough GPU to run the test - tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -270,7 +270,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -278,7 +278,7 @@ run_serving_tests() { server_model=$(echo "$server_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model') if [[ $server_model != "$client_model" ]]; then - echo "Server model and client model must be the same. Skip testcase $testname." + echo "Server model and client model must be the same. Skip testcase $test_name." continue fi @@ -293,8 +293,7 @@ run_serving_tests() { server_pid=$! # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "vllm server is up and running." else diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 4e4d4cd4ca3c6..92d6fad73a94c 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -36,11 +36,11 @@ # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index f16862907def1..19f7160e68a4d 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10 retries=0 while [ $retries -lt 1000 ]; do - if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then exit 0 fi @@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do sleep 5 done -exit 1 \ No newline at end of file +exit 1 diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index df201cdc7c554..902e162720b89 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script runs test inside the corresponding ROCm docker container. set -o pipefail @@ -31,8 +33,8 @@ cleanup_docker() { echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." 
# Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - # Remove unused volumes - docker volume prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." @@ -57,17 +59,17 @@ done echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" -docker pull ${image_name} +docker pull "${image_name}" remove_docker_container() { - docker rm -f ${container_name} || docker image rm -f ${image_name} || true + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true } trap remove_docker_container EXIT echo "--- Running container" HF_CACHE="$(realpath ~)/huggingface" -mkdir -p ${HF_CACHE} +mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" commands=$@ @@ -107,35 +109,36 @@ fi PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then + # assign job count as the number of shards used + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do - #replace shard arguments - commands=${commands//"--shard-id= "/"--shard-id=${GPU} "} - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} - echo "Shard ${GPU} commands:$commands" + # assign shard-id for each shard + commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + echo "Shard ${GPU} commands:$commands_gpu" docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=${GPU} \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name}_${GPU} \ - ${image_name} \ - /bin/bash -c "${commands}" \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ + /bin/bash -c "${commands_gpu}" \ |& while read -r line; do echo ">>Shard $GPU: $line"; done & PIDS+=($!) done #wait for all processes to finish and collect exit codes - for pid in ${PIDS[@]}; do - wait ${pid} + for pid in "${PIDS[@]}"; do + wait "${pid}" STATUS+=($?) 
done - for st in ${STATUS[@]}; do + for st in "${STATUS[@]}"; do if [[ ${st} -ne 0 ]]; then echo "One of the processes failed with $st" - exit ${st} + exit "${st}" fi done else @@ -146,9 +149,9 @@ else --rm \ -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}" \ + "${image_name}" \ /bin/bash -c "${commands}" fi diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh index cbf6dda677c53..1641c1faa9d6a 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/run-benchmarks.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script is run by buildkite to run the benchmarks and upload the results to buildkite set -ex diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index fd60f5b6afeca..79526adef2a79 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex @@ -13,27 +15,38 @@ remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. source /etc/environment #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test - -# Run basic model test -docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" \ - --ignore=tests/models/test_embedding.py \ - --ignore=tests/models/test_oot_registration.py \ - --ignore=tests/models/test_registry.py \ - --ignore=tests/models/test_jamba.py \ - --ignore=tests/models/test_mamba.py \ - --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported - -# online inference -docker exec cpu-test bash -c " - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test + +function cpu_tests() { + set -e + + # Run basic model test + docker exec cpu-test bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/encoder_decoder/language + pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + + # online inference + docker exec cpu-test bash -c " + set -e + python3 -m 
vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+}
+
+# All of the CPU tests are expected to finish in less than 25 minutes.
+export -f cpu_tests
+timeout 25m bash -c "cpu_tests"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index c331a9c49c0d0..b3771bb268e22 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -1,10 +1,16 @@
+#!/bin/bash
+
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# allow binding to different cores
+CORE_RANGE=${CORE_RANGE:-48-95}
+NUMA_NODE=${NUMA_NODE:-1}
+
 # Try building the docker image
-numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
+numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
 
 # Setup cleanup
 remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
@@ -12,46 +18,61 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
- --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
-
-# offline inference
-docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"
-
-# Run basic model test
-docker exec cpu-test bash -c "
-  pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator
-  pytest -v -s tests/models/encoder_decoder/language
-  pytest -v -s tests/models/decoder_only/language \
-    --ignore=tests/models/test_fp8.py \
-    --ignore=tests/models/decoder_only/language/test_jamba.py \
-    --ignore=tests/models/decoder_only/language/test_mamba.py \
-    --ignore=tests/models/decoder_only/language/test_granitemoe.py \
-    --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
-
-# Run compressed-tensor test
-docker exec cpu-test bash -c "
-  pytest -s -v \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-  tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
-
-# Run AWQ test
-docker exec cpu-test bash -c "
-  pytest -s -v \
-  tests/quantization/test_ipex_quant.py"
-
-# online inference
-docker exec cpu-test bash -c "
-  export VLLM_CPU_KVCACHE_SPACE=10
-  export VLLM_CPU_OMP_THREADS_BIND=48-92
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
-  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-  python3 benchmarks/benchmark_serving.py \
-    --backend vllm \
-    --dataset-name random \
-    --model facebook/opt-125m \
-    --num-prompts 20 \
-    --endpoint /v1/completions \
-    --tokenizer facebook/opt-125m"
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \
+ --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \
+ --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2
+
+function cpu_tests() {
+  set -e
+
+  # offline inference
+  docker exec cpu-test-avx2 bash -c "
+    set -e
+    python3 examples/offline_inference.py"
+
+  # Run basic model test
+  docker exec cpu-test bash -c "
+    set -e
+    pip install pytest pytest-asyncio \
+      decord einops librosa peft Pillow sentence-transformers soundfile \
+      transformers_stream_generator matplotlib datamodel_code_generator
+    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/encoder_decoder/language
+    pytest -v -s tests/models/decoder_only/language/test_models.py
+    pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
+    pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
+
+  # Run compressed-tensor test
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -s -v \
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+
+  # Run AWQ test
+  docker exec cpu-test bash -c "
+    set -e
+    pytest -s -v \
+      tests/quantization/test_ipex_quant.py"
+
+  # online inference
+  docker exec cpu-test bash -c "
+    set -e
+    export VLLM_CPU_KVCACHE_SPACE=10
+    export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE
+    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
+    timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+    python3 benchmarks/benchmark_serving.py \
+      --backend vllm \
+      --dataset-name random \
+      --model facebook/opt-125m \
+      --num-prompts 20 \
+      --endpoint /v1/completions \
+      --tokenizer facebook/opt-125m"
+}
+
+# All of the CPU tests are expected to finish in less than 25 minutes.
+export -f cpu_tests
+timeout 25m bash -c "cpu_tests"
diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh
new file mode 100644
index 0000000000000..4505dc7a9373c
--- /dev/null
+++ b/.buildkite/run-hpu-test.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# This script builds the HPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t hpu-test-env -f Dockerfile.hpu .
+ +# Setup cleanup +remove_docker_container() { docker rm -f hpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 7ac4dcc4c786d..530bf90a855fe 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -14,7 +14,7 @@ DOCKER_IMAGE=$4 shift 4 COMMANDS=("$@") -if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then echo "The number of commands must be equal to the number of nodes." echo "Number of nodes: $NUM_NODES" echo "Number of commands: ${#COMMANDS[@]}" @@ -23,7 +23,7 @@ fi echo "List of commands" for command in "${COMMANDS[@]}"; do - echo $command + echo "$command" done start_network() { @@ -36,7 +36,7 @@ start_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done @@ -49,17 +49,20 @@ start_nodes() { # 3. map the huggingface cache directory to the container # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # starting from 192.168.10.11) - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" # organize containers into a ray cluster - if [ $node -eq 0 ]; then + if [ "$node" -eq 0 ]; then # start the ray head node - docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" # wait for the head node to be ready sleep 10 else # start the ray worker nodes, and connect them to the head node - docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi done @@ -79,22 +82,22 @@ run_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -ne 0 ]; then - docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } cleanup() { for node in $(seq 0 $(($NUM_NODES-1))); do - docker stop node$node + docker stop "node$node" done docker network rm docker-net } diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 
252c0f7fecd12..9259391aaed49 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. set -e @@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then docker system prune -f - echo $current_time > /tmp/neuron-docker-build-timestamp + echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else - echo $(date +%s) > /tmp/neuron-docker-build-timestamp + date "+%s" > /tmp/neuron-docker-build-timestamp fi docker build -t neuron -f Dockerfile.neuron . @@ -34,7 +36,7 @@ wait_for_server_to_start() { timeout=300 counter=0 - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do + while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do sleep 1 counter=$((counter + 1)) if [ $counter -ge $timeout ]; then diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 70e56596c4a86..6b12f424fd828 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the OpenVINO docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex @@ -11,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 6989c94d46a89..770dad6ffa3a1 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e # Build the docker image. @@ -12,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 6ffa66d5ef3d6..faeac8e2ded36 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. 
set -ex diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 32eed1a771718..fbaa427bb7270 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -9,6 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only +# nightly(bool): run this test in nightly pipeline only # optional(bool): never run this test by default (i.e. need to unblock manually) # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. @@ -119,6 +120,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile commands: + - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -163,6 +165,14 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1 + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] @@ -267,7 +277,6 @@ steps: source_file_dependencies: - benchmarks/ commands: - - pip install aiohttp - bash run-benchmarks.sh - label: Quantization Test # 33min @@ -304,7 +313,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 3min +- label: Basic Models Test # 10min source_file_dependencies: - vllm/ - tests/models @@ -313,37 +322,50 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test (Standard) # 35min +- label: Decoder-only Language Models Test (Standard) # 18min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language/test_models.py - - pytest -v -s models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language -m core_model + - pytest -v -s models/decoder_only/language -m quant_model -- label: Decoder-only Language Models Test (Extended) # 1h20min +- label: Decoder-only Language Models Test (Extended) # 46min nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' -- label: Decoder-only Multi-Modal Models Test # 1h31min +- label: Decoder-only Multi-Modal Models Test (Standard) # 22min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language commands: - - pytest -v -s models/decoder_only/audio_language + - pytest -v -s models/decoder_only/audio_language -m core_model + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model + # No tests under this group for now + # - pytest -v -s models/decoder_only/audio_language -m quant_model + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model + +- label: 
Decoder-only Multi-Modal Models Test (Extended) # 1h10m + nightly: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + commands: + - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' -- label: Other Models Test # 6min +- label: Other Models Test # 20min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -421,7 +443,6 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" @@ -500,6 +521,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 6fddca0d6e4b9..4f54eea564ecb 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,3 +5,28 @@ updates: directory: "/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: ["dependencies"] + open-pull-requests-limit: 5 + reviewers: ["khluu", "simon-mo"] + allow: + - dependency-type: "all" + ignore: + - dependency-name: "torch" + - dependency-name: "torchvision" + - dependency-name: "xformers" + - dependency-name: "lm-format-enforcer" + - dependency-name: "gguf" + - dependency-name: "compressed-tensors" + - dependency-name: "ray[adag]" + - dependency-name: "lm-eval" + groups: + patch-update: + applies-to: version-updates + update-types: ["patch"] + minor-update: + applies-to: version-updates + update-types: ["minor"] diff --git a/.github/mergify.yml b/.github/mergify.yml index 1ce5039a061b2..ca4bd7ee2b87f 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -46,7 +46,9 @@ pull_request_rules: comment: message: | This pull request has merge conflicts that must be resolved before it can be - merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + merged. Please rebase the PR, @{{author}}. 
+ + https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork - name: remove 'needs-rebase' label when conflict is resolved conditions: diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh new file mode 100755 index 0000000000000..3b2da7b9f8966 --- /dev/null +++ b/.github/scripts/cleanup_pr_body.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -eu + +# ensure 1 argument is passed +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +PR_NUMBER=$1 +OLD=/tmp/orig_pr_body.txt +NEW=/tmp/new_pr_body.txt + +gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" +cp "${OLD}" "${NEW}" + +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}" + +# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" +sed -i '/FIX #xxxx.*$/d' "${NEW}" + +# Remove "FILL IN THE PR DESCRIPTION HERE" +sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" + +# Run this only if ${NEW} is different than ${OLD} +if ! cmp -s "${OLD}" "${NEW}"; then + echo "Updating PR body" + gh pr edit --body-file "${NEW}" "${PR_NUMBER}" +else + echo "No changes needed" +fi diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index b80749aaa8fec..0226cf0ca00e9 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -6,12 +6,14 @@ on: paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' pull_request: branches: - "main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' env: LC_ALL: en_US.UTF-8 @@ -28,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 68d60d7365ed1..68149d2dc019f 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -6,9 +6,21 @@ on: push: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' pull_request: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' jobs: clang-format: @@ -17,9 +29,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml new file mode 100644 index 0000000000000..0085a1cc22373 --- /dev/null +++ b/.github/workflows/cleanup_pr_body.yml @@ -0,0 +1,26 @@ +name: Cleanup PR Body + +on: + pull_request_target: + types: [opened, reopened, edited] + +permissions: + pull-requests: write + +jobs: + update-description: + runs-on: ubuntu-latest + + steps: + - 
name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: '3.12' + + - name: Update PR description + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 0000000000000..68887adaae54b --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,45 @@ +name: codespell + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + pull_request: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + +jobs: + codespell: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 5f1e5f8eeaf7d..73eeacf1fa562 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -6,20 +6,35 @@ on: push: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' pull_request: branches: - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. 
+ #paths: + # - '**/*.py' + # - '.github/workflows/mypy.yaml' + # - 'tools/mypy.sh' + # - 'pyproject.toml' jobs: mypy: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -33,4 +48,4 @@ jobs: - name: Mypy run: | echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index f959a1cacf866..c1051d10a4860 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Extract branch info shell: bash @@ -48,13 +48,13 @@ jobs: fail-fast: false matrix: os: ['ubuntu-20.04'] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup ccache uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 9cc8a9e914474..7266cc378cfb0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -6,33 +6,47 @@ on: push: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml pull_request: branches: - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. 
+ #paths: + # - "**/*.py" + # - pyproject.toml + # - requirements-lint.txt + # - .github/workflows/matchers/ruff.json + # - .github/workflows/ruff.yml jobs: ruff: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml - - name: Run isort - run: | - isort . --check-only + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh index 312c6e82f33a3..3d0b7a1fe0402 100644 --- a/.github/workflows/scripts/cuda-install.sh +++ b/.github/workflows/scripts/cuda-install.sh @@ -1,16 +1,16 @@ #!/bin/bash # Replace '.' with '-' ex: 11.8 -> 11-8 -cuda_version=$(echo $1 | tr "." "-") +cuda_version=$(echo "$1" | tr "." "-") # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 -OS=$(echo $2 | tr -d ".\-") +OS=$(echo "$2" | tr -d ".\-") # Installs CUDA -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb +wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb sudo apt -qq update -sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} +sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" sudo apt clean # Test nvcc diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh index dfc1851d7692c..e3cda7dad2d17 100644 --- a/.github/workflows/scripts/pytorch-install.sh +++ b/.github/workflows/scripts/pytorch-install.sh @@ -6,7 +6,7 @@ cuda_version=$3 # Install torch $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya -$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} +$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" # Print version information $python_executable --version diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 0000000000000..4b1587e373e17 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,37 @@ +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 9f06b35c19e32..ff441f94435ad 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -6,26 +6,33 @@ on: push: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml pull_request: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + jobs: yapf: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . diff --git a/.gitignore b/.gitignore index 1ea6e3419db2a..ceef6a5fba456 100644 --- a/.gitignore +++ b/.gitignore @@ -202,3 +202,4 @@ benchmarks/*.json # Linting actionlint +shellcheck*/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 42cbf18a0f712..284196bc2d279 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,17 +6,16 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.12" sphinx: - configuration: docs/source/conf.py - fail_on_warning: true + configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: [] # Optionally declare the Python requirements required to build your docs python: - install: - - requirements: docs/requirements-docs.txt - + install: + - requirements: docs/requirements-docs.txt diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000000000..f3b6eedf8d907 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. +# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a6a311e97633..5acbd762ee957 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,13 +31,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") # # Supported/expected torch versions for CUDA/ROCm. @@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") # # Try to find python package with an executable that exactly matches @@ -128,9 +128,9 @@ endif() if(VLLM_GPU_LANG STREQUAL "CUDA") # - # For cuda we want to be able to control which architectures we compile for on + # For cuda we want to be able to control which architectures we compile for on # a per-file basis in order to cut down on compile time. 
So here we extract - # the set of architectures we want to compile for and remove the from the + # the set of architectures we want to compile for and remove the from the # CMAKE_CUDA_FLAGS so that they are not applied globally. # clear_cuda_arches(CUDA_ARCH_FLAGS) @@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") # Filter the target architectures by the supported supported archs # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS + cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") else() @@ -187,10 +187,12 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") set(VLLM_EXT_SRC "csrc/cache_kernels.cu" - "csrc/attention/attention_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" + "csrc/layernorm_quant_kernels.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" @@ -236,7 +238,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # are not supported by Machete yet. cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) - set(MARLIN_SRCS + set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" @@ -277,7 +279,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures") endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # build any 3x kernels set(SCALED_MM_3X_ARCHS) endif() @@ -285,7 +287,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. - cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS + cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) @@ -316,10 +318,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) # - # For the Machete kernels we automatically generate sources for various + # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. 
# Generate sources: - set(MACHETE_GEN_SCRIPT + set(MACHETE_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) @@ -329,8 +331,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -340,11 +342,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT machete_generation_result EQUAL 0) message(FATAL_ERROR "Machete generation failed." - " Result: \"${machete_generation_result}\"" + " Result: \"${machete_generation_result}\"" "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} CACHE STRING "Last run machete generate script hash" FORCE) message(STATUS "Machete generation completed successfully.") endif() @@ -366,7 +368,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -392,8 +394,8 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# If CUTLASS is compiled on NVCC >= 12.5, it by default uses -# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) @@ -471,9 +473,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") return() endif () -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) # we need to manually set VLLM_GPU_ARCHES here. if(VLLM_GPU_LANG STREQUAL "CUDA") foreach(_ARCH ${CUDA_ARCHS}) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b39fd75b5fb70..6d46a6dca371d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,60 +1,3 @@ # Contributing to vLLM -Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: - -- Identify and report any issues or bugs. -- Request or add support for a new model. 
-- Suggest or implement new features. -- Improve documentation or contribute a how-to guide. - -We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. - -Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! - -## License - -See [LICENSE](LICENSE). - -## Developing - -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. - -## Testing - -```bash -pip install -r requirements-dev.txt - -# linting and formatting -bash format.sh -# Static type checking -mypy -# Unit tests -pytest tests/ -``` -**Note:** Currently, the repository does not pass the ``mypy`` tests. - -## Contribution Guidelines - -### DCO and Signed-off-by - -When contributing changes to this project, you must agree to the [DCO](DCO). -Commits must include a `Signed-off-by:` header which certifies agreement with -the terms of the [DCO](DCO). - -Using `-s` with `git commit` will automatically add this header. - -### Issues - -If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. - -> [!IMPORTANT] -> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability). - -### Pull Requests & Code Reviews - -Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. - -### Thank You - -Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. -All of your contributions help make vLLM a great tool and community for everyone! +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). diff --git a/Dockerfile b/Dockerfile index 0a562253c537b..220dbe26712ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,14 @@ ADD . 
/vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + # doc requires source code # we hide them inside `test_docs/` , so that this source code # will not be imported by other tests @@ -206,7 +214,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10 + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f1a21d6bd13fc..287b4958da4e5 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.4.0 +RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace diff --git a/Dockerfile.hpu b/Dockerfile.hpu new file mode 100644 index 0000000000000..d18fc016387bf --- /dev/null +++ b/Dockerfile.hpu @@ -0,0 +1,18 @@ +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 0d0d8df94578c..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index cd5fcf481f07c..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 562117a313020..8fb79afaebe97 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -52,7 +52,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ torch==2.6.0.dev20240918 \ - setuptools-scm>=8 \ + 'setuptools-scm>=8' \ torchvision==0.20.0.dev20240918 \ --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ *) ;; esac diff --git 
a/Dockerfile.tpu b/Dockerfile.tpu index dd8f9ad4714a9..0a507b6ecdf60 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install the TPU and Pallas dependencies. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -25,7 +19,6 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 0ecb46df6256c..63bc682770422 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ -r requirements-xpu.txt +RUN git clone https://github.com/intel/pti-gpu && \ + cd pti-gpu/sdk && \ + git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ + make -j && \ + cmake --install . --config Release --prefix "/usr/local" + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" + COPY . . ARG GIT_REPO_CHECK RUN --mount=type=bind,source=.git,target=.git \ diff --git a/benchmarks/README.md b/benchmarks/README.md index 192d6c4022c83..2aa4a285021f1 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 0a903877f000d..25c8b1bbf3e22 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -79,7 +79,7 @@ async def async_request_tgi( # any data, we should skip it. 
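Context for the backend_request_func.py hunks below: with Python 3.8 support dropped in this change, the client can rely on `str.removeprefix` (added in Python 3.9) instead of the local `remove_prefix` helper that is deleted further down. A minimal sketch of the equivalence, using a made-up SSE chunk rather than real benchmark traffic:

```python
# Minimal sketch (hypothetical chunk value): str.removeprefix, available on
# Python 3.9+, only strips the prefix when it is actually present, which is
# exactly what the removed remove_prefix() helper did.
chunk_bytes = 'data: {"text_output": "hello"}'

def remove_prefix(text: str, prefix: str) -> str:
    # Behaviour of the helper being deleted in this diff.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

assert chunk_bytes.removeprefix("data: ") == remove_prefix(chunk_bytes, "data: ")
assert "no-prefix".removeprefix("data: ") == "no-prefix"  # left untouched
```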
if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -144,8 +144,8 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -256,13 +256,14 @@ async def async_request_openai_completions( async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: + first_chunk_received = False async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -274,7 +275,8 @@ async def async_request_openai_completions( if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token - if ttft == 0.0: + if not first_chunk_received: + first_chunk_received = True ttft = time.perf_counter() - st output.ttft = ttft @@ -285,9 +287,14 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] - + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!") output.generated_text = generated_text - output.success = True output.latency = latency else: output.error = response.reason or "" @@ -349,8 +356,8 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -389,14 +396,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 1aac029992dbf..6d33096ca1d11 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -118,7 +118,7 @@ def main(args): random.seed(args.seed) if args.dataset_path is not None: print(f"Start to sample {args.num_prompts} prompts" - "from {args.dataset_path}") + f"from {args.dataset_path}") filtered_datasets = sample_requests( dataset_path=args.dataset_path, num_requests=args.num_prompts, @@ -142,13 +142,6 @@ def main(args): repeat_count=args.repeat_count, sort=args.sort) - print("------warm up------") - test_prefix( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - print("------start generating------") test_prefix( llm=llm, diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 0d205014b15bf..bdb8ea8e2a5dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -297,8 +297,33 @@ def sample_random_requests( async def get_request( 
input_requests: List[Tuple[str, int, int]], request_rate: float, + burstiness: float = 1.0, ) -> AsyncGenerator[Tuple[str, int, int], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + for request in input_requests: yield request @@ -306,8 +331,9 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) # The next request will be sent after the interval. await asyncio.sleep(interval) @@ -406,9 +432,9 @@ def calculate_metrics( median_itl_ms=np.median(itls or 0) * 1000, percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], - mean_e2el_ms=np.median(e2els or 0) * 1000, + mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.mean(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], ) @@ -426,6 +452,7 @@ async def benchmark( logprobs: Optional[int], best_of: int, request_rate: float, + burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: List[str], @@ -480,7 +507,13 @@ async def benchmark( if profile_output.success: print("Profiler started") + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) @@ -502,7 +535,7 @@ async def limited_request_func(request_func_input, pbar): benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): + async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, prompt=prompt, @@ -769,6 +802,7 @@ def main(args: argparse.Namespace): logprobs=args.logprobs, best_of=args.best_of, request_rate=args.request_rate, + burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), @@ -807,6 +841,7 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( 
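To make the `--burstiness` behaviour described above concrete, here is a standalone sketch (with illustrative numbers, not the benchmark defaults): drawing inter-arrival times from a gamma distribution with `shape=burstiness` and `scale=1/(request_rate*burstiness)` keeps the mean interval at `1/request_rate` while changing its variance, and `burstiness=1` reduces to the exponential/Poisson case.

```python
# Standalone sketch of the arrival model used by --burstiness: intervals are
# drawn from a gamma distribution with shape=burstiness and
# scale=1/(request_rate*burstiness), so the mean interval stays 1/request_rate
# while its variance (how bursty the traffic looks) changes.
import numpy as np

request_rate = 10.0  # requests/s (illustrative value)
rng = np.random.default_rng(0)

for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    intervals = rng.gamma(shape=burstiness, scale=theta, size=100_000)
    # Mean interval stays ~0.1 s in every case; lower burstiness -> larger std
    # (burstier arrivals); burstiness == 1 is the exponential/Poisson case.
    print(f"burstiness={burstiness}: mean={intervals.mean():.4f}s "
          f"std={intervals.std():.4f}s")
```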
args.request_rate if args.request_rate < float("inf") else "inf") + result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result @@ -922,8 +957,20 @@ def main(args: argparse.Namespace): default=float("inf"), help="Number of requests per second. If this is inf, " "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.", + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index ee41c8ea38382..159cf055737ce 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -4,10 +4,11 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import List, Optional import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -15,16 +16,56 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) +from vllm.inputs import TextPrompt +from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. 
+ + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -33,23 +74,36 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] - for i in range(len(dataset)): + filtered_dataset: List[SampleRequest] = [] + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -60,13 +114,17 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -74,17 +132,19 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. 
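The reworked `sample_requests` above bundles prompt, token lengths, and optional image data into one object and silently drops datapoints whose image file is missing. A stripped-down sketch of that flow follows; the dataclass, word-count lengths, and dataset entries here are simplified stand-ins for the real `SampleRequest`, tokenizer, and ShareGPT4V data:

```python
# Simplified stand-in for the new sampling flow: wrap each datapoint in a small
# request object, keep it only when its (optional) image file exists, and wrap
# the question with pixtral-style special tokens when an image is attached.
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

@dataclass
class SampleRequestSketch:            # stand-in for the real SampleRequest
    prompt: str
    prompt_len: int                   # word count here; the real code tokenizes
    expected_output_len: int
    image_path: Optional[str] = None  # the real code stores a PIL.Image

def wrap_pixtral_prompt(question: str) -> str:
    # Same special-token layout the diff uses for "pixtral" models.
    return f"[INST]{question}\n[IMG][/INST]"

def build_requests(dataset, max_requests=2):
    requests = []
    for data in dataset:
        if len(requests) == max_requests:
            break
        prompt = data["conversations"][0]["value"]
        completion = data["conversations"][1]["value"]
        image = data.get("image")
        if image is not None:
            if not Path(image).is_file():
                continue  # ignore datapoints whose image asset is missing
            prompt = wrap_pixtral_prompt(prompt)
        requests.append(SampleRequestSketch(prompt, len(prompt.split()),
                                            len(completion.split()), image))
    return requests

# Tiny fabricated dataset: the second entry is dropped because its image is absent.
dataset = [
    {"conversations": [{"value": "Describe the scene."}, {"value": "A cat."}]},
    {"conversations": [{"value": "What is shown?"}, {"value": "A dog."}],
     "image": "coco/train2017/does_not_exist.jpg"},
]
print(build_requests(dataset))
```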
- prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) use_beam_search = False @@ -94,11 +154,11 @@ def run_vllm( llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() llm.beam_search( prompts, @@ -112,7 +172,7 @@ def run_vllm( async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -123,17 +183,19 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) generators = [] @@ -149,7 +211,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -207,14 +269,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -243,12 +305,17 @@ def main(args: argparse.Namespace): else: raise ValueError( f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + requests = [ + SampleRequest(prompt=prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len) + for _ in range(args.num_prompts) + ] else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -270,9 +337,15 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_output_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in 
requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -299,7 +372,9 @@ def main(args: argparse.Namespace): parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, diff --git a/benchmarks/kernels/benchmark_machete.py b/benchmarks/kernels/benchmark_machete.py index b70c4b94c97a1..665b50bf18cf0 100644 --- a/benchmarks/kernels/benchmark_machete.py +++ b/benchmarks/kernels/benchmark_machete.py @@ -269,10 +269,10 @@ def run_square_bench(args): def run_range_bench(args): - m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] - m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) m_increment, k_increment, n_increment = \ - [int(x) for x in args.dim_increment.split(",")] + (int(x) for x in args.dim_increment.split(",")) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh index 8c5cd454fbbee..ba7383d88dc49 100755 --- a/benchmarks/launch_tgi_server.sh +++ b/benchmarks/launch_tgi_server.sh @@ -4,13 +4,13 @@ PORT=8000 MODEL=$1 TOKENS=$2 -docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ - -v $PWD/data:/data \ +docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ + -v "$PWD/data:/data" \ ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id $MODEL \ + --model-id "$MODEL" \ --sharded false \ --max-input-length 1024 \ --max-total-tokens 2048 \ --max-best-of 5 \ --max-concurrent-requests 5000 \ - --max-batch-total-tokens $TOKENS + --max-batch-total-tokens "$TOKENS" diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 43380a4940d4f..66d70705d0654 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # list(APPEND CXX_COMPILE_FLAGS "-fopenmp" + "-mf16c" "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo @@ -98,7 +99,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.5.3 + GIT_TAG v3.6 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) diff --git a/collect_env.py b/collect_env.py index 80403d576d78f..254c19b19a5ac 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,17 +1,19 @@ # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py -# Unlike the rest of the PyTorch this file must be python2 compliant. 
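The throughput summary above now derives its totals from the per-request prompt and expected-output lengths, and warns that image tokens are not counted. A tiny arithmetic sketch with fabricated numbers shows exactly what gets reported:

```python
# Fabricated (prompt_len, expected_output_len) pairs and elapsed time; image
# tokens are deliberately excluded from these totals (see issue 9778).
requests = [(512, 128), (300, 200), (700, 64)]
elapsed_time = 2.5  # seconds

total_num_tokens = sum(p + o for p, o in requests)   # 1904
total_output_tokens = sum(o for _, o in requests)    # 392
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
      f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
      f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
```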
-# This script outputs relevant system environment info -# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` import datetime import locale import os import re import subprocess import sys +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` from collections import namedtuple +from vllm.envs import environment_variables + try: import torch TORCH_AVAILABLE = True @@ -52,6 +54,7 @@ 'vllm_version', # vllm specific field 'vllm_build_flags', # vllm specific field 'gpu_topo', # vllm specific field + 'env_vars', ]) DEFAULT_CONDA_PATTERNS = { @@ -512,6 +515,22 @@ def is_xnnpack_available(): else: return "N/A" +def get_env_vars(): + env_vars = '' + secret_terms=('secret', 'token', 'api', 'access', 'password') + report_prefix = ("TORCH", "NCCL", "PYTORCH", + "CUDA", "CUBLAS", "CUDNN", + "OMP_", "MKL_", + "NVIDIA") + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars def get_env_info(): run_lambda = run @@ -583,6 +602,7 @@ def get_version_or_na(cfg, prefix): vllm_version=vllm_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, + env_vars=get_env_vars(), ) @@ -631,6 +651,8 @@ def get_version_or_na(cfg, prefix): {vllm_build_flags} GPU Topology: {gpu_topo} + +{env_vars} """.strip() diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cuh similarity index 64% rename from csrc/attention/attention_kernels.cu rename to csrc/attention/attention_kernels.cuh index bcd170411e7cb..563e1438f0b01 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cuh @@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel( } // namespace vllm -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ - scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -// TODO(woosuk): Tune NUM_THREADS. 
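The `get_env_vars()` helper added to collect_env.py above reports only allow-listed or prefix-matched environment variables and skips anything whose name looks like a credential. A self-contained sketch of that filtering; the allow-list and environment below are fabricated, whereas the real code reads the list from `vllm.envs`:

```python
# Sketch of the env-var report: skip secret-looking names, then keep variables
# that are either on an explicit allow-list or start with a reporting prefix.
SECRET_TERMS = ("secret", "token", "api", "access", "password")
REPORT_PREFIXES = ("TORCH", "NCCL", "PYTORCH", "CUDA", "CUBLAS", "CUDNN",
                   "OMP_", "MKL_", "NVIDIA")
ALLOW_LIST = {"VLLM_USAGE_SOURCE"}  # stand-in for vllm.envs.environment_variables

def filter_env(environ):
    lines = []
    for k, v in environ.items():
        if any(term in k.lower() for term in SECRET_TERMS):
            continue  # never report anything that looks like a credential
        if k in ALLOW_LIST or k.startswith(REPORT_PREFIXES):
            lines.append(f"{k}={v}")
    return "\n".join(lines)

fake_environ = {
    "CUDA_HOME": "/usr/local/cuda",
    "HF_TOKEN": "hf_abc123",                       # dropped: contains "token"
    "VLLM_USAGE_SOURCE": "production-docker-image",
    "PATH": "/usr/bin",                            # dropped: no match
}
print(filter_env(fake_environ))
```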
-template -void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = - DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_seq_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V1(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V1(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v1_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); - -#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ - value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ - max_num_partitions); - -template -void paged_attention_v2_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V2(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V2(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v2_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& - tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V2_LAUNCHER_BLOCK_SIZE) -} - #undef WARP_SIZE #undef MAX #undef MIN diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu new file mode 100644 index 0000000000000..8b99f0843aaf6 --- /dev/null +++ b/csrc/attention/paged_attention_v1.cu @@ -0,0 +1,193 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel), \ + shared_mem_size); \ + vllm::paged_attention_v1_kernel \ + <<>>( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ + scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +// TODO(woosuk): Tune NUM_THREADS. +template +void paged_attention_v1_launcher( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int padded_max_seq_len = + DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; + int logits_size = padded_max_seq_len * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len + // Keep that in sync with the logic here! + int shared_mem_size = std::max(logits_size, outputs_size); + + dim3 grid(num_heads, num_seqs, 1); + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. 
+ case 64: + LAUNCH_PAGED_ATTENTION_V1(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V1(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V1(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V1(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V1(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V1(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v1_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v1( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu new file mode 100644 index 0000000000000..3a7a9dee916aa --- /dev/null +++ b/csrc/attention/paged_attention_v2.cu @@ -0,0 +1,203 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template +void paged_attention_v2_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); + int logits_size = PARTITION_SIZE * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + + // For paged attention v2 kernel. + dim3 grid(num_heads, num_seqs, max_num_partitions); + int shared_mem_size = std::max(logits_size, outputs_size); + // For paged attention v2 reduce kernel. 
+ dim3 reduce_grid(num_heads, num_seqs); + int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. + case 64: + LAUNCH_PAGED_ATTENTION_V2(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V2(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V2(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V2(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V2(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V2(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. 
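+// ---------------------------------------------------------------------------
+// Editor's illustrative sketch (not part of the original patch): a worked
+// example of the launch configuration computed in the launcher above. All
+// concrete values are assumptions chosen for illustration only
+// (PARTITION_SIZE = 512, NUM_THREADS = 128, WARP_SIZE = 32, head_size = 128,
+// max_seq_len = 8192); the real values come from the kernel configuration.
+namespace paged_attention_v2_sizing_example {
+constexpr int kPartitionSize = 512;   // assumed PARTITION_SIZE
+constexpr int kNumThreads = 128;      // assumed NUM_THREADS
+constexpr int kWarpSize = 32;
+constexpr int kHeadSize = 128;        // example head size
+constexpr int kMaxSeqLen = 8192;      // example max_seq_len
+
+// DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE) = ceil(8192 / 512) = 16
+constexpr int kMaxNumPartitions =
+    (kMaxSeqLen + kPartitionSize - 1) / kPartitionSize;
+constexpr int kNumWarps = kNumThreads / kWarpSize;               // 4
+constexpr int kLogitsSize = kPartitionSize * 4;                  // PARTITION_SIZE * sizeof(float) = 2048 B
+constexpr int kOutputsSize = (kNumWarps / 2) * kHeadSize * 4;    // 1024 B
+constexpr int kSharedMemSize =
+    kLogitsSize > kOutputsSize ? kLogitsSize : kOutputsSize;     // 2048 B
+constexpr int kReduceSharedMemSize = 2 * kMaxNumPartitions * 4;  // 128 B
+
+// grid        = (num_heads, num_seqs, kMaxNumPartitions)
+// reduce_grid = (num_heads, num_seqs)
+static_assert(kSharedMemSize == 2048 && kReduceSharedMemSize == 128,
+              "worked-example arithmetic");
+}  // namespace paged_attention_v2_sizing_example
+// ---------------------------------------------------------------------------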
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v2( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index abb4e3bea14bb..e3953c7c45719 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,6 +22,16 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::FP16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +}; + #ifdef __AVX512BF16__ template <> struct KernelVecType { diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a325153b470cc..4bb4eb0f491ac 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -50,37 +50,37 @@ template struct Vec { struct FP32Vec8; struct FP32Vec16; -#ifdef __AVX512FP16__ struct FP16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; - __m128h reg; + __m128i reg; - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + explicit FP16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + explicit FP16Vec8(const FP32Vec8 &); - explicit FP16Vec8(__m128h data) : reg(data) {} + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } + __m256i reg; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } + explicit FP16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } + explicit FP16Vec16(const FP32Vec16 &); - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } }; -#endif struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif + explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} explicit FP32Vec8(const BF16Vec8 &v) : reg(_mm256_castsi256_ps( @@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec { : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const INT32Vec16 &v) @@ -430,6 +432,16 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(const FP32Vec8 &data) : reg_low(data.reg), reg_high(data.reg) {} + explicit FP32Vec16(const FP16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + reg_low = _mm256_cvtph_ps(low); + reg_high = _mm256_cvtph_ps(high); + } + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec16 &v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -534,24 +546,34 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif +template <> struct VecType { using vec_type = FP16Vec8; }; template <> struct VecType { using vec_type = BF16Vec8; }; template void storeFP32(float v, T *ptr) { *ptr = v; } -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - inline void 
fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc = acc + a * b; } +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_high)).reg, 1)) {} +#endif + #ifdef __AVX512BF16__ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp index 024ad4ae43da8..8b5011dc065f0 100644 --- a/csrc/cpu/dnnl_helper.hpp +++ b/csrc/cpu/dnnl_helper.hpp @@ -2,6 +2,7 @@ #define DNNL_HELPER_HPP #include +#include #include "oneapi/dnnl/dnnl.hpp" @@ -32,6 +33,11 @@ struct DNNLType { static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; }; +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + template constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index b493fd793818a..f42fa2361a2db 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -23,6 +23,13 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512F__ template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 9b82bec44c3c6..123278bfed71d 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -5,32 +5,29 @@ #include "custom_all_reduce.cuh" -// fake pointer type, must match fptr_t type in ops.h +// Fake pointer type, must match fptr_t type in ops.h. +// We use this type alias to indicate when pointers are passed in as int64_t.
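+// Editor's illustrative sketch (not part of the original patch): how such a
+// "fake pointer" round-trips through the Python bindings as a plain 64-bit
+// integer. OpaqueHandleExample and the helper names are hypothetical; in this
+// file the real payload is a vllm::CustomAllreduce*.
+#include <cstdint>
+
+namespace fake_ptr_example {
+struct OpaqueHandleExample {};
+
+inline int64_t to_fake_ptr(OpaqueHandleExample* p) {
+  // Python only stores and returns the integer; it never dereferences it.
+  return reinterpret_cast<int64_t>(p);
+}
+
+inline OpaqueHandleExample* from_fake_ptr(int64_t fake) {
+  return reinterpret_cast<OpaqueHandleExample*>(fake);
+}
+}  // namespace fake_ptr_example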
using fptr_t = int64_t; static_assert(sizeof(void*) == sizeof(fptr_t)); -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink) { - int world_size = offsets.size(); + int world_size = fake_ipc_ptrs.size(); if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported"); if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now"); - if (world_size != handles.size()) - throw std::invalid_argument( - "handles length should equal to offsets length"); if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in"); - cudaIpcMemHandle_t ipc_handles[8]; + vllm::Signal* ipc_ptrs[8]; for (int i = 0; i < world_size; i++) { - std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); } - return (fptr_t) new vllm::CustomAllreduce( - reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), - rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); + return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), + rank_data.numel(), rank, world_size, + full_nvlink); } /** @@ -55,26 +52,48 @@ bool _is_weak_contiguous(torch::Tensor& t) { t.numel() * t.element_size()); } -void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, - cudaStream_t stream) { +/** + * Performs an out-of-place allreduce and stores result in out. + * + * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. + * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first + * copied into _reg_buffer. 
+ */ +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); TORCH_CHECK(_is_weak_contiguous(out)); + TORCH_CHECK(_is_weak_contiguous(inp)); + auto input_size = inp.numel() * inp.element_size(); + auto reg_buffer = reinterpret_cast(_reg_buffer); + if (reg_buffer) { + TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, + cudaMemcpyDeviceToDevice, stream)); + } else { + reg_buffer = inp.data_ptr(); + } switch (out.scalar_type()) { case at::ScalarType::Float: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } case at::ScalarType::Half: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) case at::ScalarType::BFloat16: { fa->allreduce( - stream, reinterpret_cast(inp.data_ptr()), + stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } @@ -85,57 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, } } -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - _all_reduce(_fa, inp, out, stream); -} - -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - - auto input_size = inp.numel() * inp.element_size(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), - "registered buffer is too small to contain the input"); - AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), - input_size, cudaMemcpyDeviceToDevice, stream)); - _all_reduce(_fa, reg_buffer, out, stream); -} - void dispose(fptr_t _fa) { - auto fa = reinterpret_cast(_fa); - delete fa; + delete reinterpret_cast(_fa); } int64_t meta_size() { return sizeof(vllm::Signal); } -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets) { +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { auto fa = reinterpret_cast(_fa); - fa->register_buffer(handles, offsets, t.data_ptr()); + TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_); + void* ipc_ptrs[8]; + for (int i = 0; i < fake_ipc_ptrs.size(); i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + fa->register_buffer(ipc_ptrs); } -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa) { +// Use vector to represent byte data for python binding compatibility. 
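+// Editor's illustrative sketch (not part of the original patch): the
+// binding-friendly encoding used below stores each raw byte of an IPC handle
+// as one int64_t, so handles can travel through Python as plain lists of
+// integers. The helper names are hypothetical; get_graph_buffer_ipc_meta
+// below packs handle bytes into int64s, and register_graph_buffers unpacks
+// them back into byte strings.
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace ipc_bytes_example {
+inline std::vector<int64_t> bytes_to_int64s(const std::string& raw) {
+  // One int64_t per byte: wasteful, but trivial to marshal across the binding.
+  // With a signed char some elements may be negative; the round trip below is
+  // still lossless.
+  return std::vector<int64_t>(raw.begin(), raw.end());
+}
+
+inline std::string int64s_to_bytes(const std::vector<int64_t>& packed) {
+  // Each element is narrowed back to a single byte.
+  return std::string(packed.begin(), packed.end());
+}
+}  // namespace ipc_bytes_example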
+std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa) { auto fa = reinterpret_cast(_fa); - auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta(); - auto options = - torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); - auto handles = - torch::empty({static_cast(handle_bytes.size())}, options); - std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size()); - return {handles, std::move(offsets)}; + auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); + std::vector bytes(handle.begin(), handle.end()); + return std::make_tuple(bytes, offsets); } -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +// Use vector to represent byte data for python binding compatibility. +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets) { auto fa = reinterpret_cast(_fa); - fa->register_graph_buffers(handles, offsets); + std::vector bytes; + bytes.reserve(handles.size()); + for (int i = 0; i < handles.size(); i++) { + bytes.emplace_back(handles[i].begin(), handles[i].end()); + } + bytes.reserve(handles.size()); + fa->register_graph_buffers(bytes, offsets); } diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index a2f7e43300002..6be4d4f2b2eb8 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -285,46 +285,52 @@ class CustomAllreduce { int world_size_; bool full_nvlink_; - // below are device pointers RankSignals sg_; + // Stores a map from a pointer to its peer pointers from all ranks. std::unordered_map buffers_; Signal* self_sg_; - // stores the registered device pointers from all ranks + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph capture + // time. Therefore, during capture, we increment the rank data pointer and use + // that as the argument to the kernel. The kernel arguments are stored in + // graph_unreg_buffers_. The actual peer pointers will be filled in at the + // memory pointed to by the pointers in graph_unreg_buffers_ when + // the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each address used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. RankData *d_rank_data_base_, *d_rank_data_end_; std::vector graph_unreg_buffers_; // a map from IPC handles to opened IPC pointers std::map ipc_handles_; /** - * meta is a pointer to device metadata and temporary buffer for allreduce. + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffers, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allreduce synchronization, and the second section + * is for storing the intermediate results required by some allreduce algos. * - * There's a total of sizeof(Signal) of prefix before the actual data, - * so meta + 1 points to actual temporary buffer. - * - * note: this class does not own any device memory. Any required buffers - * are passed in from the constructor + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor.
*/ - CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz, - const cudaIpcMemHandle_t* handles, - const std::vector& offsets, int rank, - bool full_nvlink = true) + CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool full_nvlink = true) : rank_(rank), - world_size_(offsets.size()), + world_size_(world_size), full_nvlink_(full_nvlink), - self_sg_(meta), + self_sg_(signals[rank]), d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { for (int i = 0; i < world_size_; i++) { - Signal* rank_sg; - if (i != rank_) { - char* handle = open_ipc_handle(&handles[i]); - handle += offsets[i]; - rank_sg = (Signal*)handle; - } else { - rank_sg = self_sg_; - } - sg_.signals[i] = rank_sg; + sg_.signals[i] = signals[i]; } } @@ -341,11 +347,10 @@ class CustomAllreduce { return it->second; } - std::pair, std::vector> - get_graph_buffer_ipc_meta() { + std::pair> get_graph_buffer_ipc_meta() { auto num_buffers = graph_unreg_buffers_.size(); auto handle_sz = sizeof(cudaIpcMemHandle_t); - std::vector handles(handle_sz * num_buffers, 0); + std::string handles(handle_sz * num_buffers, static_cast(0)); std::vector offsets(num_buffers); for (int i = 0; i < num_buffers; i++) { auto ptr = graph_unreg_buffers_[i]; @@ -370,26 +375,22 @@ class CustomAllreduce { std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); } - void register_buffer(const std::vector& handles, - const std::vector& offsets, void* self) { + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { check_rank_data_capacity(); RankData data; for (int i = 0; i < world_size_; i++) { - if (i != rank_) { - char* handle = open_ipc_handle(handles[i].data()); - handle += offsets[i]; - data.ptrs[i] = handle; - } else { - data.ptrs[i] = self; - } + data.ptrs[i] = ptrs[i]; } auto d_data = d_rank_data_base_++; CUDACHECK( cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); - buffers_[self] = d_data; + buffers_[ptrs[rank_]] = d_data; } - // note: when registering graph buffers, we intentionally choose to not + // Note: when registering graph buffers, we intentionally choose to not // deduplicate the addresses. That means if the allocator reuses some // addresses, they will be registered again. This is to account for the remote // possibility of different allocation patterns between ranks. For example, @@ -424,11 +425,13 @@ class CustomAllreduce { } /** - * This is the result after careful grid search. Using 36 blocks give the best - * or close to the best runtime on the devices I tried: A100, A10, A30, T4, - * V100. You'll notice that NCCL kernels also only take a small amount of SMs. - * Not quite sure the underlying reason, but my guess is that too many SMs - * will cause contention on NVLink bus. + * Performs allreduce, assuming input has already been registered. + * + * Block and grid default configs are results after careful grid search. Using + * 36 blocks give the best or close to the best runtime on the devices I + * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only + * take a small amount of SMs. Not quite sure the underlying reason, but my + * guess is that too many SMs will cause contention on NVLink bus. 
*/ template void allreduce(cudaStream_t stream, T* input, T* output, int size, diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index 376687e91cfda..b59ea40d980f4 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -135,24 +135,26 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, void* rank_data; size_t rank_data_sz = 16 * 1024 * 1024; CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); - std::vector offsets(nRanks, 0); - vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, - offsets, myRank); + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < nRanks; i++) { + if (i == myRank) + ipc_ptrs[i] = buffer; + else + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i], + cudaIpcMemLazyEnablePeerAccess)); + } + vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks); auto* self_data = reinterpret_cast(reinterpret_cast(buffer) + sizeof(vllm::Signal) + data_size * sizeof(T)); // hack buffer registration { - std::vector handles; - handles.reserve(nRanks); + void* data[8]; for (int i = 0; i < nRanks; i++) { - char* begin = (char*)&data_handles[i]; - char* end = (char*)&data_handles[i + 1]; - handles.emplace_back(begin, end); + data[i] = + ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T); } - std::vector offsets(nRanks, - sizeof(vllm::Signal) + data_size * sizeof(T)); - fa.register_buffer(handles, offsets, self_data); + fa.register_buffer(data); } double* ground_truth; diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 7a7a25d2173d2..fb6882f3e7c3e 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -1,21 +1,13 @@ -#include -#include +#include "type_convert.cuh" +#include "dispatch_utils.h" + +#include #include -#include "dispatch_utils.h" #ifndef USE_ROCM - #include - #include - #include #include #else - #include - #include - #include #include - -using __nv_bfloat16 = __hip_bfloat16; -using __nv_bfloat162 = __hip_bfloat162; #endif namespace vllm { @@ -51,155 +43,6 @@ __global__ void rms_norm_kernel( } } -/* Converter structs for the conversion from torch types to HIP/CUDA types, - and the associated type conversions within HIP/CUDA. These helpers need - to be implemented for now because the relevant type conversion - operators/constructors are not consistently implemented by HIP/CUDA, so - a generic conversion via type casts cannot be implemented. - - Each struct should have the member static constexpr bool `exists`: - If false, the optimized kernel is not used for the corresponding torch type. - If true, the struct should be fully defined as shown in the examples below. 
- */ -template -struct _typeConvert { - static constexpr bool exists = false; -}; - -#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) -// CUDA < 12.0 runs into issues with packed type conversion -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __half; - using packed_hip_type = __half2; - - __device__ static inline float convert(hip_type x) { return __half2float(x); } - __device__ static inline float2 convert(packed_hip_type x) { - return __half22float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2half_rn(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22half2_rn(x); - } -}; - - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -// CUDA_ARCH < 800 does not have BF16 support -// TODO: Add in ROCm support once public headers handle bf16 maturely -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __nv_bfloat16; - using packed_hip_type = __nv_bfloat162; - - __device__ static inline float convert(hip_type x) { - return __bfloat162float(x); - } - __device__ static inline float2 convert(packed_hip_type x) { - return __bfloat1622float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2bfloat16(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22bfloat162_rn(x); - } -}; - #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= - // 12000)) - -/* Vector POD struct to generate vectorized and packed FP16/BF16 ops - for appropriate specializations of fused_add_rms_norm_kernel. - Only functions that are necessary in that kernel are implemented. - Alignment to 16 bytes is required to use 128-bit global memory ops. 
- */ -template -struct alignas(16) _f16Vec { - /* Not theoretically necessary that width is a power of 2 but should - almost always be the case for optimization purposes */ - static_assert(width > 0 && (width & (width - 1)) == 0, - "Width is not a positive power of 2!"); - using Converter = _typeConvert; - using T1 = typename Converter::hip_type; - using T2 = typename Converter::packed_hip_type; - T1 data[width]; - - __device__ _f16Vec& operator+=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp += T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] += other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp *= T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] *= other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const float scale) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); - temp_f.x *= scale; - temp_f.y *= scale; - T2 temp = Converter::convert(temp_f); - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float temp = Converter::convert(data[i]) * scale; - data[i] = Converter::convert(temp); - } - } - return *this; - } - - __device__ float sum_squares() const { - float result = 0.0f; - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 z = Converter::convert(T2{data[i], data[i + 1]}); - result += z.x * z.x + z.y * z.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float x = Converter::convert(data[i]); - result += x * x; - } - } - return result; - } -}; - /* Function specialization in the case of FP16/BF16 tensors. Additional optimizations we can make in this case are packed and vectorized operations, which help with the diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu new file mode 100644 index 0000000000000..c18e2a4e4abe0 --- /dev/null +++ b/csrc/layernorm_quant_kernels.cu @@ -0,0 +1,234 @@ +/* + * This file contains the CUDA kernels for the fused quantized layernorm. + * The kernels correspond to the kernels in layernorm_kernels.cu, except they + * also produce quantized output directly. + * Currently, only static fp8 quantization is supported. + */ + +#include "type_convert.cuh" +#include "quantization/fp8/common.cuh" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// TODO(woosuk): Further optimize this kernel. 
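+// Editor's illustrative sketch (not part of the original patch): a scalar,
+// host-side reference of what rms_norm_static_fp8_quant_kernel below computes
+// per token, ignoring the intermediate fp16/bf16 rounding of the normalized
+// value. The function name is hypothetical, a plain clamp stands in for the
+// actual scaled_fp8_conversion<true>(..., 1 / scale) packing, and <algorithm>,
+// <cmath> and <vector> are assumed to be available.
+inline std::vector<float> rms_norm_fp8_reference(
+    const std::vector<float>& x, const std::vector<float>& w, float scale,
+    float epsilon, float fp8_max) {
+  float sum_sq = 0.0f;
+  for (float v : x) sum_sq += v * v;
+  const float s = 1.0f / std::sqrt(sum_sq / x.size() + epsilon);
+  const float scale_inv = 1.0f / scale;  // the kernel pre-inverts the scale
+  std::vector<float> out(x.size());
+  for (std::size_t i = 0; i < x.size(); ++i) {
+    const float normed = x[i] * s * w[i];
+    // Static quantization: scale, then saturate to the fp8 representable range.
+    out[i] = std::clamp(normed * scale_inv, -fp8_max, fp8_max);
+  }
+  return out;
+}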
+template +__global__ void rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + const float x = (float)input[blockIdx.x * hidden_size + idx]; + variance += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. 
Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = input_v[id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; +#pragma unroll + for (int i = 0; i < width; ++i) { + out[id * width + i] = + scaled_fp8_conversion(float(temp.data[i]), scale_inv); + } + } +} + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. + */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * hidden_size + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)residual[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +} // namespace vllm + +void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_static_fp8_quant_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + weight.data_ptr(), scale.data_ptr(), 
epsilon, + num_tokens, hidden_size); + }); +} + +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_static_fp8_quant_kernel \ + <<>>( \ + out.data_ptr(), input.data_ptr(), \ + residual.data_ptr(), weight.data_ptr(), \ + scale.data_ptr(), epsilon, num_tokens, hidden_size); \ + }); + +void fused_add_rms_norm_static_fp8_quant( + torch::Tensor& out, // [..., hidden_size], + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + bool ptrs_are_aligned = + inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); + } +} diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index 3a464c5f327ad..498d069c05f0d 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize); } out += kChunkSize; + + int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); + // in case the final state is separated between the last "smem_exchange" and + // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), + // (which occurs when `final_state_position` is a non-positivie index) + // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it + if (final_state_position < 0 && seqlen > kWidth){ + input_t vals_load[kNElts] = {0}; + if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ + // chunk = n_chunks - 2, a segment of the final state sits in the last index + reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1]; + #pragma unroll + for (int w = 0; w < -final_state_position; ++w){ + conv_states[w] = vals_load[kNElts + final_state_position + w]; + } + } + if ((chunk == n_chunks - 1) && tidx == 0){ + // chunk = n_chunks - 1, the second segment of the final state first positions + reinterpret_cast(vals_load)[0] = smem_exchange[0]; + for (int w = -final_state_position; w < kWidth - 1; ++w){ + conv_states[w] = vals_load[w + final_state_position]; + } + return; + } + } } // Final 
state is stored in the smem_exchange last token slot, // in case seqlen < kWidth, we would need to take the final state from the @@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { } else { // in case the final state is in between the threads data - reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; - reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; const int offset = ((seqlen - (kWidth - 1)) % (kNElts)); + if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){ + // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a + // illegal access error on H100. + // Therefore, we access last_thread + 1, only if the final state data sits there + reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; + } + reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; #pragma unroll for (int w = 0; w < kWidth - 1; ++w){ conv_states[w] = x_vals_load[offset + w ]; diff --git a/csrc/ops.h b/csrc/ops.h index c50eb39a3dacc..672e608e9c47e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -56,6 +56,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& weight, torch::Tensor& scale, + double epsilon); + +void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& residual, + torch::Tensor& weight, + torch::Tensor& scale, double epsilon); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); @@ -199,20 +209,16 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, #ifndef USE_ROCM using fptr_t = int64_t; -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, - bool full_nvlink); -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out); -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out); +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink); +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); void dispose(fptr_t _fa); int64_t meta_size(); -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets); -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa); -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs); +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa); +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets); #endif diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 46fef79f439fb..bd184ee22682e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 
+135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr, - int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. 
Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index f2c609c1b68c3..e4f6615ede1ee 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,185 +1,16 @@ -#include -#include -#include - -#include - -#include "cuda_compat.h" +#include "common.cuh" #include "dispatch_utils.h" +#include + #ifndef USE_ROCM - #include #include #else - #include #include #endif -#ifndef USE_ROCM -using FP8_TYPE = c10::Float8_e4m3fn; -C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = - std::numeric_limits::max(); -#else - #include "amd/hip_float8.h" -using FP8_TYPE = c10::Float8_e4m3fnuz; -// Using the default max value from pytorch (240.0) will cause accuracy -// issue when running dynamic quantization. Here use 224.0f for rocm. -constexpr auto FP8_E4M3_MAX = 224.0f; -#endif - namespace vllm { -__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { - float old; - old = (value >= 0) - ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) - : __uint_as_float( - atomicMin((unsigned int*)addr, __float_as_uint(value))); - - return old; -} - -template -__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, - float const scale) { - float x = 0.0f; - if constexpr (is_scale_inverted) { - x = val * scale; - } else { - x = val / scale; - } - - float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); -#ifndef USE_ROCM - return static_cast(r); -#else - // Use hardware cvt instruction for fp8 on rocm - return c10::Float8_e4m3fnuz(hip_fp8(r).data, - c10::Float8_e4m3fnuz::from_bits()); -#endif -} - -// Compute the absolute maximum m of the input tensor and store -// m / float8_e4m3::max() in *scale. Each thread block performs a -// reduction tree and the memory in scale is atomically updated. -// So to get the right answer, *scale needs to be initialized to -// a value <= 0.0 and we need to wait for all thread blocks to -// finish before consuming *scale. 
-template -__global__ void segmented_max_reduction(float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { - __shared__ float cache[1024]; - int64_t i = blockDim.x * blockIdx.x + threadIdx.x; - - // First store maximum for all values processes by - // the current thread in cache[threadIdx.x] - scalar_t tmp = 0.0; - while (i < num_elems) { - float x = static_cast(input[i]); - tmp = max(tmp, fabs(x)); - i += blockDim.x * gridDim.x; - } - cache[threadIdx.x] = tmp; - - __syncthreads(); - - // Now perform parallel reduction within the thread block - int ib = blockDim.x / 2; - while (ib != 0) { - if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; - } - __syncthreads(); - ib /= 2; - } - // Finally, since cache[0] contains the maximum for this thread block, - // atomically write the max to the target location - if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); - } -} - -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; -}; - -typedef struct __align__(4) { - FP8_TYPE x; - FP8_TYPE y; - FP8_TYPE z; - FP8_TYPE w; -} -float8x4_t; - -template -__device__ float thread_max_vec(scalar_t const* __restrict__ input, - int64_t const num_elems, int const tid, - int const step) { - // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - - int64_t const num_vec_elems = num_elems >> 2; - float absmax_val = 0.0f; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - absmax_val = max(absmax_val, fabs(in_vec.x)); - absmax_val = max(absmax_val, fabs(in_vec.y)); - absmax_val = max(absmax_val, fabs(in_vec.z)); - absmax_val = max(absmax_val, fabs(in_vec.w)); - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - absmax_val = max(absmax_val, fabs(input[i])); - } - - return absmax_val; -} - -template -__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, - scalar_t const* __restrict__ input, - float const scale, - int64_t const num_elems, - int const tid, int const step) { - // Vectorized input/output to better utilize memory bandwidth. 
- vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - float8x4_t* vectorized_out = reinterpret_cast(out); - - int64_t const num_vec_elems = num_elems >> 2; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - float8x4_t out_vec; - - out_vec.x = scaled_fp8_conversion( - static_cast(in_vec.x), scale); - out_vec.y = scaled_fp8_conversion( - static_cast(in_vec.y), scale); - out_vec.z = scaled_fp8_conversion( - static_cast(in_vec.z), scale); - out_vec.w = scaled_fp8_conversion( - static_cast(in_vec.w), scale); - vectorized_out[i] = out_vec; - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - out[i] = scaled_fp8_conversion( - static_cast(input[i]), scale); - } -} - template __global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out, const scalar_t* __restrict__ input, diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh new file mode 100644 index 0000000000000..d7c0297d5333f --- /dev/null +++ b/csrc/quantization/fp8/common.cuh @@ -0,0 +1,172 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include +using FP8_TYPE = c10::Float8_e4m3fn; +C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = + std::numeric_limits::max(); +#else + #include + #include "amd/hip_float8.h" +using FP8_TYPE = c10::Float8_e4m3fnuz; +// Using the default max value from pytorch (240.0) will cause accuracy +// issue when running dynamic quantization. Here use 224.0f for rocm. +constexpr auto FP8_E4M3_MAX = 224.0f; +#endif + +namespace vllm { + +__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { + float old; + old = (value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int*)addr, __float_as_uint(value))); + + return old; +} + +template +__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, + float const scale) { + float x = 0.0f; + if constexpr (is_scale_inverted) { + x = val * scale; + } else { + x = val / scale; + } + + float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); +#ifndef USE_ROCM + return static_cast(r); +#else + // Use hardware cvt instruction for fp8 on rocm + return c10::Float8_e4m3fnuz(hip_fp8(r).data, + c10::Float8_e4m3fnuz::from_bits()); +#endif +} + +// Compute the absolute maximum m of the input tensor and store +// m / float8_e4m3::max() in *scale. Each thread block performs a +// reduction tree and the memory in scale is atomically updated. +// So to get the right answer, *scale needs to be initialized to +// a value <= 0.0 and we need to wait for all thread blocks to +// finish before consuming *scale. 
+template +__global__ void segmented_max_reduction(float* __restrict__ scale, + const scalar_t* __restrict__ input, + int64_t num_elems) { + __shared__ float cache[1024]; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; + + // First store maximum for all values processes by + // the current thread in cache[threadIdx.x] + scalar_t tmp = 0.0; + while (i < num_elems) { + float x = static_cast(input[i]); + tmp = max(tmp, fabs(x)); + i += blockDim.x * gridDim.x; + } + cache[threadIdx.x] = tmp; + + __syncthreads(); + + // Now perform parallel reduction within the thread block + int ib = blockDim.x / 2; + while (ib != 0) { + if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { + cache[threadIdx.x] = cache[threadIdx.x + ib]; + } + __syncthreads(); + ib /= 2; + } + // Finally, since cache[0] contains the maximum for this thread block, + // atomically write the max to the target location + if (threadIdx.x == 0) { + atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); + } +} + +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +typedef struct __align__(4) { + FP8_TYPE x; + FP8_TYPE y; + FP8_TYPE z; + FP8_TYPE w; +} +float8x4_t; + +template +__device__ float thread_max_vec(scalar_t const* __restrict__ input, + int64_t const num_elems, int const tid, + int const step) { + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + + int64_t const num_vec_elems = num_elems >> 2; + float absmax_val = 0.0f; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + absmax_val = max(absmax_val, fabs(in_vec.x)); + absmax_val = max(absmax_val, fabs(in_vec.y)); + absmax_val = max(absmax_val, fabs(in_vec.z)); + absmax_val = max(absmax_val, fabs(in_vec.w)); + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + absmax_val = max(absmax_val, fabs(input[i])); + } + + return absmax_val; +} + +template +__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, + scalar_t const* __restrict__ input, + float const scale, + int64_t const num_elems, + int const tid, int const step) { + // Vectorized input/output to better utilize memory bandwidth. 
+ vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + float8x4_t* vectorized_out = reinterpret_cast(out); + + int64_t const num_vec_elems = num_elems >> 2; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + float8x4_t out_vec; + + out_vec.x = scaled_fp8_conversion( + static_cast(in_vec.x), scale); + out_vec.y = scaled_fp8_conversion( + static_cast(in_vec.y), scale); + out_vec.z = scaled_fp8_conversion( + static_cast(in_vec.z), scale); + out_vec.w = scaled_fp8_conversion( + static_cast(in_vec.w), scale); + vectorized_out[i] = out_vec; + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + out[i] = scaled_fp8_conversion( + static_cast(input[i]), scale); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ebbe76cfb944a..d126af1849024 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -468,7 +468,7 @@ def generate(): impl_configs = [] GPTQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -476,7 +476,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) GPTQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=False, with_scales=True) @@ -490,7 +490,7 @@ def generate(): ] AWQ_kernel_type_configs = list( - (TypeConfig( + TypeConfig( element_a=element_a, element_b=element_b, element_b_scale=element_a, @@ -498,7 +498,7 @@ def generate(): element_d=element_a, accumulator=DataType.f32, ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16))) + for element_a in (DataType.f16, DataType.bf16)) AWQ_kernel_specializations = [ Specialization(with_C=False, with_zeropoints=True, with_scales=True) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index b8185c24d5628..229fd554d3eee 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -101,7 +101,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( - "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> " "()"); ops.impl("rms_norm", torch::kCUDA, &rms_norm); @@ -111,6 +111,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Layernorm-quant + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, " + "Tensor scale, float epsilon) -> " + "()"); + ops.impl("rms_norm_static_fp8_quant", torch::kCUDA, + &rms_norm_static_fp8_quant); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, " + "Tensor! residual, Tensor weight, " + "Tensor scale, float epsilon) -> ()"); + ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA, + &fused_add_rms_norm_static_fp8_quant); + // Rotary embedding // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. 
ops.def( @@ -322,18 +339,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute FP8 quantized tensor for given scaling factor. ops.def( - "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); + "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> " + "()"); ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant); // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor. ops.def( - "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> " + "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) " + "-> " "()"); ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant); // Compute dynamic-per-token FP8 quantized tensor and scaling factor. ops.def( - "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, " + "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, " "Tensor! scale, Tensor? scale_ub) -> " "()"); ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, @@ -341,13 +360,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," + "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale," "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " + "dynamic_scaled_int8_quant(Tensor! result, Tensor input, Tensor! scale, " "Tensor!? azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); @@ -411,27 +430,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { // Custom all-reduce kernels custom_ar.def( - "init_custom_ar(Tensor meta, Tensor rank_data, " - "str[] handles, int[] offsets, int rank, " - "bool full_nvlink) -> int"); + "init_custom_ar(int[] ipc_tensors, Tensor rank_data, " + "int rank, bool full_nvlink) -> int"); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); - - custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); - custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg); - custom_ar.def( - "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> " - "()"); - custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); + "all_reduce(int fa, Tensor inp, Tensor! 
out, int reg_buffer, " + "int reg_buffer_sz_bytes) -> ()"); + custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce); custom_ar.def("dispose", &dispose); custom_ar.def("meta_size", &meta_size); - custom_ar.def( - "register_buffer(int fa, Tensor t, str[] handles, " - "int[] offsets) -> ()"); - custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer); - + custom_ar.def("register_buffer", ®ister_buffer); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("register_graph_buffers", ®ister_graph_buffers); } diff --git a/csrc/type_convert.cuh b/csrc/type_convert.cuh new file mode 100644 index 0000000000000..21b9d0ae515df --- /dev/null +++ b/csrc/type_convert.cuh @@ -0,0 +1,165 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include + #include +#else + #include + #include + +using __nv_bfloat16 = __hip_bfloat16; +using __nv_bfloat162 = __hip_bfloat162; +#endif + +namespace vllm { +/* Converter structs for the conversion from torch types to HIP/CUDA types, + and the associated type conversions within HIP/CUDA. These helpers need + to be implemented for now because the relevant type conversion + operators/constructors are not consistently implemented by HIP/CUDA, so + a generic conversion via type casts cannot be implemented. + + Each struct should have the member static constexpr bool `exists`: + If false, the optimized kernel is not used for the corresponding torch type. + If true, the struct should be fully defined as shown in the examples below. + */ +template +struct _typeConvert { + static constexpr bool exists = false; +}; + +#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +// CUDA < 12.0 runs into issues with packed type conversion +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __half; + using packed_hip_type = __half2; + + __device__ static inline float convert(hip_type x) { return __half2float(x); } + __device__ static inline float2 convert(packed_hip_type x) { + return __half22float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2half_rn(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22half2_rn(x); + } +}; + + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +// CUDA_ARCH < 800 does not have BF16 support +// TODO: Add in ROCm support once public headers handle bf16 maturely +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __nv_bfloat16; + using packed_hip_type = __nv_bfloat162; + + __device__ static inline float convert(hip_type x) { + return __bfloat162float(x); + } + __device__ static inline float2 convert(packed_hip_type x) { + return __bfloat1622float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2bfloat16(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22bfloat162_rn(x); + } +}; + #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= + // 12000)) + +/* Vector POD struct to generate vectorized and packed FP16/BF16 ops + for appropriate specializations of fused_add_rms_norm_kernel. + Only functions that are necessary in that kernel are implemented. + Alignment to 16 bytes is required to use 128-bit global memory ops. 
+ */ +template +struct alignas(16) _f16Vec { + /* Not theoretically necessary that width is a power of 2 but should + almost always be the case for optimization purposes */ + static_assert(width > 0 && (width & (width - 1)) == 0, + "Width is not a positive power of 2!"); + using Converter = _typeConvert; + using T1 = typename Converter::hip_type; + using T2 = typename Converter::packed_hip_type; + T1 data[width]; + + __device__ _f16Vec& operator+=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp += T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] += other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp *= T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] *= other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const float scale) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); + temp_f.x *= scale; + temp_f.y *= scale; + T2 temp = Converter::convert(temp_f); + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float temp = Converter::convert(data[i]) * scale; + data[i] = Converter::convert(temp); + } + } + return *this; + } + + __device__ float sum_squares() const { + float result = 0.0f; + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + result += z.x * z.x + z.y * z.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float x = Converter::convert(data[i]); + result += x * x; + } + } + return result; + } +}; +} // namespace vllm \ No newline at end of file diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index d58f226136918..e3e35844405ac 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -13,5 +13,7 @@ torch py-cpuinfo transformers mistral_common >= 1.3.4 +aiohttp +starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index f475be71fc448..18b502c786e1d 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -9,6 +9,8 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; diff --git a/docs/source/assets/design/hierarchy.png b/docs/source/assets/design/hierarchy.png new file mode 100644 index 0000000000000..6a1b4ba9590ba Binary files /dev/null and b/docs/source/assets/design/hierarchy.png differ diff --git a/docs/source/conf.py b/docs/source/conf.py index 8435129e752e1..96ad9a4c26b09 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -96,7 +96,6 @@ def setup(app): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ - "aiohttp", "compressed_tensors", "cpuinfo", "cv2", @@ -117,6 +116,7 @@ def setup(app): "soundfile", "gguf", "lark", + "decord", ] for mock_target in autodoc_mock_imports: @@ -143,6 +143,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "python": ("https://docs.python.org/3", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst similarity index 100% rename from docs/source/dev/dockerfile/dockerfile.rst rename to docs/source/contributing/dockerfile/dockerfile.rst diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.rst new file mode 100644 index 0000000000000..ac2d2b2fe4103 --- /dev/null +++ b/docs/source/contributing/overview.rst @@ -0,0 +1,70 @@ +Contributing to vLLM +===================== + +Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: + +- Identify and report any issues or bugs. +- Request or add support for a new model. +- Suggest or implement new features. +- Improve documentation or contribute a how-to guide. + +We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. + +Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! + +License +------- + +See `LICENSE `_. + +Developing +---------- + +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. + +Testing +------- + +.. code-block:: bash + + pip install -r requirements-dev.txt + + # linting and formatting + bash format.sh + # Static type checking + mypy + # Unit tests + pytest tests/ + +.. note:: Currently, the repository does not pass the ``mypy`` tests. + +Contribution Guidelines +======================= + +DCO and Signed-off-by +---------------------- + +When contributing changes to this project, you must agree to the `DCO `_. 
+Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +Issues +------ + +If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. important:: + If you discover a security vulnerability, please follow the instructions `here `_. + +Pull Requests & Code Reviews +---------------------------- + +Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. + +Thank You +--------- + +Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. +All of your contributions help make vLLM a great tool and community for everyone! diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst similarity index 97% rename from docs/source/dev/profiling/profiling_index.rst rename to docs/source/contributing/profiling/profiling_index.rst index 9e8b2f1817567..a422b1fcda521 100644 --- a/docs/source/dev/profiling/profiling_index.rst +++ b/docs/source/contributing/profiling/profiling_index.rst @@ -1,5 +1,6 @@ -Profiling vLLM -================================= +============== +Profiling vLLM +============== We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst new file mode 100644 index 0000000000000..b3404f6b936e7 --- /dev/null +++ b/docs/source/design/class_hierarchy.rst @@ -0,0 +1,33 @@ +vLLM's Class Hierarchy +======================= + +This document describes the class hierarchy of vLLM. We will explain the relationships between the core classes, their responsibilities, and the design choices behind them to make vLLM more modular and extensible. + +1. **Entrypoints**: vLLM has two entrypoints: `command line usage `__ with ``vllm serve`` for launching an OpenAI-API compatible server, and `library-style usage `__ with the ``vllm.LLM`` class for running inference in a Python script. These are user-facing entrypoints that end-users interact with. Under the hood, both create an engine object to handle model inference. + +2. **Engine**: Each vLLM instance contains one engine object, orchestrating and serving as the control plane for model inference. Depending on the configuration, the engine can create multiple workers to handle the inference workload. + +3. **Worker**: A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their ``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while ``local_rank`` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. + +4. **Model Runner**: Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. + +5. 
**Model**: Every model runner object has one model object, which is the actual ``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various configurations affect the class we ultimately get. + +The following figure shows the class hierarchy of vLLM: + + .. figure:: ../assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object containing all the necessary information. The `VllmConfig `__ class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily pass the configuration object around and access the configuration we need. Suppose we want to add a new feature (this is often the case given how fast the field of LLM inference is evolving) that only touches the model runner. We will have to add a new configuration option in the `VllmConfig` class. Since we pass the whole config object around, we only need to add the configuration option to the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. + +2. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. + +3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. 
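+
+To make this pattern concrete, below is a minimal sketch of a uniform, config-driven constructor. Except for ``VllmConfig``, the class and field names are illustrative only and are not vLLM's exact API:
+
+.. code-block:: python
+
+    from dataclasses import dataclass
+    import torch.nn as nn
+
+    @dataclass
+    class VllmConfig:
+        # Hypothetical subset of fields; the real VllmConfig carries far more options.
+        hidden_size: int = 16
+        quantized_prefixes: tuple = ("language",)
+
+    class SubModel(nn.Module):
+        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+            super().__init__()
+            # The prefix tells this module where it sits in the checkpoint, so it
+            # can decide at construction time whether its weights are quantized.
+            self.quantized = any(prefix.startswith(p) for p in vllm_config.quantized_prefixes)
+            self.proj = nn.Linear(vllm_config.hidden_size, vllm_config.hidden_size)
+
+    class VisionLanguageModel(nn.Module):
+        # Every model exposes the same constructor signature, so the model runner
+        # can instantiate any architecture without per-model glue code.
+        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+            super().__init__()
+            self.vision = SubModel(vllm_config=vllm_config, prefix=f"{prefix}vision")
+            self.language = SubModel(vllm_config=vllm_config, prefix=f"{prefix}language")
+
+    model = VisionLanguageModel(vllm_config=VllmConfig(), prefix="")
+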
The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file. + +One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. + +In summary, the complete config object ``VllmConfig`` can be treated as an engine-level global state that is shared among all vLLM classes. diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst new file mode 100644 index 0000000000000..e6c1cea6001ea --- /dev/null +++ b/docs/source/design/huggingface_integration.rst @@ -0,0 +1,40 @@ +.. _huggingface_integration: + +Integration with HuggingFace +=================================== + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``. + +Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. + +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: + + - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + + - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. + + - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. + +3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. 
It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. + + - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. 
diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst similarity index 100% rename from docs/source/dev/input_processing/input_processing_pipeline.rst rename to docs/source/design/input_processing/input_processing_pipeline.rst diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst similarity index 100% rename from docs/source/dev/input_processing/model_inputs_index.rst rename to docs/source/design/input_processing/model_inputs_index.rst diff --git a/docs/source/dev/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst similarity index 100% rename from docs/source/dev/kernel/paged_attention.rst rename to docs/source/design/kernel/paged_attention.rst diff --git a/docs/source/dev/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst similarity index 100% rename from docs/source/dev/multimodal/adding_multimodal_plugin.rst rename to docs/source/design/multimodal/adding_multimodal_plugin.rst diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst similarity index 97% rename from docs/source/dev/multimodal/multimodal_index.rst rename to docs/source/design/multimodal/multimodal_index.rst index e112b43aade5e..30f543abc20c7 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.rst @@ -53,7 +53,7 @@ Base Classes .. autodata:: vllm.multimodal.MultiModalDataDict -.. autoclass:: vllm.multimodal.MultiModalInputs +.. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: diff --git a/docs/source/dev/pooling_params.rst b/docs/source/dev/pooling_params.rst new file mode 100644 index 0000000000000..334e0287aff09 --- /dev/null +++ b/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 301337aebcf4c..ece5d785e0c65 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -13,8 +13,6 @@ Requirements * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * ROCm 6.2 -Note: PyTorch 2.5+/ROCm6.2 dropped the support for python 3.8. - Installation options: #. :ref:`Build from source with docker ` diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index d12aeebbbc184..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,13 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - Tensor Parallel (``-tp = N``) - Quantization (``INT8 W8A8, AWQ``) .. note:: - FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. 
+ More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -72,8 +72,6 @@ Build from source $ VLLM_TARGET_DEVICE=cpu python setup.py install .. note:: - - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support. - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 91978065faf42..77bf550601346 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. @@ -75,6 +79,9 @@ If GPU/CPU communication cannot be established, you can use the following Python print("PyTorch GLOO is successful!") + if world_size <= 1: + exit() + # Test vLLM NCCL, with cuda graph from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator @@ -119,6 +126,8 @@ If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes` If the script runs successfully, you should see the message ``sanity check is successful!``. +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. 
The best solution is still to fix the hardware/drivers so that the test script can run successfully. + .. note:: A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst new file mode 100644 index 0000000000000..68c1a56660fa4 --- /dev/null +++ b/docs/source/getting_started/gaudi-installation.rst @@ -0,0 +1,402 @@ +Installation with Intel® Gaudi® AI Accelerators +=============================================== + +This README provides instructions on running vLLM with Intel Gaudi devices. + +Requirements and Installation +============================= + +Please follow the instructions provided in the `Gaudi Installation +Guide `__ +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the `Optimizing Training Platform +Guide `__. + +Requirements +------------ + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + + +Quick start using Dockerfile +---------------------------- +.. code:: console + + $ docker build -f Dockerfile.hpu -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + + +.. tip:: + If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. + + +Build from source +----------------- + +Environment verification +~~~~~~~~~~~~~~~~~~~~~~~~ + +To verify that the Intel Gaudi software was correctly installed, run: + +.. code:: console + + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed + $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural_compressor is installed + +Refer to `Intel Gaudi Software Stack +Verification `__ +for more details. + +Run Docker Image +~~~~~~~~~~~~~~~~ + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the `Intel Gaudi +documentation `__ +for more details. + +Use the following commands to run a Docker image: + +.. code:: console + + $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +Build and Install vLLM +~~~~~~~~~~~~~~~~~~~~~~ + +To build and install vLLM from source, run: + +.. 
code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + +Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: + +.. code:: console + + $ git clone https://github.com/HabanaAI/vllm-fork.git + $ cd vllm-fork + $ git checkout habana_main + $ python setup.py develop + + +Supported Features +================== + +- `Offline batched + inference `__ +- Online inference via `OpenAI-Compatible + Server `__ +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with `HPU Graphs `__ + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +Unsupported Features +==================== + +- Beam search +- LoRA adapters +- Quantization +- Prefill chunking (mixed-batch inferencing) + +Supported Configurations +======================== + +The following configurations have been validated to be function with +Gaudi2 devices. Configurations that are not listed may or may not work. + +- `meta-llama/Llama-2-7b `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-7b-chat-hf `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-70b `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Llama-2-70b-chat-hf `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +Performance Tuning +================== + +Execution modes +--------------- + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. + +.. 
list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode + +.. warning:: + In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. + + +Bucketing mechanism +------------------- + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. + +.. note:: + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +.. code-block:: + + INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + +``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. 
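+
+As a rough, executable restatement of this rule (a hypothetical helper, not vLLM's actual implementation), bucket generation along one dimension can be sketched as:
+
+.. code-block:: python
+
+    def warmup_buckets(bmin: int, bstep: int, bmax: int) -> list:
+        """Ramp up by powers of two from bmin until bstep, then linear steps of bstep up to bmax."""
+        ramp_up = []
+        value = bmin
+        while value < bstep:
+            ramp_up.append(value)
+            value *= 2
+        stable = list(range(bstep, bmax + 1, bstep))
+        return ramp_up + stable
+
+    assert warmup_buckets(2, 32, 64) == [2, 4, 8, 16, 32, 64]
+    assert warmup_buckets(128, 128, 512) == [128, 256, 384, 512]
+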
This doubling is called the ramp-up phase, and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes.
+
+Example (with ramp-up)
+
+.. code-block::
+
+  min = 2, step = 32, max = 64
+  => ramp_up = (2, 4, 8, 16)
+  => stable = (32, 64)
+  => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64)
+
+Example (without ramp-up)
+
+.. code-block::
+
+  min = 128, step = 128, max = 512
+  => ramp_up = ()
+  => stable = (128, 256, 384, 512)
+  => buckets = ramp_up + stable => (128, 256, 384, 512)
+
+
+In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with the specified tensor shapes. Whenever a batch of requests is processed, it is padded across the batch and sequence length dimensions to the smallest possible bucket.
+
+.. warning::
+  If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such a scenario.
+
+As an example, if a request of 3 sequences, with a maximum sequence length of 412, comes in to an idle vLLM server, it will be padded and executed as a ``(4, 512)`` prefill bucket, as ``batch_size`` (the number of sequences) will be padded to 4 (the closest batch size dimension higher than 3) and the maximum sequence length will be padded to 512 (the closest sequence length dimension higher than 412). After the prefill stage, it will be executed as a ``(4, 512)`` decode bucket and will continue as that bucket until either the batch dimension changes (due to a request being finished) - in which case it will become a ``(2, 512)`` bucket - or the context length increases above 512 tokens, in which case it will become a ``(4, 640)`` bucket.
+
+.. note::
+  Bucketing is transparent to a client - padding in the sequence length dimension is never returned to the client, and padding in the batch dimension does not create new requests.
+
+Warmup
+------
+
+Warmup is an optional, but highly recommended, step that occurs before the vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and avoid incurring any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup:
+
+.. code-block::
+
+  INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB
+  INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB
+  INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB
+  ...
+  INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB
+  INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB
+  INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB
+  INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB
+  ...
+ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + +This example uses the same buckets as in the *Bucketing mechanism* section. Each output line corresponds to the execution of a single bucket. When a bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +.. tip:: + Compiling all the buckets might take some time and can be turned off with the ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations when executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. + +HPU Graph capture +----------------- + +`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + + +When HPU Graphs are being used, they share the common memory pool ("usable memory") with the KV cache, as determined by the ``gpu_memory_utilization`` flag (``0.9`` by default). +Before the KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. +Only after that is the ``gpu_memory_utilization`` flag applied - at its default value, it marks 90% of the free device memory at that point as usable. +Next, the KV cache gets allocated, the model is warmed up, and HPU Graphs are captured. +The environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. +With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for the KV cache. +The environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. +A lower value corresponds to less usable graph memory reserved for the prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +.. note:: + ``gpu_memory_utilization`` does not correspond to the absolute memory usage across the HPU. It specifies the memory margin after loading the model and performing a profile run. If a device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing the profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +The user can also configure the strategy for capturing HPU Graphs for the prompt and decode stages separately. The strategy affects the order in which graphs are captured. There are two strategies implemented: +- ``max_bs`` - the graph capture queue will be sorted in descending order by batch size.
Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1, 256)``); this is the default strategy for decode +- ``min_tokens`` - the graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``); this is the default strategy for prompt + +When there is a large number of requests pending, the vLLM scheduler will attempt to fill the maximum decode batch size as soon as possible. When a request is finished, the decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, the decode batch size is often at its maximum, which makes large-batch-size HPU Graphs crucial to capture, as reflected by the ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in the ``min_tokens`` strategy. + + +.. note:: + ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on the memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up the entirety of the usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs; next, it will attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured and there is unused memory left within the usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. This behavior can be observed in the example below. + + +Each described step is logged by the vLLM server, as follows (negative values correspond to memory being released): + +..
code-block:: + + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... 
+ INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + + +Recommended vLLM Parameters +--------------------------- + +- We recommend running inference on Gaudi 2 with ``block_size`` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see `Gaudi + Architecture `__). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Environment variables +--------------------- + +**Diagnostic and profiling knobs:** + +- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. 
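+
+For illustration, below is a minimal, hypothetical sketch of enabling a few of these diagnostic knobs for an offline run; the model name is only a placeholder and the exact set of knobs depends on what you are debugging:
+
+.. code-block:: python
+
+    import os
+
+    # Set the knobs before vLLM is imported so that the engine picks them up.
+    os.environ["VLLM_PROFILER_ENABLED"] = "true"
+    os.environ["VLLM_HPU_LOG_STEP_GRAPH_COMPILATION"] = "true"
+    os.environ["PT_HPU_METRICS_GC_DETAILS"] = "1"
+
+    from vllm import LLM, SamplingParams
+
+    llm = LLM(model="meta-llama/Llama-2-7b-hf")  # placeholder model
+    outputs = llm.generate(["Hello, my name is"],
+                           SamplingParams(temperature=0.0, max_tokens=32))
+    print(outputs[0].outputs[0].text)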
+ +**Performance tuning knobs:** + +- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default +- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default +- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default +- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - ``{phase}`` is either ``PROMPT`` or ``DECODE`` + - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` + - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` + - Default values: + + - Prompt: + - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` + - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` + + - Decode: + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` + + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default +- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs + +Troubleshooting: Tweaking HPU Graphs +==================================== + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak ``gpu_memory_utilization`` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default ``gpu_memory_utilization`` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this method is not efficient, you can disable ``HPUGraph`` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding ``--enforce-eager`` flag to + server (for online inference), or by passing ``enforce_eager=True`` + argument to LLM constructor (for offline inference). 
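+
+As a minimal, illustrative sketch of the two options above (the model name and values below are placeholders, not tuned recommendations), this is how they map onto the ``LLM`` constructor for offline inference:
+
+.. code-block:: python
+
+    from vllm import LLM
+
+    # Option 1: keep HPU Graphs enabled, but lower gpu_memory_utilization
+    # (0.9 by default) to leave more headroom for graph capture. Note that
+    # this also reduces the number of available KV cache blocks.
+    llm = LLM(model="meta-llama/Llama-2-7b-hf",  # placeholder model
+              gpu_memory_utilization=0.8)
+
+    # Option 2: disable HPU Graphs entirely by enforcing eager execution,
+    # trading latency and throughput at lower batch sizes for memory headroom.
+    # llm = LLM(model="meta-llama/Llama-2-7b-hf", enforce_eager=True)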
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index a706b285edede..f02626bda4c64 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -10,7 +10,7 @@ Requirements ============ * OS: Linux -* Python: 3.8 - 3.12 +* Python: 3.9 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. Another way to access the latest code is to use the docker images: @@ -148,7 +148,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. @@ -181,8 +181,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example $ export MAX_JOBS=6 $ pip install -e . -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 
@@ -209,7 +209,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build -------------------- -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index ec99fc013057b..025ba6ef7ebd8 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -11,7 +11,7 @@ Requirements ------------ * OS: Linux -* Python: 3.8 -- 3.11 +* Python: 3.9 -- 3.11 * Accelerator: NeuronCore_v2 (in trn1/inf2 instances) * Pytorch 2.0.1/2.1.1 * AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index f0e6cddf09ef7..0c0491c860563 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -12,7 +12,7 @@ This guide will help you quickly get started with vLLM to: Prerequisites -------------- - OS: Linux -- Python: 3.8 - 3.12 +- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Installation @@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep A more detailed client example can be found `here `__. -OpenAI Chat API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. You can use the `create chat completion `_ endpoint to interact with the model: @@ -157,7 +157,7 @@ You can use the `create chat completion `_. +For more information on the TPU versions supported with vLLM, see: + +* `TPU v6e `_ +* `TPU v5e `_ +* `TPU v5p `_ +* `TPU v4 `_ + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +* `TPU v6e topologies `_ +* `TPU v5e topologies `_ +* `TPU v5p topologies `_ +* `TPU v4 topologies `_ + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GPC project and are specified in terms of TPU version, the number of TPU you +want to use, and quota type. For more information, see `TPU quota `_. 
+ +For TPU pricing information, see `Cloud TPU pricing `_. + +You may need additional persistent storage for your TPU VMs. For more +information, see `Storage options for Cloud TPU data `_. Requirements ------------ -* Google Cloud TPU VM (single & multi host) -* TPU versions: v5e, v5p, v4 -* Python: 3.10 +* Google Cloud TPU VM +* TPU versions: v6e, v5e, v5p, v4 +* Python: 3.10 or newer -Installation options: +Provision Cloud TPUs +==================== -1. :ref:`Build a docker image with Dockerfile `. -2. :ref:`Build from source `. +You can provision Cloud TPUs using the `Cloud TPU API `_` +or the `queued resources `_` +API. This section shows how to create TPUs using the queued resource API. +For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +`Queued resources `_ +enable you to request Cloud TPU resources in a queued manner. When you request +queued resources, the request is added to a queue maintained by the Cloud TPU +service. When the requested resource becomes available, it's assigned to your +Google Cloud project for your immediate exclusive use. -.. _build_docker_tpu: +Provision a Cloud TPU with the queued resource API +-------------------------------------------------- +Create a TPU v5e with 4 TPU chips: -Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------- +.. code-block:: console -`Dockerfile.tpu `_ is provided to build a docker image with TPU support. + gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT -.. code-block:: console +.. list-table:: Parameter descriptions + :header-rows: 1 - $ docker build -f Dockerfile.tpu -t vllm-tpu . + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The `zone `_ where you + want to create your Cloud TPU. + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, followed by a + '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU + with 4 cores. For more information, see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` +Connect to your TPU using SSH: -You can run the docker image with the following command: +.. code-block:: bash -.. code-block:: console + gcloud compute tpus tpu-vm ssh TPU_NAME - $ # Make sure to add `--privileged --net host --shm-size=16G`. - $ docker run --privileged --net host --shm-size=16G -it vllm-tpu +Create and activate a Conda environment for vLLM: +.. code-block:: bash -.. _build_from_source_tpu: + conda create -n vllm python=3.10 -y + conda activate vllm -Build from source ------------------ +Clone the vLLM repository and go to the vLLM directory: -You can also build and install the TPU backend from source. +.. code-block:: bash -First, install the dependencies: + git clone https://github.com/vllm-project/vllm.git && cd vllm -.. 
code-block:: console +Uninstall the existing `torch` and `torch_xla` packages: + +.. code-block:: bash - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y - $ conda activate myenv + pip uninstall torch torch-xla -y - $ # Clean up the existing torch and torch-xla packages. - $ pip uninstall torch torch-xla -y +Install build dependencies: - $ # Install PyTorch and PyTorch XLA. - $ export DATE="20241017" - $ export TORCH_VERSION="2.6.0" - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl +.. code-block:: bash - $ # Install JAX and Pallas. - $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html - $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev - $ # Install other build dependencies. - $ pip install -r requirements-tpu.txt +Run the setup script: +.. code-block:: bash -Next, build vLLM from source. This will only take a few seconds: + VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +Provision Cloud TPUs with GKE +----------------------------- + +For more information about using TPUs with GKE, see +https://cloud.google.com/kubernetes-engine/docs/how-to/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus + +.. _build_docker_tpu: + +Build a docker image with :code:`Dockerfile.tpu` +------------------------------------------------ + +You can use `Dockerfile.tpu `_ +to build a Docker image with TPU support. .. code-block:: console - $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + $ docker build -f Dockerfile.tpu -t vllm-tpu . + +Run the Docker image with the following command: + +.. code-block:: console + $ # Make sure to add `--privileged --net host --shm-size=16G`. + $ docker run --privileged --net host --shm-size=16G -it vllm-tpu .. note:: @@ -82,7 +166,6 @@ Next, build vLLM from source. This will only take a few seconds: The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). - .. tip:: If you encounter the following error: @@ -93,7 +176,7 @@ Next, build vLLM from source. This will only take a few seconds: ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory - Please install OpenBLAS with the following command: + Install OpenBLAS with the following command: .. code-block:: console diff --git a/docs/source/index.rst b/docs/source/index.rst index b93242aa8e039..c57ff553f18bc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -43,7 +43,7 @@ vLLM is flexible and easy to use with: * Tensor parallelism and pipeline parallelism support for distributed inference * Streaming outputs * OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. 
+* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. * Prefix caching support * Multi-lora support @@ -67,6 +67,7 @@ Documentation getting_started/openvino-installation getting_started/cpu-installation getting_started/arm-installation + getting_started/gaudi-installation getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation @@ -126,29 +127,51 @@ Documentation .. toctree:: :maxdepth: 1 - :caption: Performance benchmarks + :caption: Performance - performance_benchmark/benchmarks + performance/benchmarks + +.. Community: User community resources + +.. toctree:: + :maxdepth: 1 + :caption: Community + + community/meetups + community/sponsors + +.. API Documentation: API reference aimed at vllm library usage .. toctree:: :maxdepth: 2 - :caption: Developer Documentation + :caption: API Documentation dev/sampling_params + dev/pooling_params dev/offline_inference/offline_index dev/engine/engine_index - dev/kernel/paged_attention - dev/input_processing/model_inputs_index - dev/multimodal/multimodal_index - dev/dockerfile/dockerfile - dev/profiling/profiling_index + +.. Design: docs about vLLM internals .. toctree:: - :maxdepth: 1 - :caption: Community + :maxdepth: 2 + :caption: Design - community/meetups - community/sponsors + design/class_hierarchy + design/huggingface_integration + design/input_processing/model_inputs_index + design/kernel/paged_attention + design/multimodal/multimodal_index + +.. For Developers: contributing to the vLLM project + +.. toctree:: + :maxdepth: 2 + :caption: For Developers + + contributing/overview + contributing/profiling/profiling_index + contributing/dockerfile/dockerfile Indices and tables ================== diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 3d0d1aec69845..49b5285c45590 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i 3. Register maximum number of multi-modal tokens ------------------------------------------------ -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. .. code-block:: diff diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst index b02c80aebec69..d57ffec53215d 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/models/spec_decode.rst @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. + titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. 
**Conclusion** @@ -197,7 +197,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. Resources for vLLM contributors ------------------------------- diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ff893b613f150..161733c049bbe 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -160,13 +160,13 @@ Text Generation - - ✅︎ * - :code:`GraniteForCausalLM` - - PowerLM - - :code:`ibm/PowerLM-3b` etc. + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - ✅︎ - ✅︎ * - :code:`GraniteMoeForCausalLM` - - PowerMoE - - :code:`ibm/PowerMoE-3b` etc. + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - ✅︎ - ✅︎ * - :code:`InternLMForCausalLM` @@ -277,11 +277,11 @@ Text Generation * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ * - :code:`Qwen2MoeForCausalLM` @@ -333,7 +333,7 @@ Text Embedding * - :code:`MistralModel` - Mistral-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - + - ✅︎ - ✅︎ .. important:: @@ -440,6 +440,18 @@ Text Generation - :code:`THUDM/glm-4v-9b` etc. - - ✅︎ + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅︎ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` @@ -466,7 +478,7 @@ Text Generation - ✅︎ * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - - T + I\ :sup:`+` + V + - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ @@ -478,7 +490,7 @@ Text Generation - ✅︎ * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - - T + I + - T + I\ :sup:`+` - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - @@ -516,7 +528,7 @@ Text Generation - Qwen-VL - T + I\ :sup:`E+` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2AudioForConditionalGeneration` - Qwen2-Audio @@ -526,9 +538,9 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - + - ✅︎ - ✅︎ * - :code:`UltravoxModel` - Ultravox @@ -540,6 +552,9 @@ Text Generation | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. note:: + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + .. 
note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 @@ -569,6 +584,12 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index a47902ab4fc9d..bcbe50a25fa09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -185,7 +185,7 @@ Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruc --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 .. important:: - Since OpenAI Vision API is based on `Chat Completions `_ API, + Since OpenAI Vision API is based on `Chat Completions API `_, a chat template is **required** to launch the API server. Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. @@ -240,8 +240,15 @@ To consume the server, you can use the OpenAI client like in the example below: ) print("Chat completion output:", chat_response.choices[0].message.content) +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. -A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. +.. tip:: + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, + and pass the file path as ``url`` in the API request. + +.. tip:: + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. .. note:: @@ -251,5 +258,73 @@ A full code example can be found in `examples/openai_api_client_for_multimodal.p $ export VLLM_IMAGE_FETCH_TIMEOUT= -.. note:: - There is no need to format the prompt in the API request since it will be handled by the server. +Chat Embeddings API +^^^^^^^^^^^^^^^^^^^ + +vLLM's Chat Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. + +.. tip:: + The schema of ``messages`` is exactly the same as in Chat Completions API. + +In this example, we will serve the ``TIGER-Lab/VLM2Vec-Full`` model. + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja + +.. important:: + + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + +.. important:: + + VLM2Vec does not expect chat-based input. We use a `custom chat template `_ + to combine the text and images together. + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: + +.. 
code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + +A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst new file mode 100644 index 0000000000000..6d4d7b544cb5d --- /dev/null +++ b/docs/source/performance/benchmarks.rst @@ -0,0 +1,33 @@ +.. _benchmarks: + +================ +Benchmark Suites +================ + +vLLM contains two sets of benchmarks: + ++ :ref:`Performance benchmarks ` ++ :ref:`Nightly benchmarks ` + + +.. _performance_benchmarks: + +Performance Benchmarks +---------------------- + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public `vLLM Performance Dashboard `_. + +More information on the performance benchmarks and their parameters can be found `here `__. + +.. _nightly_benchmarks: + +Nightly Benchmarks +------------------ + +These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. + +The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. + +More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst deleted file mode 100644 index e5c8d6a55de63..0000000000000 --- a/docs/source/performance_benchmark/benchmarks.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. 
_benchmarks: - -Benchmark suites of vLLM -======================== - - - -vLLM contains two sets of benchmarks: - -+ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. - -+ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. - - -Trigger a benchmark -------------------- - -The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. - - -.. note:: - - Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index 20a81f4cad1d1..f629b3ca78318 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -283,7 +283,7 @@ Feature x Feature - ✅ - ✅ - ✅ - - `✗ `__ + - `✗ `__ - ? - ✅ - ✅ @@ -359,7 +359,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - ✅ - ✗ * - :abbr:`logP (Logprobs)` - ✅ diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 7882e14f3b849..f39997e0e44d9 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -13,3 +13,4 @@ Integrations deploying_with_dstack serving_with_langchain serving_with_llamaindex + serving_with_llamastack diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a1f93a9a28578..200663dac4209 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -26,13 +26,26 @@ print(completion.choices[0].message) ``` ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: -- Chat: `tools`, and `tool_choice`. -- Completions: `suffix`. -vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Using VLMs](../models/vlm.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. 
+ - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models, see [Using VLMs](../models/vlm.rst). + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* ## Extra Parameters + vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. Or directly merge them into the JSON payload if you are using HTTP call directly. @@ -49,7 +62,52 @@ completion = client.chat.completions.create( ) ``` -### Extra Parameters for Chat API +### Extra HTTP Headers + +Only `X-Request-Id` HTTP request header is supported for now. + +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } +) +print(completion._request_id) + +completion = client.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } +) +print(completion._request_id) +``` + +### Extra Parameters for Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +### Extra Parameters for Chat Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py @@ -66,21 +124,22 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. 
```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params ``` The following extra parameters are supported: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params ``` ## Chat Template @@ -127,14 +186,7 @@ this, unless explicitly specified. :func: create_parser_for_docs :prog: vllm serve ``` -## Tool Calling in the Chat Completion API -### Named Function Calling -vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is -enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a -high-quality one. -To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and -specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Config file @@ -163,12 +215,22 @@ The order of priorities is `command line > config file values > defaults`. --- ## Tool calling in the chat completion API -vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. +vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but on the roadmap. It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. +Please see below for recommended configuration and chat templates to use when function calling is to be used with the different models. + + +### Named Function Calling +vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. + ### Automatic Function Calling To enable this feature, you should set the following flags: @@ -242,6 +304,21 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` +#### IBM Granite + +Supported models: +* `ibm-granite/granite-3.0-8b-instruct` + +Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` + +`examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. 
+ +* `ibm-granite/granite-20b-functioncalling` + +Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + #### InternLM Models (`internlm`) @@ -264,16 +341,6 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` -#### IBM Granite (`granite-20b-fc`) - -Supported models: -* `ibm-granite/granite-20b-functioncalling` - -Flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` - -The example chat template deviates slightly from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - - ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst new file mode 100644 index 0000000000000..8ef96c4e54369 --- /dev/null +++ b/docs/source/serving/serving_with_llamastack.rst @@ -0,0 +1,42 @@ +.. _run_on_llamastack: + +Serving with Llama Stack +============================ + +vLLM is also available via `Llama Stack `_ . + +To install Llama Stack, run + +.. code-block:: console + + $ pip install llama-stack -q + +Inference using OpenAI Compatible API +------------------------------------- + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +.. code-block:: yaml + + inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 + +Please refer to `this guide `_ for more details on this remote vLLM provider. + +Inference via Embedded vLLM +--------------------------- + +An `inline vLLM provider +`_ +is also available. This is a sample of configuration using that method: + +.. 
code-block:: yaml + + inference + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 diff --git a/examples/fp8/quantizer/quantize.py b/examples/fp8/quantizer/quantize.py index 15f1a06b1219b..d75cc8b3d1cf7 100644 --- a/examples/fp8/quantizer/quantize.py +++ b/examples/fp8/quantizer/quantize.py @@ -230,7 +230,7 @@ def calibrate_loop(): def main(args): if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for inference.") + raise OSError("GPU is required for inference.") random.seed(RAND_SEED) np.random.seed(RAND_SEED) @@ -314,7 +314,7 @@ def main(args): # Workaround for wo quantization if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: - with open(f"{export_path}/config.json", 'r') as f: + with open(f"{export_path}/config.json") as f: tensorrt_llm_config = json.load(f) if args.qformat == "int8_wo": tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 37ec667d96a77..050b791b62adb 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - enforce_eager=True, - enable_chunked_prefill=False, - max_model_len=8192, - limit_mm_per_prompt={"audio": audio_count}) + llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 83d2548a506e4..11af6880e1b5a 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -176,6 +176,31 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + # InternVL def run_internvl(question: str, modality: str): assert modality == "image" @@ -262,10 +287,9 @@ def run_qwen2_vl(question: str, modality: str): model_name = "Qwen/Qwen2-VL-7B-Instruct" - # Tested on L40 llm = LLM( model=model_name, - max_model_len=8192, + max_model_len=4096, max_num_seqs=5, # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={ @@ -353,6 +377,31 @@ def run_glm4v(question: str, modality: str): return llm, prompt, stop_token_ids +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". 
+ # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + ) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -364,6 +413,7 @@ def run_glm4v(question: str, modality: str): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "h2ovl_chat": run_h2ovl, "internvl_chat": run_internvl, "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, @@ -372,6 +422,7 @@ def run_glm4v(question: str, modality: str): "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, + "idefics3": run_idefics3, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index e28514bf403f7..dc12df8d78211 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -107,6 +107,40 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"max_dynamic_patch": 4}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -256,13 +290,46 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + # if you are running out of memory, you can reduce the "longest_edge". 
+ # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 2 * 364 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + model_example_map = { "phi3_v": load_phi3v, + "h2ovl_chat": load_h2onvl, "internvl_chat": load_internvl, "NVLM_D": load_nvlm_d, "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, + "idefics3": load_idefics3, } diff --git a/examples/openai_api_client_for_multimodal.py b/examples/openai_chat_completion_client_for_multimodal.py similarity index 100% rename from examples/openai_api_client_for_multimodal.py rename to examples/openai_chat_completion_client_for_multimodal.py diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py new file mode 100644 index 0000000000000..fff82020d9a30 --- /dev/null +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -0,0 +1,120 @@ +import argparse +import base64 +import io + +import requests +from PIL import Image + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + +def vlm2vec(): + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }], + "encoding_format": + "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +def dse_qwen2_vl(inp: dict): + # Embedding an Image + if inp["dtype"] == "image": + messages = [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": inp["image_url"], + } + }, { + "type": "text", + "text": "What is shown in this image?" + }] + }] + # Embedding a Text Query + else: + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_placeholder}", + } + }, + { + "type": "text", + "text": f"Query: {inp['content']}" + }, + ] + }] + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "MrLight/dse-qwen2-2b-mrl-v1", + "messages": messages, + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Script to call a specified VLM through the API. 
Make sure to serve " + "the model with --task embedding before running this.") + parser.add_argument("model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.") + args = parser.parse_args() + + if args.model == "vlm2vec": + vlm2vec() + elif args.model == "dse_qwen2_vl": + dse_qwen2_vl({ + "dtye": "image", + "image_url": image_url, + }) + dse_qwen2_vl({ + "dtype": "text", + "content": "What is the weather like today?", + }) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index d1389f5392c8c..f76a61bb5eec3 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,33 +1,4 @@ { - "__inputs": [ - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.2" - }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -54,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 1, "links": [], "liveNow": false, "panels": [ @@ -76,6 +47,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -241,6 +213,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -358,6 +331,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -523,6 +497,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -658,6 +633,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -823,6 +799,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -984,7 +961,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1076,7 +1053,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1117,6 +1094,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -1147,8 +1125,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1199,6 +1176,319 @@ ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + 
"barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" } ], "refresh": "", @@ -1207,21 +1497,34 @@ "templating": { "list": [ { - "type": "datasource", - "name": "DS_PROMETHEUS", - "label": "datasource", - "current": {}, + "current": { + "selected": false, + "text": "prometheus", + "value": "edx8memhpd9tsa" + }, "hide": 0, "includeAll": false, + "label": "datasource", "multi": false, + "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false + "skipUrlSync": false, + "type": "datasource" }, { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, "definition": "label_values(model_name)", "hide": 0, "includeAll": false, @@ -1249,7 +1552,6 @@ "timezone": "", "title": "vLLM", "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", - "version": 1, + "version": 8, "weekStart": "" } - diff --git a/examples/run_cluster.sh b/examples/run_cluster.sh index 8e4aa59e1766d..7b4b40b4b7e23 100644 --- a/examples/run_cluster.sh +++ b/examples/run_cluster.sh @@ -14,7 +14,7 @@ PATH_TO_HF_HOME="$4" shift 4 # Additional arguments are passed directly to the Docker command -ADDITIONAL_ARGS="$@" +ADDITIONAL_ARGS=("$@") # Validate node type if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then @@ -45,5 +45,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - ${ADDITIONAL_ARGS} \ + "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 0000000000000..e7b93fae31770 --- /dev/null +++ b/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if 
message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/examples/template_vlm2vec.jinja b/examples/template_vlm2vec.jinja new file mode 100644 index 0000000000000..489b99604af38 --- /dev/null +++ b/examples/template_vlm2vec.jinja @@ -0,0 +1,16 @@ +{%- if messages | length > 1 -%} + {{ raise_exception('Embedding models should only embed one message at a time') }} +{%- endif -%} + +{% set vars = namespace(parts=[], next_image_id=1) %} +{%- for message in messages -%} + {%- for content in message['content'] -%} + {%- if content['type'] == 'text' -%} + {%- set vars.parts = vars.parts + [content['text']] %} + {%- elif content['type'] == 'image' -%} + {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} + {%- set vars.next_image_id = vars.next_image_id + 1 %} + {%- endif -%} + {%- endfor -%} +{%- endfor -%} +{{ vars.parts | join(' ') }} diff --git a/examples/tool_chat_template_granite.jinja b/examples/tool_chat_template_granite.jinja new file mode 100644 index 0000000000000..2cc19e77188dc --- /dev/null +++ b/examples/tool_chat_template_granite.jinja @@ -0,0 +1,40 @@ +{%- if tools %} + {{- '<|start_of_role|>available_tools<|end_of_role|> +' }} + {%- for tool in tools %} + {{- tool | tojson(indent=4) }} + {%- if not loop.last %} + {{- ' + +' }} + {%- endif %} + {%- endfor %} + {{- '<|end_of_text|> +' }} +{%- endif %} + +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'user' %} + {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} + {% for tc in message.tool_calls %} + {{- '<|tool_call|> ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant' %} + {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %} + {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- endif %} + {%- if loop.last and add_generation_prompt %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} + {%- endif %} +{%- endfor %} diff --git a/format.sh b/format.sh index be6ee0ce46dcb..d06ee62351a21 100755 --- a/format.sh +++ b/format.sh @@ -44,14 +44,14 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version 
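# example: tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)"
# prints a warning and exits 1 when the installed and required versions differ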
tool_version_check() { - if [[ $2 != $3 ]]; then + if [[ "$2" != "$3" ]]; then echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" @@ -294,6 +294,10 @@ echo 'vLLM actionlint:' tools/actionlint.sh -color echo 'vLLM actionlint: Done' +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + if ! git diff --quiet &>/dev/null; then echo echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" diff --git a/pyproject.toml b/pyproject.toml index e78f5652f486b..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.5.0", + "torch == 2.5.1", "wheel", "jinja2", ] @@ -34,7 +34,7 @@ select = [ # Pyflakes "F", # pyupgrade - # "UP", + "UP", # flake8-bugbear "B", # flake8-simplify @@ -55,14 +55,12 @@ ignore = [ ] [tool.mypy] -python_version = "3.8" - ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", -# move the directory here and remove it from format.sh and mypy.yaml +# move the directory here and remove it from tools/mypy.sh files = [ "vllm/*.py", "vllm/adapter_commons", @@ -70,7 +68,7 @@ files = [ "vllm/entrypoints", "vllm/core", "vllm/inputs", - "vllm/logging", + "vllm/logging_utils", "vllm/multimodal", "vllm/platforms", "vllm/transformers_utils", @@ -95,6 +93,9 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ "skip_global_cleanup", - "core_model: run this model test in each PR instead of just daily", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "skip_v1: do not run this test with v1", ] diff --git a/python_only_dev.py b/python_only_dev.py index 4ab203bb6f9d6..1ca0f5c30b741 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -69,7 +69,8 @@ current_vllm_path = os.path.join(cwd, "vllm") print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - os.rename(pre_built_vllm_path, tmp_path) + shutil.copytree(pre_built_vllm_path, tmp_path) + shutil.rmtree(pre_built_vllm_path) print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") os.symlink(current_vllm_path, pre_built_vllm_path) diff --git a/requirements-build.txt b/requirements-build.txt index 7b16d9778c1a6..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.5.0 +torch==2.5.1 wheel jinja2 diff --git a/requirements-common.txt b/requirements-common.txt index ef5ed8b645158..acb766d25a2d9 100644 --- a/requirements-common.txt +++ 
b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 27ca8ca5dbc58..749b03a0603d8 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,5 +2,5 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.4.0+cpu; platform_machine != "ppc64le" +torch == 2.5.1+cpu; platform_machine != "ppc64le" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 282ab11838bf4..058ab7c1ee9df 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package -torch == 2.5.0 +torch == 2.5.1 # These must be updated alongside torch -torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0 +torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 0000000000000..4674efb812cfd --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,11 @@ +# Common dependencies +-r requirements-common.txt + +# Dependencies for HPU code +ray +triton +pandas +tabulate +setuptools>=61 +setuptools-scm>=8 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 diff --git a/requirements-lint.txt b/requirements-lint.txt index 07f738873e1a8..f9132bbf96437 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,7 +1,7 @@ # formatting yapf==0.32.0 toml==0.10.2 -tomli==2.0.1 +tomli==2.0.2 ruff==0.6.5 codespell==2.3.0 isort==5.13.2 diff --git a/requirements-openvino.txt b/requirements-openvino.txt index 7ad0d1e7f704b..95e5914757812 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.5.0 # should be aligned with "common" vLLM torch version +torch == 2.5.1 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/requirements-test.in b/requirements-test.in index 3881f2566b556..76f6de2f77c34 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -8,30 +8,27 @@ pytest-shard # testing utils awscli +decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests -opencv-python # required for video tests peft -requests ray[adag]==2.35 
-sentence-transformers # required for embedding -soundfile # required for audio test +sentence-transformers # required for embedding tests +soundfile # required for audio tests timm # required for internvl test -torch==2.5.0 +torch==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test +mistral_common[opencv] >= 1.4.4 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test # TODO: Add this after fully implementing llava(mantis) # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test -# Benchmarking -aiohttp - # quantization bitsandbytes>=0.44.0 -buildkite-test-collector==0.1.8 +buildkite-test-collector==0.1.9 numpy < 2.0.0 diff --git a/requirements-test.txt b/requirements-test.txt index c474c2ec34b22..65695111e4dc5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile --output-file=requirements-test.txt requirements-test.in +# pip-compile requirements-test.in # absl-py==2.1.0 # via rouge-score @@ -14,7 +14,6 @@ aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.10 # via - # -r requirements-test.in # datasets # fsspec # lm-eval @@ -28,6 +27,10 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator +async-timeout==4.0.3 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -36,20 +39,20 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa -awscli==1.35.16 +awscli==1.35.23 # via -r requirements-test.in bitsandbytes==0.44.1 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator -boto3==1.35.50 +boto3==1.35.57 # via tensorizer -botocore==1.35.50 +botocore==1.35.57 # via # awscli # boto3 # s3transfer -buildkite-test-collector==0.1.8 +buildkite-test-collector==0.1.9 # via -r requirements-test.in certifi==2024.8.30 # via @@ -78,7 +81,7 @@ cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib -datamodel-code-generator==0.26.2 +datamodel-code-generator==0.26.3 # via -r requirements-test.in dataproperty==1.0.1 # via @@ -90,6 +93,8 @@ datasets==3.0.2 # lm-eval decorator==5.1.1 # via librosa +decord==0.6.0 + # via -r requirements-test.in dill==0.3.8 # via # datasets @@ -106,6 +111,10 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -156,6 +165,8 @@ idna==3.10 # httpx # requests # yarl +importlib-resources==6.4.5 + # via matplotlib inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 @@ -178,7 +189,9 @@ joblib==1.4.2 jsonlines==4.0.0 # via lm-eval jsonschema==4.23.0 - # via ray + # via + # mistral-common + # ray jsonschema-specifications==2024.10.1 # via jsonschema kiwisolver==1.4.7 @@ -204,6 +217,10 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mistral-common[opencv]==1.4.4 + # via + # -r requirements-test.in + # mistral-common more-itertools==10.5.0 # via lm-eval mpmath==1.3.0 @@ -238,12 +255,14 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets + # decord # evaluate # librosa # matplotlib + # mistral-common # numba # numexpr - # opencv-python + # opencv-python-headless # pandas # peft # rouge-score @@ -286,8 +305,8 @@ nvidia-nvjitlink-cu12==12.4.127 # torch nvidia-nvtx-cu12==12.4.127 # via torch 
-opencv-python==4.10.0.84 - # via -r requirements-test.in +opencv-python-headless==4.10.0.84 + # via mistral-common packaging==24.1 # via # accelerate @@ -317,9 +336,10 @@ peft==0.13.2 # via # -r requirements-test.in # lm-eval -pillow==11.0.0 +pillow==10.4.0 # via # matplotlib + # mistral-common # sentence-transformers # torchvision platformdirs==4.3.6 @@ -354,7 +374,9 @@ pybind11==2.13.6 pycparser==2.22 # via cffi pydantic[email]==2.9.2 - # via datamodel-code-generator + # via + # datamodel-code-generator + # mistral-common pydantic-core==2.23.4 # via pydantic pyparsing==3.2.0 @@ -414,19 +436,19 @@ regex==2024.9.11 # transformers requests==2.32.3 # via - # -r requirements-test.in # buildkite-test-collector # datasets # evaluate # huggingface-hub # lm-eval + # mistral-common # pooch # ray # tiktoken # transformers rouge-score==0.1.2 # via lm-eval -rpds-py==0.20.0 +rpds-py==0.20.1 # via # jsonschema # referencing @@ -456,6 +478,8 @@ scipy==1.13.1 # sentence-transformers sentence-transformers==3.2.1 # via -r requirements-test.in +sentencepiece==0.2.0 + # via mistral-common six==1.16.0 # via # python-dateutil @@ -486,13 +510,21 @@ tensorizer==2.9.0 # via -r requirements-test.in threadpoolctl==3.5.0 # via scikit-learn -tiktoken==0.8.0 - # via lm-eval +tiktoken==0.7.0 + # via + # lm-eval + # mistral-common timm==1.0.11 # via -r requirements-test.in -tokenizers==0.20.1 +tokenizers==0.20.3 # via transformers -torch==2.5.0 +toml==0.10.2 + # via datamodel-code-generator +tomli==2.0.2 + # via + # black + # pytest +torch==2.5.1 # via # -r requirements-test.in # accelerate @@ -503,7 +535,7 @@ torch==2.5.0 # tensorizer # timm # torchvision -torchvision==0.20.0 +torchvision==0.20.1 # via timm tqdm==4.66.6 # via @@ -535,8 +567,12 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # anyio + # black # huggingface-hub # librosa + # mistral-common + # multidict # pydantic # pydantic-core # torch @@ -552,8 +588,10 @@ xxhash==3.5.0 # via # datasets # evaluate -yarl==1.17.0 +yarl==1.17.1 # via aiohttp +zipp==3.20.2 + # via importlib-resources zstandard==0.23.0 # via lm-eval diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 4c606cf0a9105..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,6 +2,22 @@ -r requirements-common.txt # Dependencies for TPU -# Currently, the TPU backend uses a nightly version of PyTorch XLA. -# You can install the dependencies in Dockerfile.tpu. 
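+# Note: the --pre/--extra-index-url/--find-links options below are read by pip
+# itself, so a plain `pip install -r requirements-tpu.txt` resolves the nightly wheels.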
+cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 ray[default] + +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-releases/index.html +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +torch==2.6.0.dev20241028+cpu +torchvision==0.20.0.dev20241028+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241028-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.32.dev20240829 +jax==0.4.32.dev20240829 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index eb76a33dab5c2..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -8,9 +8,9 @@ packaging setuptools-scm>=8 wheel jinja2 -# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -torch == 2.3.1+cxx11.abi -intel-extension-for-pytorch == 2.3.110+xpu -oneccl_bind_pt == 2.3.100+xpu + +torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl triton-xpu == 3.0.0b1 diff --git a/setup.py b/setup.py index 281acdc1f6edf..2aaf6325cf17a 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ import importlib.util -import io import logging import os import re @@ -56,12 +55,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def remove_prefix(text, prefix): - if text.startswith(prefix): - return text[len(prefix):] - return text - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: @@ -198,8 +191,10 @@ def build_extensions(self) -> None: os.makedirs(self.build_temp) targets = [] - target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."), - "vllm_flash_attn.") + + def target_name(s: str) -> str: + return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.") + # Build all the extensions for ext in self.extensions: self.configure(ext) @@ -254,6 +249,24 @@ def run(self): self.copy_file(file, dst_file) +def _is_hpu() -> bool: + is_hpu_available = True + try: + subprocess.run(["hl-smi"], capture_output=True, check=True) + except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists( + '/dev/accel/accel_controlD0'): + # last resort... 
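+            # (i.e. check via lsmod whether the habanalabs kernel driver is loaded)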
+ try: + output = subprocess.check_output( + 'lsmod | grep habanalabs | wc -l', shell=True) + is_hpu_available = int(output) > 0 + except (ValueError, FileNotFoundError, PermissionError, + subprocess.CalledProcessError): + is_hpu_available = False + return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" + + def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -261,7 +274,7 @@ def _no_device() -> bool: def _is_cuda() -> bool: has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda - and not (_is_neuron() or _is_tpu())) + and not (_is_neuron() or _is_tpu() or _is_hpu())) def _is_hip() -> bool: @@ -327,7 +340,7 @@ def get_neuronxcc_version(): "__init__.py") # Check if the command was executed successfully - with open(version_file, "rt") as fp: + with open(version_file) as fp: content = fp.read() # Extract the version using a regular expression @@ -357,6 +370,22 @@ def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) +def get_gaudi_sw_version(): + """ + Returns the driver version. + """ + # Enable console printing for `hl-smi` check + output = subprocess.run("hl-smi", + shell=True, + text=True, + capture_output=True, + env={"ENABLE_CONSOLE": "true"}) + if output.returncode == 0 and output.stdout: + return output.stdout.split("\n")[2].replace( + " ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available + + def get_vllm_version() -> str: version = get_version( write_to="vllm/_version.py", # TODO: move this to pyproject.toml @@ -386,6 +415,12 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"{sep}neuron{neuron_version_str}" + elif _is_hpu(): + # Get the Intel Gaudi Software Suite version + gaudi_sw_version = str(get_gaudi_sw_version()) + if gaudi_sw_version != MAIN_CUDA_VERSION: + gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] + version += f"{sep}gaudi{gaudi_sw_version}" elif _is_openvino(): version += f"{sep}openvino" elif _is_tpu(): @@ -404,7 +439,8 @@ def read_readme() -> str: """Read the README file if present.""" p = get_path("README.md") if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() else: return "" @@ -443,6 +479,8 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-rocm.txt") elif _is_neuron(): requirements = _read_requirements("requirements-neuron.txt") + elif _is_hpu(): + requirements = _read_requirements("requirements-hpu.txt") elif _is_openvino(): requirements = _read_requirements("requirements-openvino.txt") elif _is_tpu(): @@ -458,7 +496,7 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCm, Neuron, " + "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " "OpenVINO, or CPU.") return requirements @@ -503,7 +541,6 @@ def _read_requirements(filename: str) -> List[str]: "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -517,12 +554,13 @@ def _read_requirements(filename: str) -> List[str]: ], packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", "tests*")), - 
python_requires=">=3.8", + python_requires=">=3.9", install_requires=get_requirements(), ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], - "audio": ["librosa", "soundfile"] # Required for audio processing + "audio": ["librosa", "soundfile"], # Required for audio processing + "video": ["decord"] # Required for video processing }, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, package_data=package_data, diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 79647589d5204..7f16baa65a644 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -156,3 +156,29 @@ def test_model_with_failure(vllm_runner) -> None: ModelInputForGPUWithSamplingMetadata) finally: os.remove(filename) + + +def test_failure_with_async_out_proc(vllm_runner) -> None: + + filename = None + try: + with vllm_runner("facebook/opt-125m", + dtype="half", + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model,\ + patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", + side_effect=ValueError()): + model_config = vllm_model.model.llm_engine.model_config + assert model_config.use_async_output_proc + with pytest.raises(ValueError) as exc_info: + vllm_model.generate_greedy('how to make pizza?', 250) + matches = re.search(r"input dumped to (.+).pkl", + str(exc_info.value)) + assert matches is not None + + filename = f"{matches.group(1)}.pkl" + finally: + # Clean up + if filename is not None: + os.remove(filename) + pass diff --git a/tests/compile/backend.py b/tests/compile/backend.py new file mode 100644 index 0000000000000..9d5c68274374e --- /dev/null +++ b/tests/compile/backend.py @@ -0,0 +1,33 @@ +from copy import deepcopy +from typing import Callable + +import torch + + +class TestBackend: + """ + This class provides a simple Inductor backend that can be used for testing. + It takes a list of custom passes and runs them after Inductor's passes. + It also saves the graph before and after the custom passes for inspection. 
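+
+    Typical use (sketch; ``model``, ``x`` and ``my_pass`` are placeholders,
+    mirroring tests/compile/test_fusion.py below):
+
+        backend = TestBackend(my_pass)
+        compiled = torch.compile(model, backend=backend)
+        compiled(x)  # first call compiles and populates graph_pre_pass / graph_post_pass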
+ """ + + def __init__(self, *args: Callable[[torch.fx.Graph], None]): + self.custom_passes = args + from torch._inductor import config + self.current_config = config.shallow_copy_dict() + self.current_config['post_grad_custom_post_pass'] = self.post_pass + + def __call__(self, graph: torch.fx.GraphModule, example_inputs): + from torch._inductor.compile_fx import compile_fx + return compile_fx(graph, + example_inputs, + config_patches=self.current_config) + + def post_pass(self, graph: torch.fx.Graph): + self.graph_pre_pass = deepcopy(graph) + for pass_ in self.custom_passes: + pass_(graph) + + self.graph_post_pass = deepcopy(graph) + # assign by reference, will reflect the final state of the graph + self.final_graph = graph diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json index 03d077b76f627..798a34e8dd92d 100644 --- a/tests/compile/piecewise/piecewise_compilation_config.json +++ b/tests/compile/piecewise/piecewise_compilation_config.json @@ -1,4 +1,5 @@ { "use_cudagraph": true, - "non_cudagraph_ops": ["silly.attention"] + "non_cudagraph_ops": ["silly.attention"], + "cudagraph_copy_inputs": true } \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index a34d33efba1d8..c631850ecdedb 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -6,18 +6,21 @@ import torch from torch import nn +from torch.library import Library from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel - -os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) +from vllm.config import VllmConfig +from vllm.utils import direct_register_custom_op global_counter = 0 +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + -@torch.library.custom_op("silly::attention", mutates_args=["out"]) def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor) -> None: global global_counter @@ -27,16 +30,28 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out[0] += 1 -@silly_attention.register_fake -def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: return +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + @support_torch_compile class SillyModel(nn.Module): - def __init__(self) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -62,13 +77,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_simple_piecewise_compile(): - model = SillyModel() - directory = os.path.dirname(__file__) config = os.path.join(directory, "piecewise_compilation_config.json") os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + + model = SillyModel(vllm_config=VllmConfig(), prefix='') - input_buffer = torch.randn(100).cuda() + inputs = torch.randn(100).cuda() with compilation_counter.expect( num_graphs_seen=1, # one graph 
for the model @@ -80,15 +96,15 @@ def test_simple_piecewise_compile(): ): with set_compile_context([1, 2]): - model(input_buffer) + model(inputs) - model(input_buffer[:2]) - model(input_buffer[:1]) + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) - input_buffer[:2].zero_() + input = torch.zeros(2).cuda() global global_counter global_counter = 0 - output = model(input_buffer[:2]) + output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index db6a983d70feb..c363a587a818e 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -1,6 +1,10 @@ """ Test the piecewise compilation with a simple model, comparing the output with and without the piecewise compilation. + +This is a tractable model, the weights and computation are specially designed +if the config `tractable_init` is set to True. Otherwise, the weights are +initialized randomly with a fixed seed. """ import os from dataclasses import dataclass @@ -8,16 +12,21 @@ import torch from torch import nn +from torch.library import Library from vllm.compilation.compile_context import set_compile_context from vllm.compilation.config import CompilationConfig from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.plugins import set_compilation_config +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa -@torch.library.custom_op("silly::attention", mutates_args=["out"]) def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor) -> None: out.copy_(q) @@ -25,18 +34,32 @@ def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out += v -@silly_attention.register_fake -def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, - out: torch.Tensor) -> None: +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: return +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + @dataclass class LlamaConfig: hidden_size: int = 128 mlp_size: int = 256 vocab_size: int = 128 num_layers: int = 2 + init_value: float = 1.0 + tractable_init: bool = False + random_seed: int = 0 + + def __post_init__(self): + assert self.mlp_size >= self.hidden_size class LlamaMLP(nn.Module): @@ -54,10 +77,23 @@ def __init__(self, config: LlamaConfig) -> None: bias=False, ) - self.gate_up_projection.weight.data.fill_(0.0) - self.down_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.down_projection.weight.data) + else: + nn.init.xavier_normal_(self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.down_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward(self, x): + # for tractable_init and positive input, this is + # essentially an elementwise-square x = self.gate_up_projection(x) x = x[:, :x.size(1) 
// 2] * torch.nn.functional.relu( x[:, x.size(1) // 2:]) @@ -72,21 +108,39 @@ def __init__(self, config: LlamaConfig) -> None: self.qkv_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size * 3, + bias=False, ) self.output_projection = nn.Linear( in_features=config.hidden_size, out_features=config.hidden_size, + bias=False, ) - self.qkv_projection.weight.data.fill_(0.0) - self.output_projection.weight.data.fill_(0.0) + if config.tractable_init: + nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * + config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[2 * + config.hidden_size:]) + nn.init.eye_(self.output_projection.weight.data) + else: + nn.init.xavier_normal_(self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.output_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, ) -> torch.Tensor: + # for tractable_init, this is: + # output = (hidden_states * 3 + positions * 2) qkv = self.qkv_projection(hidden_states) hidden_size = qkv.size(-1) // 3 q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) @@ -114,28 +168,43 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + For tractable computation: + - if residual is None, the outputs are: + - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + - if residual is not None, the outputs are: + - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + """ # noqa if residual is None: residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 else: hidden_states = hidden_states + residual residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.self_attention(positions=positions, hidden_states=hidden_states) hidden_states = hidden_states + residual residual = hidden_states - hidden_states = hidden_states / 2 + hidden_states = hidden_states + 1 hidden_states = self.mlp(hidden_states) return hidden_states, residual +@support_torch_compile class LlamaModel(nn.Module): - def __init__(self, config: LlamaConfig) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + config: LlamaConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() self.embedding_tokens = nn.Embedding( num_embeddings=config.vocab_size, @@ -144,7 +213,8 @@ def __init__(self, config: LlamaConfig) -> None: self.layers = nn.ModuleList( [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) - self.embedding_tokens.weight.data.fill_(0.0) + # this is the initial value of the hidden states + self.embedding_tokens.weight.data.fill_(config.init_value) def forward( self, @@ -158,6 +228,28 @@ def forward( return hidden_states +def tractable_computation(input_ids: torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0) -> torch.Tensor: + hidden_states = torch.ones(input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype) * init_value + + # first layer + 
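+    # with tractable_init the attention block computes hs * 3 + pos * 2 and the
+    # MLP squares its input (see the LlamaDecoderLayer docstrings above), which
+    # collapses to the closed form below: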
residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + # following layers + for _ in range(config.num_layers - 1): + hidden_states = hidden_states + residual + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + return hidden_states + + @torch.inference_mode def run_model(llama_config, use_compile: bool, @@ -180,10 +272,9 @@ def run_model(llama_config, CompilationLevel.NO_COMPILATION) set_compilation_config(None) - cls = LlamaModel - if use_compile: - cls = support_torch_compile(LlamaModel) - model = cls(llama_config).eval().cuda() + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda() B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -201,7 +292,15 @@ def run_model(llama_config, del os.environ["VLLM_TORCH_COMPILE_LEVEL"] set_compilation_config(None) - return output.cpu() + output = output.cpu() + + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], positions[:2], + llama_config).cpu() + + assert torch.allclose(output, expected_output) + else: + return output.cpu() def test_toy_llama(): @@ -210,7 +309,13 @@ def test_toy_llama(): llama_config = LlamaConfig(hidden_size=128, mlp_size=256, vocab_size=128, - num_layers=2) + num_layers=12) + + tractable_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2, + tractable_init=True) outputs = [] with compilation_counter.expect( @@ -221,6 +326,8 @@ def test_toy_llama(): num_cudagraph_caputured=0, ): outputs.append(run_model(llama_config, use_compile=False)) + run_model(tractable_config, use_compile=False) + with compilation_counter.expect( num_graphs_seen=1, # one graph for the model num_piecewise_graphs_seen=1, @@ -230,6 +337,7 @@ def test_toy_llama(): 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen ): outputs.append(run_model(llama_config, use_compile=True)) + run_model(tractable_config, use_compile=True) with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -245,6 +353,7 @@ def test_toy_llama(): ): outputs.append( run_model(llama_config, use_compile=True, split_attn=True)) + run_model(tractable_config, use_compile=True, split_attn=True) for i in range(1, len(outputs)): assert torch.allclose(outputs[0], outputs[i]) @@ -254,7 +363,6 @@ def test_toy_llama(): def benchmark(): os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) from triton.testing import do_bench - cls = support_torch_compile(LlamaModel) # similar to llama 3.1-8B llama_config = LlamaConfig(hidden_size=4096, @@ -287,7 +395,9 @@ def benchmark(): else: set_compilation_config(None) - model = cls(llama_config).eval().cuda().to(torch.bfloat16) + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda().to(torch.bfloat16) B = 256 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -326,8 +436,8 @@ def benchmark(): # print in tabular format print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") for b in cudagraph_sizes: - print((f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" - f"\t{piecewise_cudagraph_time[b]:.3f}")) + print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}") if __name__ == "__main__": diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 2f92ff73845f5..833589ba5dc9f 100644 
--- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -1,3 +1,4 @@ +import dataclasses from typing import Dict, List, Optional import pytest @@ -8,33 +9,109 @@ from ..utils import compare_all_settings +@dataclasses.dataclass +class TestSetting: + model: str + model_args: List[str] + pp_size: int + tp_size: int + attn_backend: str + method: str + fullgraph: bool + + +# representative settings for testing +test_settings = [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="FLASHINFER", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), +] + + # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize( - "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", - [ - ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", - ["--quantization", "compressed-tensors" - ], 1, 1, "FLASH_ATTN", "generate", True), - ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True), - # TODO: add multi-modality test for llava - ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) - ]) -def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, - method, fullgraph): +@pytest.mark.parametrize("test_setting", test_settings) +def test_compile_correctness(test_setting: TestSetting): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. + model = test_setting.model + model_args = test_setting.model_args + pp_size = test_setting.pp_size + tp_size = test_setting.tp_size + attn_backend = test_setting.attn_backend + method = test_setting.method + fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") import os os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - all_args = [["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + - ["-tp", str(tp_size)]] * 3 - # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case - # inductor will change the output, so we cannot compare them. 
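+    # build the CLI args once; the same args are reused for every compilation
+    # level compared below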
+ final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ + ["-tp", str(tp_size)] + all_envs: List[Optional[Dict[str, str]]] = [] + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_envs.append({"VLLM_TORCH_COMPILE_LEVEL": str(level)}) + + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, [final_args] * 2, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + for level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS, @@ -46,4 +123,4 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, all_envs[-1][ "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore - compare_all_settings(model, all_args, all_envs, method=method) + compare_all_settings(model, [final_args] * 3, all_envs, method=method) diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py new file mode 100644 index 0000000000000..e4d3defafb951 --- /dev/null +++ b/tests/compile/test_fusion.py @@ -0,0 +1,92 @@ +import pytest +import torch +from compressed_tensors.quantization import FP8_DTYPE + +import vllm.envs as envs +from vllm.compilation.config import CompilationConfig +from vllm.compilation.fusion import (FusionPass, find_auto_fn, + find_auto_fn_maybe) +from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear) + +from .backend import TestBackend + + +class TestModel(torch.nn.Module): + + def __init__(self, hidden_size: int, eps: float, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)] + self.w = [ + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + for _ in range(2) + ] + + def forward(self, x): + resid = torch.relu(x) + y = self.norm[0](x) + + x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1]) + # make sure resid is used for replacement to work + y2, resid = self.norm[1](x2, resid) + + x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3]) + y3, resid = self.norm[2](x3, resid) # use resid here + return y3 + + +# Init does pattern registration, which can only happen once +config = CompilationConfig(enable_fusion=True) +reshape_pass = RedundantReshapesPass(config) +fusion_pass = FusionPass.instance(config) + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) +@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) +@pytest.mark.parametrize("eps", [1e-5, 1e-6]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): + torch.set_default_device("cuda") + torch.set_default_dtype(torch.float16) + + if eps != 1e-5: + pytest.skip("Only test eps=1e-5 for now") + + # Reshape pass is needed for the fusion pass to work + backend = TestBackend(reshape_pass, fusion_pass) + model = TestModel(hidden_size, eps) + + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + # Check that it gives the same answer + torch.testing.assert_close(result, 
result2, atol=1e-3, rtol=1e-3) + + # Check substitution worked + pre_nodes = backend.graph_pre_pass.nodes + post_nodes = backend.graph_post_pass.nodes + + rms_quant = torch.ops._C.rms_norm_static_fp8_quant.default + add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + fp8_quant = torch.ops._C.static_scaled_fp8_quant.default + + # In pre-nodes, fp8 quant should be present and fused kernels should not + assert find_auto_fn_maybe(pre_nodes, rms_quant) is None + assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None + find_auto_fn(pre_nodes, fp8_quant) + + # In post-nodes, fused kernels should be present and fp8 quant should not + find_auto_fn(post_nodes, rms_quant) + find_auto_fn(post_nodes, add_rms_quant) + assert find_auto_fn_maybe(post_nodes, fp8_quant) is None diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 95cad19126df6..222c63a342a4b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -9,29 +9,26 @@ TEST_MODELS = [ ("facebook/opt-125m", {}), - # TODO: add fake implementation for compressed-tensors - # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - # "dtype": torch.float16, - # "quantization": "compressed-tensors" - # }), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", { "dtype": torch.float16, "quantization": "fp8" }), - # TODO: add fake implementation for compressed-tensors - # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - # "quantization": "compressed-tensors" - # }), + ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { + "quantization": "compressed-tensors" + }), ("meta-llama/Meta-Llama-3-8B", {}), ] -# TODO: enable in pytorch 2.5 -if False and is_quant_method_supported("aqlm"): # noqa: SIM223 +if is_quant_method_supported("aqlm"): TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { "quantization": "aqlm" })) -# TODO: enable in pytorch 2.5 +# TODO: figure out why this fails. if False and is_quant_method_supported("gguf"): # noqa: SIM223 TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { "quantization": "gguf" @@ -71,13 +68,13 @@ def check_full_graph_support(model, os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - # Inductor doesn't support fp8 and the base meta llama uses too - # much memory. - quantization = model_kwargs.get("quantization") - if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B") + # The base meta llama uses too much memory. 
+ if (model == "meta-llama/Meta-Llama-3-8B" and optimization_level >= CompilationLevel.PIECEWISE): return + print(f"MODEL={model}") + prompts = [ "Hello, my name is", "The president of the United States is", diff --git a/tests/conftest.py b/tests/conftest.py index bdc6ffb148602..0dc1cc6e83c18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,11 @@ import json import os -import sys import tempfile from collections import UserList from enum import Enum from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, TypedDict, TypeVar, Union) +from unittest.mock import patch import numpy as np import pytest @@ -52,7 +52,7 @@ def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: + with open(filename) as f: prompts = f.readlines() return prompts @@ -62,14 +62,8 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _ImageAssetsBase(UserList): - pass -else: - - class _ImageAssetsBase(UserList[ImageAsset]): - pass +class _ImageAssetsBase(UserList[ImageAsset]): + pass class _ImageAssets(_ImageAssetsBase): @@ -94,14 +88,8 @@ class _VideoAssetPrompts(TypedDict): sample_demo_1: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _VideoAssetsBase(UserList): - pass -else: - - class _VideoAssetsBase(UserList[VideoAsset]): - pass +class _VideoAssetsBase(UserList[VideoAsset]): + pass class _VideoAssets(_VideoAssetsBase): @@ -121,6 +109,23 @@ def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: """Singleton instance of :class:`_VideoAssets`.""" +@pytest.fixture(params=[True, False]) +def run_with_both_engines(request): + # Automatically runs tests twice, once with V1 and once without + use_v1 = request.param + # Tests decorated with `@skip_v1` are only run without v1 + skip_v1 = request.node.get_closest_marker("skip_v1") + + if use_v1: + if skip_v1: + pytest.skip("Skipping test on vllm V1") + with patch('vllm.envs.VLLM_USE_V1', True): + yield + else: + with patch('vllm.envs.VLLM_USE_V1', False): + yield + + @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test @@ -238,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" @@ -958,7 +966,7 @@ def dummy_opt_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: @@ -977,7 +985,7 @@ def dummy_llava_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: @@ -996,7 +1004,7 @@ def dummy_gemma2_embedding_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyGemma2Embedding"] with open(json_path, "w") as f: diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 1a6e17ef7b445..d325b9606843e 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -99,13 +99,11 @@ def 
test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = [ - TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2) - ] + first_chain, second_chain = (TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks) + for _ in range(2)) for first_chain_block, second_chain_block in zip( first_chain, second_chain): diff --git a/tests/core/utils.py b/tests/core/utils.py index a95a573db7cd3..cd0caa4704e11 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,6 +4,7 @@ from typing import Tuple from vllm import SamplingParams +from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, Sequence, SequenceGroup @@ -27,10 +28,7 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), - inputs={ - "prompt": prompt_str, - "prompt_token_ids": prompt_tokens, - }, + inputs=token_inputs(prompt_tokens, prompt=prompt_str), block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], @@ -63,23 +61,21 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - inputs = { - "prompt": decoder_prompt_str, - "prompt_token_ids": decoder_prompt_tokens, - "encoder_prompt": encoder_prompt_str, - "encoder_prompt_token_ids": encoder_prompt_tokens, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), } decoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=True) + inputs=inputs["decoder"], + block_size=block_size) encoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=False) + inputs=inputs["encoder"], + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams(best_of=best_of), @@ -108,7 +104,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={"prompt_token_ids": prompt_token_ids}, + inputs=token_inputs(prompt_token_ids), block_size=16, ) @@ -143,21 +139,19 @@ def create_seq_group_encoder_decoder( prompt_token_ids = [0] * seq_prompt_len - inputs = { - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "encoder_prompt": "", - "encoder_prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), } seqs = [] for seq_id_offset, output_len in enumerate(seq_output_lens): # Construct decoder input sequences - seq = Sequence(seq_id=seq_id_start + seq_id_offset, - inputs=inputs, - block_size=16, - from_decoder_prompt=True) + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) for i in range(output_len): seq.append_token_id( @@ -167,10 +161,11 @@ def create_seq_group_encoder_decoder( seqs.append(seq) 
# Encoder input sequence - encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs, - block_size=16, - from_decoder_prompt=False) + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) return SequenceGroup(request_id=request_id, seqs=seqs, diff --git a/tests/distributed/test_ca_buffer_sharing.py b/tests/distributed/test_ca_buffer_sharing.py new file mode 100644 index 0000000000000..fc4043cd3014e --- /dev/null +++ b/tests/distributed/test_ca_buffer_sharing.py @@ -0,0 +1,59 @@ +# can only run on machines with p2p access across GPUs +# can only run with torchrun: +# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py + +import ctypes + +import torch +import torch.distributed as dist + +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary +from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa + CustomAllreduce) + +# create a cpu process group for communicating metadata (ipc handle) +dist.init_process_group(backend="gloo") +rank = local_rank = dist.get_rank() +world_size = dist.get_world_size() + +# every process sets its own device (differently) +lib = CudaRTLibrary() +lib.cudaSetDevice(rank) + +buffer_size_in_bytes = 1024 +byte_value = 2 # the value we write to the buffer for verification + +pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes) + +print(f"Rank {rank} has pointers {pointers}") + +dist.barrier() +torch.cuda.synchronize() + +if rank == 0: + # the first rank tries to write to all buffers + for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes) + +dist.barrier() +torch.cuda.synchronize() + +host_data = (ctypes.c_char * buffer_size_in_bytes)() + +# all ranks read from all buffers, and check if the data is correct +for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes) + for i in range(buffer_size_in_bytes): + assert ord(host_data[i]) == byte_value, ( + f"Rank {rank} failed" + f" to verify buffer {p}. 
Expected {byte_value}, " + f"got {ord(host_data[i])}") + +print(f"Rank {rank} verified all buffers") + +dist.barrier() +torch.cuda.synchronize() + +CustomAllreduce.free_shared_buffer(pointers) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 95435e753058a..86ca1948ef94a 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -95,13 +95,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index ed6360f9d6148..5d566f8308b70 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -32,6 +32,8 @@ class PPTestOptions(NamedTuple): multi_node_only: bool trust_remote_code: bool tokenizer_mode: Optional[str] + load_format: Optional[str] = None + hf_overrides: Optional[str] = None @dataclass @@ -50,6 +52,8 @@ def detailed( task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -78,7 +82,9 @@ def detailed( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) @staticmethod @@ -90,6 +96,8 @@ def fast( multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -102,7 +110,9 @@ def fast( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) def iter_params(self, model_name: str): @@ -161,12 +171,11 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), - "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), 
+ "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), @@ -214,9 +223,9 @@ def iter_params(self, model_name: str): # NOTE: You can update this on your local machine to run specific tests TEST_MODELS = [ # [LANGUAGE GENERATION] + "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", - "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -238,7 +247,8 @@ def _compare_tp( method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup - multi_node_only, trust_remote_code, tokenizer_mode = test_options + multi_node_only, trust_remote_code, tokenizer_mode, \ + load_format, hf_overrides = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -267,6 +277,10 @@ def _compare_tp( common_args.append("--trust-remote-code") if tokenizer_mode: common_args.extend(["--tokenizer-mode", tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", hf_overrides]) if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 and chunked_prefill): diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index a51a9909f6f41..50444d3abfaf2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,9 +1,17 @@ +import socket + +import pytest import ray +import torch import vllm.envs as envs -from vllm.utils import (cuda_device_count_stateless, +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.utils import StatelessProcessGroup +from vllm.utils import (cuda_device_count_stateless, get_open_port, update_environment_variables) +from ..utils import multi_gpu_test + @ray.remote class _CUDADeviceCountStatelessTestActor: @@ -24,10 +32,111 @@ def test_cuda_device_count_stateless(): CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() - assert sorted(ray.get( - actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] + assert len( + sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(","))) == 2 assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 ray.get(actor.set_cuda_visible_devices.remote("")) assert ray.get(actor.get_count.remote()) == 0 + + +def cpu_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + if rank <= 2: + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) + data = torch.tensor([rank]) + data = pg1.broadcast_obj(data, src=2) + assert data.item() == 2 + if rank <= 2: + data = torch.tensor([rank + 1]) + data = pg2.broadcast_obj(data, src=2) + assert data.item() == 3 + pg2.barrier() + pg1.barrier() + + +def gpu_worker(rank, WORLD_SIZE, port1, port2): + torch.cuda.set_device(rank) + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + pynccl1 = PyNcclCommunicator(pg1, device=rank) + pynccl1.disabled = False + if rank <= 2: + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + 
rank=rank, + world_size=3) + pynccl2 = PyNcclCommunicator(pg2, device=rank) + pynccl2.disabled = False + data = torch.tensor([rank]).cuda() + pynccl1.all_reduce(data) + pg1.barrier() + torch.cuda.synchronize() + if rank <= 2: + pynccl2.all_reduce(data) + pg2.barrier() + torch.cuda.synchronize() + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +def broadcast_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + if rank == 2: + pg1.broadcast_obj("secret", src=2) + else: + obj = pg1.broadcast_obj(None, src=2) + assert obj == "secret" + pg1.barrier() + + +def allgather_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + data = pg1.all_gather_obj(rank) + assert data == list(range(WORLD_SIZE)) + pg1.barrier() + + +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize( + "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) +def test_stateless_process_group(worker): + port1 = get_open_port() + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port1)) + port2 = get_open_port() + WORLD_SIZE = 4 + from multiprocessing import get_context + ctx = get_context("fork") + processes = [] + for i in range(WORLD_SIZE): + rank = i + processes.append( + ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))) + for p in processes: + p.start() + for p in processes: + p.join() + for p in processes: + assert not p.exitcode + print("All processes finished.") diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index bef0c515b9073..fa5d6a69a9bc8 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,12 +7,18 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close +LIST_ENC_DEC_SUPPORTED_BACKENDS = [ + _Backend.XFORMERS, _Backend.FLASH_ATTN, None +] + def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -28,8 +34,16 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @@ -48,6 +62,7 @@ def test_encoder_decoder_e2e( num_logprobs: int, decoder_prompt_type: DecoderPromptType, enforce_eager: bool, + attn_backend: _Backend, ) -> None: ''' End-to-End (E2E) test for the encoder-decoder framework. @@ -56,43 +71,49 @@ def test_encoder_decoder_e2e( implementations to ensure that both implementations produce consistent and correct results. 
''' - test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) + + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index 0d84443c51f99..cc14e8cbf75df 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -4,6 +4,7 @@ from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus @@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={"prompt_token_ids": []}, + inputs=token_inputs([]), block_size=16, eos_token_id=eos_token_id, ) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index f7dc167fea6e4..e92e2588d01cb 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -30,6 +30,13 @@ def test_limit_mm_per_prompt_parser(arg, expected): assert args.limit_mm_per_prompt == expected +def test_valid_pooling_config(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = 
parser.parse_args(["--pooling-type=MEAN"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.pooling_type == 'MEAN' + + @pytest.mark.parametrize( ("arg"), [ diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py new file mode 100644 index 0000000000000..6bf7190a656b8 --- /dev/null +++ b/tests/entrypoints/llm/test_accuracy.py @@ -0,0 +1,56 @@ +""" +This file test accuracy of the vLLM server via LMEval. +It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real work usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. +""" + +import lm_eval +import pytest + +from vllm.platforms import current_platform + +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +EXPECTED_VALUE = 0.58 + + +def run_test(): + """Run the end to end accuracy test.""" + + model_args = f"pretrained={MODEL_NAME},max_model_len=2048" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 is currently only supported on CUDA.") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test() + + +def test_lm_eval_accuracy_v0_engine(monkeypatch): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test() diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 675a980ab3f3f..ee7010a238114 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -3,12 +3,21 @@ from vllm import LLM +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def test_empty_prompt(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) +@pytest.mark.skip_v1 def test_out_of_vocab_token(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='out of vocabulary'): diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index 63beaaba29a80..b1d4461d164aa 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -10,6 +10,8 @@ import lm_eval import pytest +from vllm.platforms import current_platform + from ...utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" @@ -18,22 +20,33 @@ FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUE = 0.58 -DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"] MORE_ARGS_LIST = [ + [], # Default ["--enable-chunked-prefill"], # Chunked ["--num-scheduler-steps", "8"], # MS ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] +MAX_WAIT_SECONDS = None +if 
current_platform.is_tpu(): + MORE_ARGS_LIST = [ + [], # Default + # ["--num-scheduler-steps", "8"], # Multi-step << currently fails + ] + MAX_WAIT_SECONDS = 600 + + +def run_test(more_args): + """Run the end to end accuracy test.""" -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy(more_args): args = list(DEFAULT_ARGS) args.extend(more_args) - print(f"Running with: {args}") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, args, + max_wait_seconds=MAX_WAIT_SECONDS) as remote_server: url = f"{remote_server.url_for('v1')}/completions" model_args = ( @@ -51,3 +64,22 @@ def test_lm_eval_accuracy(more_args): assert (measured_value - RTOL < EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 currently only supported on CUDA") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test([]) + + +@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) +def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test(more_args) diff --git a/tests/entrypoints/openai/test_basic.py b/tests/entrypoints/openai/test_basic.py index d3aea533b6db9..4616f363cc04a 100644 --- a/tests/entrypoints/openai/test_basic.py +++ b/tests/entrypoints/openai/test_basic.py @@ -1,7 +1,6 @@ from http import HTTPStatus from typing import List -import openai import pytest import pytest_asyncio import requests @@ -83,10 +82,8 @@ async def client(server): indirect=True, ) @pytest.mark.asyncio -async def test_show_version(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/version") +async def test_show_version(server: RemoteOpenAIServer): + response = requests.get(server.url_for("version")) response.raise_for_status() assert response.json() == {"version": VLLM_VERSION} @@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI): indirect=True, ) @pytest.mark.asyncio -async def test_check_health(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/health") +async def test_check_health(server: RemoteOpenAIServer): + response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index f119c6c1201c9..9f2b77dde2a7f 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -4,14 +4,18 @@ import openai import pytest import pytest_asyncio +import requests + +from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @pytest.fixture(scope="module") -def embedding_server(): +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -19,31 +23,29 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, ] - 
with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture -async def embedding_client(embedding_server): - async with embedding_server.get_async_client() as async_client: +async def client(server): + async with server.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." 
] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + chat_response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }) + chat_response.raise_for_status() + chat_embeddings = chat_response.json() + + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + completion_response = await client.embeddings.create( + model=model_name, + input=prompt, + encoding_format="float", + # To be consistent with chat + extra_body={"add_special_tokens": False}, + ) + completion_embeddings = completion_response.model_dump(mode="json") + + assert chat_embeddings.pop("id") is not None + assert completion_embeddings.pop("id") is not None + assert chat_embeddings.pop("created") <= completion_embeddings.pop( + "created") + assert chat_embeddings == completion_embeddings + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="float") + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="float") - responses_base64 = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="base64") + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="base64") decoded_responses_base64_data = [] for data in responses_base64.data: @@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, 1] # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await embedding_client.embeddings.create( - input=input_texts, model=model_name) 
+ responses_default = await client.embeddings.create(input=input_texts, + model=model_name) assert responses_float.data[0].embedding == responses_default.data[ 0].embedding @@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) @@ -173,7 +219,7 @@ async def test_single_embedding_truncation( 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) @@ -187,18 +233,15 @@ async def test_single_embedding_truncation( @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation_invalid( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] with pytest.raises(openai.BadRequestError): - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6cb74eb78cbf0..6523c8b6297c6 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -70,18 +70,21 @@ async def client(server): [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], + "vllm:request_params_max_tokens": + [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS)], "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], - "vllm:generation_tokens": - [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:generation_tokens": [ + ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST) + ], "vllm:request_success": [("_total", _NUM_REQUESTS)], } @pytest.mark.asyncio -async def test_metrics_counts(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_counts(server: RemoteOpenAIServer, + client: openai.AsyncClient): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -89,7 +92,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): prompt=_TOKENIZED_PROMPT, max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) print(response.text) assert response.status_code == HTTPStatus.OK @@ -150,6 +153,9 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:request_params_n_sum", "vllm:request_params_n_bucket", "vllm:request_params_n_count", + "vllm:request_params_max_tokens_sum", + "vllm:request_params_max_tokens_bucket", + "vllm:request_params_max_tokens_count", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", @@ -170,16 +176,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_exist(server: RemoteOpenAIServer, + client: openai.AsyncClient): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py index 58075f7023821..1ae64ef492d5b 100644 --- a/tests/entrypoints/openai/test_prompt_validation.py +++ b/tests/entrypoints/openai/test_prompt_validation.py @@ -35,3 +35,23 @@ async def test_out_of_vocab_token_ids(): prompt=[999999], max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_reject_multistep_with_guided_decoding(): + model_name = "gpt2" + server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*')): + await client.completions.create( + model=model_name, + prompt="Hello", + max_tokens=5, + temperature=0.0, + extra_body={"response_format": { + "type": "json_object" + }}) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 859a676a9c777..b1956a8cbc9dc 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,3 @@ -import openai # use the official client for correctness check import pytest import pytest_asyncio import requests @@ -55,9 +54,11 @@ async def client(server): [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str, tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_completions( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, prompt = "vllm1 This is a test prompt." 
tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_special_tokens": add_special, "model": model_name, @@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_chat( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_generation_prompt": add_generation, @@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_detokenize( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - print(f"CALLING {base_url} FOR {model_name}") - response = requests.post(base_url + "/detokenize", + response = requests.post(server.url_for("detokenize"), json={ "model": model_name, "tokens": tokens diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py new file mode 100644 index 0000000000000..294b250362699 --- /dev/null +++ b/tests/entrypoints/openai/test_video.py @@ -0,0 +1,345 @@ +from typing import Dict, List + +import openai +import pytest +import pytest_asyncio + +from vllm.multimodal.utils import encode_video_base64, fetch_video + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +MAXIMUM_VIDEOS = 4 + +TEST_VIDEO_URLS = [ + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--dtype", + "bfloat16", + "--max-model-len", + "32768", + "--max-num-seqs", + "2", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"video={MAXIMUM_VIDEOS}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_video() -> Dict[str, str]: + return { + video_url: encode_video_base64(fetch_video(video_url)) + for video_url in TEST_VIDEO_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) 
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_chat_streaming_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "video_urls", + [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) +async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, + video_urls: List[str]): + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "video_url", + "video_url": { + "url": video_url + } + } for video_url in video_urls), + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + if len(video_urls) > MAXIMUM_VIDEOS: + with pytest.raises(openai.BadRequestError): # test multi-video input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py new file mode 100644 index 0000000000000..d0c43b47bf0af --- /dev/null +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -0,0 +1,99 @@ +from typing import Dict + +import pytest +import pytest_asyncio +import requests + +from vllm.multimodal.utils import encode_image_base64, fetch_image + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" +MAXIMUM_IMAGES = 2 + +vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" +assert vlm2vec_jinja_path.exists() + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embedding", 
+ "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + "--chat-template", + str(vlm2vec_jinja_path), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_image() -> Dict[str, str]: + return { + image_url: encode_image_base64(fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings["id"] is not None + assert len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 3072 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == 762 + assert embeddings["usage"]["total_tokens"] == 762 diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py index 057a11746014c..a84501f9c303f 100644 --- a/tests/kernels/test_activation.py +++ b/tests/kernels/test_activation.py @@ -14,7 +14,7 @@ DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing +D = [512, 13824] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 4ecd0fc1a21ad..3e3c0668198ad 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -33,7 +33,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 3fe9ca0b0450f..169ce040d370c 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -44,6 +44,8 @@ def test_env(name: str, device: str, monkeypatch): def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # which_attn_to_use override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 59917dd2c58ad..238d6426bf099 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -14,13 +14,17 @@ awq_marlin_quantize) from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] +GROUP_SIZES = [-1, 32, 
128] -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.skipif(not (ops.supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe")), reason="Marlin is not supported on this GPU type.") diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fb601852dd523..fad342d1b5923 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -25,10 +25,10 @@ DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing +NUM_HEADS = [(40, 40)] # Arbitrary values for testing HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] @@ -37,7 +37,7 @@ BLOCKSPARSE_VERT_STRIDES = [8] BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] BLOCKSPARSE_HOMO_HEADS = [True, False] diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 5b8311a33c361..40550ed51e2c7 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -13,7 +13,7 @@ NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -258,19 +258,20 @@ def test_reshape_and_cache_flash( del key_caches del value_caches + k_scale = key.amax().item() / 256 + v_scale = value.amax().item() / 256 + # Clone the KV caches. if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache) + ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache) + ops.convert_fp8(cloned_value_cache, value_cache, v_scale, + kv_cache_dtype) else: cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() - # Using default kv_scale - k_scale = v_scale = 1.0 - # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, @@ -281,9 +282,15 @@ def test_reshape_and_cache_flash( if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(result_key_cache, key_cache) + ops.convert_fp8(result_key_cache, + key_cache, + k_scale, + kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(result_value_cache, value_cache) + ops.convert_fp8(result_value_cache, + value_cache, + v_scale, + kv_dtype=kv_cache_dtype) # Run the reference implementation. 
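The `test_reshape_and_cache_flash` hunk above switches from a fixed `k_scale = v_scale = 1.0` to per-tensor scales derived from the data (`amax() / 256`) and threads them through `ops.convert_fp8`. As a rough, self-contained illustration of what such a scale does — this only emulates the idea and is not the vLLM `ops.convert_fp8` kernel; it assumes a PyTorch build that exposes `torch.float8_e4m3fn`:

```python
import torch

def fp8_roundtrip(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Divide by the scale, store in 8 bits, then multiply the scale back in.
    compressed = (x / scale).to(torch.float8_e4m3fn)  # lossy cast
    return compressed.to(x.dtype) * scale             # dequantize

key = torch.randn(16, 8, 64, dtype=torch.float16)
k_scale = key.amax().item() / 256  # same heuristic the test above uses
recovered = fp8_roundtrip(key, k_scale)
print("max abs error:", (key - recovered).abs().max().item())
```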
block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") diff --git a/tests/kernels/test_causal_conv1d.py b/tests/kernels/test_causal_conv1d.py index 96bfe06d74ae5..f9b11018288be 100644 --- a/tests/kernels/test_causal_conv1d.py +++ b/tests/kernels/test_causal_conv1d.py @@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor, @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize( - 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) + 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]) @pytest.mark.parametrize('dim', [64]) @pytest.mark.parametrize('batch', [1]) def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, @@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, unpadded_out = out[:, :out_ref_tensor.shape[-1]] assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) - assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol) + assert torch.allclose(final_states[state_indices], + final_states_ref[state_indices], + rtol=rtol, + atol=atol) causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(), padded_state_indices, has_initial_states, diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index 993e67e827ea0..afe53797322f9 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -11,6 +11,28 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 496), + (16, 256, 496), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 496), + (64, 16384, 1024), + (100, 8192, 496), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] @@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int, (out, a, b, scale_a, scale_b, bias)) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) -@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) @@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index bc99c5559d388..3d3724c50421d 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -16,13 +16,13 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import 
STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.forward_context import set_forward_context from vllm.platforms import current_platform # List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] - +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] HEAD_SIZES = [64, 256] NUM_HEADS = [1, 16] @@ -145,7 +145,8 @@ class that Attention will automatically select when it is constructed. test_pt.num_heads, test_pt.head_size, test_pt.block_size, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + backend=test_pt.backend_name) return TestResources(scale, attn_backend, attn, kv_cache) @@ -592,6 +593,7 @@ def _run_encoder_attention_test( attn: Attention, encoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder attention. @@ -610,6 +612,8 @@ def _run_encoder_attention_test( (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed {query,key,value} and @@ -619,20 +623,31 @@ def _run_encoder_attention_test( attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), + attn_metadata, + attn_type=attn_type) def _run_decoder_self_attention_test( test_rsrcs: TestResources, decoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run decoder self-attention test. @@ -650,6 +665,8 @@ def _run_decoder_self_attention_test( query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -660,12 +677,22 @@ def _run_decoder_self_attention_test( kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. 
However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + kv_cache, + attn_metadata, + attn_type=attn_type) def _run_encoder_decoder_cross_attention_test( @@ -673,6 +700,7 @@ def _run_encoder_decoder_cross_attention_test( decoder_test_params: PhaseTestParameters, cross_test_params: Optional[PhaseTestParameters], attn_metadata: AttentionMetadata, + test_pt: TestPoint, ) -> torch.Tensor: ''' Run encoder/decoder cross-attention test. @@ -701,6 +729,8 @@ def _run_encoder_decoder_cross_attention_test( (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -718,12 +748,37 @@ def _run_encoder_decoder_cross_attention_test( cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + key, + value, + kv_cache, + attn_metadata, + attn_type=attn_type) + + +@pytest.fixture(autouse=True) +def set_reset_environment(attn_backend): + # Set the default torch datatype to bfloat16 to enable + # testing of the Flash Attention backend. Also clear the + # cached value of the backend. + default_dtype = torch.get_default_dtype() + if attn_backend.name == 'FLASH_ATTN': + torch.set_default_dtype(torch.bfloat16) + _cached_get_attn_backend.cache_clear() + yield + # Reset the torch datatype to what it was before the test + # so as not to impact the remaining tests. 
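A recurring point in the `_run_*_attention_test` helpers above is the shape contract: the test fixtures build queries as `[batch_size, seq_len, num_heads, head_size]`, while the attention backends expect a packed `[num_tokens, hidden_size]` tensor. A minimal standalone sketch of that reshape, with made-up sizes:

```python
import torch

batch_size, seq_len, num_heads, head_size = 2, 8, 16, 64
query = torch.randn(batch_size, seq_len, num_heads, head_size)

# Pack the 4-D test layout into the 2-D layout attn.forward() consumes.
packed_query = query.view(-1, num_heads * head_size)
assert packed_query.shape == (batch_size * seq_len, num_heads * head_size)
```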
+ torch.set_default_dtype(default_dtype) @pytest.mark.skipif(current_platform.is_rocm(), @@ -773,10 +828,8 @@ def test_encoder_only( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -807,10 +860,14 @@ def test_encoder_only( # PREFILL: encoder attention enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( - test_rsrcs.attn, enc_test_params, prephase_attn_metadata)) + test_rsrcs.attn, + enc_test_params, + prephase_attn_metadata, + test_pt=test_pt)) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) @pytest.mark.skipif(current_platform.is_rocm(), @@ -892,10 +949,8 @@ def test_e2e_enc_dec_attn( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -955,29 +1010,39 @@ def test_e2e_enc_dec_attn( enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, enc_test_params, - prephase_attn_metadata) + prephase_attn_metadata, + test_pt=test_pt) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill decoder self-attention correct? assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out) + prephase_dec_pckd_act_out, + attn_backend.name) # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, - prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_cross_test_params, + prephase_attn_metadata, + test_pt=test_pt) # - Is prefill encoder/decoder cross-attention correct? assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out) + prephase_cross_pckd_act_out, + attn_backend.name) # DECODE: build decode-phase attention metadata @@ -993,17 +1058,26 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, decphase_dec_test_params, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase decoder self-attention correct? 
assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out) + decphase_dec_pckd_act_out, + attn_backend.name) # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + None, + decphase_attn_metadata, + test_pt=test_pt) # - Is decode-phase encoder/decoder cross-attention correct? assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out) + decphase_cross_pckd_act_out, + attn_backend.name) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 8db6a0d0d9fa4..12c578db0893c 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -7,11 +7,10 @@ from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] -SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +SCALE = [0.1, 2.1] def opcheck_int8_quant_static(output, input, scale, azp=None): @@ -132,7 +131,7 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE[2:]) # Reduce test time +@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 9dfa2cbe45e94..727769e071842 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,13 +1,14 @@ import pytest import torch +from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, +HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] @@ -59,3 +60,75 @@ def test_rms_norm( else: opcheck(torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)) + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_fused_rms_norm_quant( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + quant_scale: float, + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x *= scale + if add_residual: + residual = torch.randn_like(x) * scale + residual_fused = residual.clone() + else: + residual 
= residual_fused = None + + out_norm = torch.empty_like(x) + out_quant = torch.empty_like(x, dtype=FP8_DTYPE) + out_quant_fused = torch.empty_like(out_quant) + + quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32) + + if add_residual: + torch.ops._C.fused_add_rms_norm_static_fp8_quant( + out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6) + + # Unfused kernel is in-place so it goes second + # Also use a separate clone of x to avoid modifying the input + x_unfused = x.clone() + torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, + quant_scale_t) + + torch.cuda.synchronize() + torch.testing.assert_close(residual_fused, + residual, + atol=1e-2, + rtol=1e-2) + + opcheck( + torch.ops._C.fused_add_rms_norm_static_fp8_quant, + (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)) + else: + torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight, + quant_scale_t, 1e-6) + + torch.ops._C.rms_norm(out_norm, x, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, + quant_scale_t) + + opcheck(torch.ops._C.rms_norm_static_fp8_quant, + (out_quant_fused, x, weight, quant_scale_t, 1e-6)) + + torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32), + out_quant.to(dtype=torch.float32), + atol=1e-3, + rtol=1e-3) diff --git a/tests/kernels/test_mamba_ssm.py b/tests/kernels/test_mamba_ssm.py index bf7ff3b5c59b8..19d1158c79c73 100644 --- a/tests/kernels/test_mamba_ssm.py +++ b/tests/kernels/test_mamba_ssm.py @@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) ] for i in range(len(seqlens[0])): - u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits) if padded_state_indices[i] == PAD_SLOT_ID: continue out_ref_s, _ = selective_scan_ref( @@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) if itype == torch.bfloat16: - rtol, atol = 7e-2, 7e-2 + rtol, atol = 1e-1, 1e-1 if torch.version.hip: atol *= 2 # set seed @@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, dt_bias=dt_bias, dt_softplus=True) - print("Output diff max", (out - out_ref[0]).max()) - print("Output diff mean", (out - out_ref[0]).mean()) + print("Output diff max", (out[:batch_size] - out_ref).max()) + print("Output diff mean", (out[:batch_size] - out_ref).mean()) print("Output state diff max", (state[state_indices, :] - state_ref).max()) print("Output state diff mean", (state[state_indices, :] - state_ref).mean()) diff --git a/tests/kernels/test_marlin_gemm.py b/tests/kernels/test_marlin_gemm.py index 5cfd4d6da7a86..b6dd68cc51a9f 100644 --- a/tests/kernels/test_marlin_gemm.py +++ b/tests/kernels/test_marlin_gemm.py @@ -35,7 +35,7 @@ USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] -MARLIN_N_CHUNKS = [64, 128, 256] +MARLIN_N_CHUNKS = [64, 256] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 19c3fc1e1fe3a..17428ebfc2e28 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -20,12 +20,15 @@ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] -@pytest.mark.parametrize("m", [1024 
* 128, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fused_moe( m: int, @@ -93,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py index b408559cc0b07..eee77c22ab81a 100644 --- a/tests/kernels/test_pos_encoding.py +++ b/tests/kernels/test_pos_encoding.py @@ -11,10 +11,10 @@ IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 112, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py new file mode 100644 index 0000000000000..8e96a2f70d751 --- /dev/null +++ b/tests/kernels/test_triton_scaled_mm.py @@ -0,0 +1,106 @@ +"""Tests for the triton_scaled_mm kernel + +Run `pytest tests/kernels/test_triton_scaled_mm.py`. 
+""" +import importlib +from typing import Optional, Type + +import pytest +import torch + +from vllm.platforms import current_platform + +device = "cuda" + + +def scaled_mm_torch(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + out = torch.mm(a.to(torch.float32), b.to(torch.float32)) + out = scale_a * out + out = scale_b.T * out + out = out.to(out_dtype) + if bias is not None: + out = out + bias + + return out + + +def get_8bit_types(): + types = [torch.int8] + supports_fp8 = current_platform.has_device_capability(89) + if current_platform.is_rocm() and supports_fp8: + types.append(torch.float8_e4m3fnuz) + elif current_platform.is_cuda() and supports_fp8: + types.append(torch.float8_e4m3fn) + return types + + +@pytest.mark.parametrize("M", [1, 33, 64, 512]) +@pytest.mark.parametrize("N", [256, 971, 20486]) +@pytest.mark.parametrize("K", [128, 496, 1024]) +@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("in_dtype", get_8bit_types()) +@pytest.mark.parametrize("use_scalar_scale_a", [True, False]) +@pytest.mark.parametrize("use_scalar_scale_b", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, + use_scalar_scale_b, use_bias): + is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t + ).is_floating_point() + + current_platform.seed_everything(0) + + # NOTE: There are cases, where if the matrix is large enough, an output + # like 65504.4 can be produced, and can easily turn into inf when + # multiplied when using float16/bfloat16. This means one function, e.g., + # testing function, and another function, e.g. golden function, can + # produce a non-inf value while the other produces an inf value, and + # will cause assert_close/allclose to fail, even though if overflow + # wouldn't have occurred, the values would have been "close." + # + # So, the values here are kept small enough to avoid this situation. + if is_floating_point_type(in_dtype): + a = (0.25 * torch.rand( + (M, K), dtype=torch.float32, device=device)).to(in_dtype) + b = (0.25 * torch.rand( + (K, N), dtype=torch.float32, device=device)).to(in_dtype) + else: + a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device) + b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device) + + if use_scalar_scale_a: + scale_a = torch.rand((1, 1), device=device) + else: + scale_a = 0.25 * torch.rand((M, 1), device=device) + + if use_scalar_scale_b: + scale_b = torch.rand((1, 1), device=device) + else: + scale_b = 0.25 * torch.rand((N, 1), device=device) + + bias = None + if use_bias: + bias = torch.rand((N, ), device=device, dtype=out_dtype) + + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." 
+ "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + + c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + + a_cpu = a.cpu() + b_cpu = b.cpu() + scale_a_cpu = scale_a.cpu() + scale_b_cpu = scale_b.cpu() + bias_cpu = None if bias is None else bias.cpu() + + c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu, + out_dtype, bias_cpu) + + c_check_cpu = c_check.cpu() + torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index a2d414f636e13..e7865fb2500ef 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,8 +13,8 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, - make_tensor_with_pad) +from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. @@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend: if backend_name == STR_XFORMERS_ATTN_VAL: # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() + elif backend_name == STR_FLASH_ATTN_VAL: + from vllm.attention.backends.flash_attn import FlashAttentionBackend + return FlashAttentionBackend() + raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") def _make_metadata_tensors( - seq_lens: Optional[List[int]], context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str] -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]], - torch.Tensor, Optional[int]]: + seq_lens: Optional[List[int]], + context_lens: Optional[List[int]], + encoder_seq_lens: Optional[List[int]], + device: Union[torch.device, str], +) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], + torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. 
@@ -553,6 +558,8 @@ def _make_metadata_tensors( * max_context_len: max(context_lens) * max_seq_len: max(seq_lens) * seq_start_loc: start idx of each sequence + * encoder_seq_lens_tensor: encoder seq_lens list, as tensor + * encoder_seq_start_loc: start idx of each encoder sequence * max_encoder_seq_len: encoder seq_lens list, as tensor ''' seq_lens_tensor = maybe_make_int_tensor(seq_lens, device) @@ -566,8 +573,26 @@ def _make_metadata_tensors( seq_start_loc = None + if seq_lens_tensor is not None: + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=seq_lens_tensor.device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=encoder_seq_lens_tensor.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len, - seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len) + seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc, + max_encoder_seq_len) def make_kv_cache(num_blocks: int, @@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int, head_size: int, block_size: int, device: Union[torch.device, str], + backend: str, default_val: float = 0.0) -> torch.Tensor: ''' Create a fake KV cache. @@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) + * for backend 'XFORMERS' + * kv_cache: 2 x num_blocks x block_size x num_heads x head_size + * for backend 'FLASH_ATTN' ''' - - kv_cache = torch.rand( - (2, num_blocks, block_size * num_heads * head_size)).to(device) + if backend == 'XFORMERS': + kv_cache = torch.rand( + (2, num_blocks, block_size * num_heads * head_size)).to(device) + elif backend == 'FLASH_ATTN': + kv_cache = torch.rand( + (2, num_blocks, block_size, num_heads, head_size)).to(device) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. 
Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache @@ -858,8 +894,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -869,10 +906,12 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=None if seq_lens is None else max(seq_lens), max_decode_seq_len=0, context_lens_tensor=context_lens_tensor, @@ -881,6 +920,7 @@ def make_test_metadata( num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -903,8 +943,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -914,18 +955,22 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=0, max_decode_seq_len=max(seq_lens), + max_decode_query_len=1, context_lens_tensor=context_lens_tensor, block_tables=kv_mmap.block_tables, use_cuda_graph=False, num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -934,7 +979,8 @@ def make_test_metadata( def assert_actual_matches_ideal(test_params: PhaseTestParameters, - output_under_test: torch.Tensor) -> None: + output_under_test: torch.Tensor, + backend: str) -> None: ''' Assert that observed output matches the ideal output contained in the test parameters data structure. @@ -945,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters, * output_under_test: actually observed output value ''' ideal_output = test_params.packed_qkvo.ideal_output - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output)) + if backend == 'XFORMERS': + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output)) + + elif backend == 'FLASH_ATTN': + # For FlashAttention override the accuracy thresholds to non default + # values since we notice a higher difference between the ideal and + # actual output. + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output), + atol=0.01, + rtol=0.016) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. 
Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") # Copied/modified from torch._refs.__init__.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index e40f0dd74602e..29ecf37808205 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,6 +152,11 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) +@pytest.fixture(scope="session") +def lora_bias_files(): + return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test @@ -248,11 +253,10 @@ def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model - def get_model_patched(*, model_config, device_config, **kwargs): - kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8) - return get_model_old(model_config=model_config, - device_config=device_config, - **kwargs) + def get_model_patched(**kwargs): + kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, + max_lora_rank=8) + return get_model_old(**kwargs) with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index eb882faf3974a..15e576cb065c7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -51,6 +51,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] + # We will launch different triton kernels between the prefill and decode # stages, so we need to verify this. prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -120,11 +121,12 @@ def populate_loras( subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): - sublora = DummyLoRAManager().init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) + sublora = DummyLoRAManager( + layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) sublora.lora_b = sublora.lora_b[:, (sublora_len * i):(sublora_len * (i + 1))] sublora.optimize() @@ -152,6 +154,7 @@ def create_random_inputs( input_size: Tuple[int, ...], input_range: Tuple[float, float], input_type: torch.dtype = torch.int, + device: torch.device = "cuda" ) -> Tuple[List[torch.Tensor], List[int], List[int]]: """Creates random inputs. @@ -173,10 +176,14 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), high=int(high), size=input_size)) + torch.randint(low=int(low), + high=int(high), + size=input_size, + device=device)) else: inputs.append( - torch.rand(size=input_size, dtype=input_type) * high + low) + torch.rand(size=input_size, dtype=input_type, device=device) * + high + low) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -191,6 +198,10 @@ def create_random_inputs( @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA + # device, see: https://github.com/triton-lang/triton/issues/2925 + # Same below. 
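The comment above this hunk points at a Triton quirk: on multi-GPU machines the tests must pin the active CUDA device explicitly rather than relying on `torch.set_default_device` alone. A minimal sketch of that setup step (the helper name is illustrative):

```python
import torch

def pin_test_device(device: str) -> None:
    # Make the GPU both the active CUDA device (what Triton launches on)
    # and the default device for newly created tensors.
    if device.startswith("cuda"):
        torch.cuda.set_device(device)
    torch.set_default_device(device)

pin_test_device("cuda:0")
```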
+ torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -225,7 +236,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -263,7 +274,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -291,6 +302,7 @@ def create_random_embedding_layer(): def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -345,7 +357,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -400,7 +412,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) original_inputs = deepcopy(inputs) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, @@ -426,6 +438,7 @@ def create_random_embedding_layer(): def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -471,7 +484,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -520,7 +533,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -554,6 +567,7 @@ def _pretest(): @pytest.mark.parametrize("stage", STAGES) def test_linear_replicated(dist_init, num_loras, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -592,7 +606,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -631,7 +645,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -658,6 +672,7 @@ def create_random_linear_replicated_layer(): def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -706,7 +721,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -745,7 +760,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -772,6 +787,7 @@ def create_random_linear_parallel_layer(): def 
test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -842,7 +858,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -883,7 +899,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -962,7 +978,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_size=(1, max_position), input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py index c8edb02a88d4b..eada902c891f7 100644 --- a/tests/lora/test_long_context.py +++ b/tests/lora/test_long_context.py @@ -138,13 +138,7 @@ def test_rotary_emb_replaced(dist_init): enable_lora=True) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, + vllm_config=engine_config, is_driver_worker=True, ) model_runner.load_model() diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py new file mode 100644 index 0000000000000..c2520c847d873 --- /dev/null +++ b/tests/lora/test_lora_bias_e2e.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ibm-granite/granite-3b-code-base" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. 
[/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + generated_texts: List[str] = [] + for output in outputs: + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + return generated_texts + + +@pytest.mark.parametrize("lora_bias", [True]) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_lora_rank=8, + max_loras=1, + enable_lora_bias=lora_bias, + tensor_parallel_size=1, + fully_sharded_loras=fully_sharded) + + print("lora adapter created") + output1 = do_sample(llm, lora_bias_files, lora_id=0) + + print("lora") + output2 = do_sample(llm, lora_bias_files, lora_id=1) + + if lora_bias: + assert output1 != output2 + else: + assert output1 == output2 diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 67cf298b4df2b..8d109b2c81503 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -25,8 +25,13 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -def test_from_lora_tensors(sql_lora_files): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( @@ -36,7 +41,7 @@ def test_from_lora_tensors(sql_lora_files): 8, 16, tensors, - "cuda", + device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) @@ -46,6 +51,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.lora_alpha == 16 assert lora.lora_a is not None assert lora.lora_b is not None + assert lora.lora_a.device == torch.device(device) + assert lora.lora_b.device == torch.device(device) assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" assert lora.lora_a.shape[1] == 8 @@ -60,8 +67,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: +def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], + device: torch.device) -> LoRAModel: loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -69,8 +76,8 @@ def create_lora(lora_id: int, model: nn.Module, name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0]], device="cuda"), + torch.rand([w.shape[1], 8], device=device), + torch.rand([8, w.shape[0]], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -80,6 +87,7 @@ def create_packed_lora( model: nn.Module, module_name, replaced_module_names, + device: torch.device, empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight @@ -91,9 +99,9 @@ def create_packed_lora( replaced_module_name, 8, 16, - 
torch.rand([w.shape[1], 8], device="cuda"), + torch.rand([w.shape[1], 8], device=device), torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="cuda"), + device=device), ) return LoRAModel(lora_id, 8, loras) @@ -104,7 +112,8 @@ def test_replace_submodules(dist_init, dummy_model): model.packed_modules_mapping = {} manager = LoRAModelManager( model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), + torch.device("cuda")) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -116,16 +125,28 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -def test_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -161,17 +182,32 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 + assert manager.device == device + assert manager.punica_wrapper.device == device -def test_lora_lru_cache_model_manager(dist_init, dummy_model): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -238,20 +274,37 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): with pytest.raises(ValueError): assert manager.pin_adapter(3) + assert manager.punica_wrapper.device == device + assert manager.device == device + -def test_lru_lora_model_manager(dist_init, 
dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora4 = create_lora(4, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -351,14 +404,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} + assert manager.punica_wrapper.device == device + assert manager.device == device +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -426,14 +482,19 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): # Should remove every LoRA not specified in the request. 
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -497,8 +558,13 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + -def test_packed_loras(dist_init, dummy_model_gate_up): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] model.packed_modules_mapping = { @@ -511,18 +577,25 @@ def test_packed_loras(dist_init, dummy_model_gate_up): 1, model, module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) + replaced_module_names=["gate_proj", "up_proj"], + device=device) model_lora1 = create_packed_lora( 2, model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], + device=device, empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), diff --git a/tests/lora/test_minicpmv.py b/tests/lora/test_minicpmv.py index be040060d02b2..2c45ce5141f7d 100644 --- a/tests/lora/test_minicpmv.py +++ b/tests/lora/test_minicpmv.py @@ -1,8 +1,11 @@ from typing import List +import pytest + import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files): trust_remote_code=True, gpu_memory_utilization=0.97 # This model is pretty big for CI gpus ) - output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output1[i]) diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index e756544d96e98..66b5f82bbb97d 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -4,8 +4,6 @@ whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. 
""" -from unittest.mock import patch - import pytest import torch @@ -16,7 +14,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -235,9 +232,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -262,33 +256,21 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -324,7 +306,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -374,22 +355,16 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index dc0edeb10ef46..52b82f25d23e1 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -3,8 +3,6 @@ under different conditions, including various batches, numbers of LoRA , and maximum ranks. 
""" -from unittest.mock import patch - import pytest import torch @@ -15,7 +13,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -150,8 +147,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -177,33 +172,22 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -239,8 +223,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -289,22 +271,15 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. 
- with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index db02bacdb6439..85110b8fa8cd2 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -12,36 +12,40 @@ def test_parse_fine_tuned_lora_name_valid(): fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), + ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False), + ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False), ( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", True, + False, ), ( "base_model.model.model.embed_tokens.lora_embedding_B", "model.embed_tokens", False, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", "model.layers.9.mlp.down_proj", True, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", "model.layers.9.mlp.down_proj", False, + False, ), } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) + for name, module_name, is_lora_a, is_bias in fixture: + assert (module_name, is_lora_a, + is_bias) == parse_fine_tuned_lora_name(name) def test_parse_fine_tuned_lora_name_invalid(): fixture = { - "weight", "base_model.weight", "base_model.model.weight", } diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 2f7ac85507425..9d814f657ac43 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -4,7 +4,8 @@ from unittest.mock import patch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) + ModelConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -12,7 +13,7 @@ @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): - worker = Worker( + vllm_config = VllmConfig( model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", task="auto", @@ -34,10 +35,13 @@ def test_worker_apply_lora(sql_lora_files): gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), - local_rank=0, - rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, max_loras=32), + ) + worker = Worker( + vllm_config=vllm_config, + local_rank=0, + rank=0, distributed_init_method=f"file://{tempfile.mkstemp()[1]}", ) worker.init_device() diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 00f8e26d1041f..e394c33b3f9ea 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -7,9 +7,10 @@ class DummyLoRAManager: - def __init__(self): + def __init__(self, device: torch.device = "cuda:0"): super().__init__() self._loras: Dict[str, LoRALayerWeights] = {} + self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + 
device=self._device), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=self._device), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=self._device) self.set_module_lora(module_name, lora) return lora diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7a361ef320810..4a824c7acef21 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, "vllm:request_prompt_tokens", "vllm:request_generation_tokens", "vllm:request_params_n", + "vllm:request_params_max_tokens", ] for metric_name in request_histogram_metrics: metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py new file mode 100644 index 0000000000000..7e5e2780d3916 --- /dev/null +++ b/tests/model_executor/test_model_load_with_params.py @@ -0,0 +1,50 @@ +import os + +import pytest + +from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.platforms import current_platform + +MAX_MODEL_LEN = 128 +MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") +REVISION = os.environ.get("REVISION", "main") + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. + """ + with vllm_runner(model_name=MODEL_NAME, + revision=REVISION, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.CLS.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer.model_max_length == 512 + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + # assert output + assert output diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index ad6c2d854d1f0..e100c6b9bb906 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -2,12 +2,14 @@ import numpy as np import pytest +import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ....conftest import HfRunner, VllmRunner +from ....utils import RemoteOpenAIServer from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_3" @@ -17,6 +19,13 @@ VLLM_PLACEHOLDER = 
"<|reserved_special_token_0|>" HF_PLACEHOLDER = "<|audio|>" +CHUNKED_PREFILL_KWARGS = { + "enable_chunked_prefill": True, + "max_num_seqs": 2, + # Use a very small limit to exercise chunked prefill. + "max_num_batched_tokens": 16 +} + @pytest.fixture(scope="session") def audio_assets(): @@ -30,6 +39,29 @@ def audio(request): return AudioAsset(request.param) +@pytest.fixture(params=[ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) +def server(request, audio_assets): + args = [ + "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", + f"--limit-mm-per-prompt=audio={len(audio_assets)}" + ] + [ + f"--{key.replace('_','-')}={value}" + for key, value in request.param.items() + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + def _get_prompt(audio_count, question, placeholder): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) placeholder = f"{placeholder}\n" * audio_count @@ -68,8 +100,7 @@ def run_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): """Inference result should be the same between hf and vllm.""" torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -79,11 +110,8 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True, + **kwargs) as vllm_model: vllm_outputs_per_audio = [ vllm_model.generate_greedy_logprobs([vllm_prompt], max_tokens, @@ -135,18 +163,16 @@ def run_multi_audio_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): with vllm_runner(model, dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, enforce_eager=True, limit_mm_per_prompt={ "audio": max((len(audio) for _, audio in prompts_and_audios)) - }) as vllm_model: + }, + **kwargs) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( [prompt for prompt, _ in prompts_and_audios], max_tokens, @@ -158,11 +184,16 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, - num_logprobs: int) -> None: + num_logprobs: int, vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) @@ -174,16 +205,21 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", 
[128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + max_tokens: int, num_logprobs: int, + vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(len(audio_assets), "Describe each of the audios above.", @@ -196,5 +232,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) + + +@pytest.mark.asyncio +async def test_online_inference(client, audio_assets): + """Exercises online inference with/without chunked prefill enabled.""" + + messages = [{ + "role": + "user", + "content": [ + *[{ + "type": "audio_url", + "audio_url": { + "url": audio.url + } + } for audio in audio_assets], + { + "type": + "text", + "text": + f"What's happening in these {len(audio_assets)} audio clips?" + }, + ], + }] + + chat_completion = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_tokens=10) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/tests/models/decoder_only/language/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py index de46032113086..a8cb5bbf9349e 100644 --- a/tests/models/decoder_only/language/test_aqlm.py +++ b/tests/models/decoder_only/language/test_aqlm.py @@ -38,6 +38,7 @@ ] +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("aqlm"), reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py deleted file mode 100644 index fcfc159e4f5a0..0000000000000 --- a/tests/models/decoder_only/language/test_big_models.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests bigger models and use half precision. - -Run `pytest tests/models/test_big_models.py`. -""" -import pytest - -from vllm.platforms import current_platform - -from ...utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "meta-llama/Llama-2-7b-hf", - # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py - # "Deci/DeciLM-7b", # Broken - # "tiiuae/falcon-7b", # Broken - "EleutherAI/gpt-j-6b", - # "mosaicml/mpt-7b", # Broken - # "Qwen/Qwen1.5-0.5B" # Broken, -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - # Head size isn't supported on CPU - "h2oai/h2o-danube3-4b-base", - ] - -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - if model == "openbmb/MiniCPM3-4B": - # the output becomes slightly different when upgrading to - # pytorch 2.5 . Changing to logprobs checks instead of exact - # output checks. 
- NUM_LOG_PROBS = 8 - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 5a947ce62c785..53f23e24511b3 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -15,17 +15,18 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize( "kv_cache_dtype,base_model,test_model,scale_path", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. - ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. - ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", None), + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct", None), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", @@ -33,7 +34,7 @@ ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. 
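Note on the move away from exact-output checks: the deleted test_big_models.py above records that some models' greedy outputs become slightly different after upgrading to PyTorch 2.5, so suites switch to the check_logprobs_close helper imported elsewhere in this diff rather than byte-exact string comparison. The snippet below is only an illustrative sketch of that idea, not vLLM's actual helper; the TokensTextLogprobs shape and the early-stop rule are assumptions made for the example.

# Illustrative sketch only -- NOT vLLM's check_logprobs_close. It assumes each
# output is (token_ids, text, per-position dict of candidate token id ->
# logprob), roughly what the generate_greedy_logprobs-style helpers return.
from typing import Dict, List, Tuple

TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]]


def logprobs_close(outputs_0: List[TokensTextLogprobs],
                   outputs_1: List[TokensTextLogprobs]) -> bool:
    for (ids_0, _, top_0), (ids_1, _, top_1) in zip(outputs_0, outputs_1):
        for pos, (tok_0, tok_1) in enumerate(zip(ids_0, ids_1)):
            if tok_0 == tok_1:
                continue
            # A mismatch is tolerated if each side's token is still among the
            # other side's top candidates at that position.
            if tok_0 not in top_1[pos] or tok_1 not in top_0[pos]:
                return False
            # Once the greedy choices diverge, later tokens are conditioned on
            # different prefixes, so stop comparing this prompt.
            break
    return True


# Example: the greedy picks diverge at position 1 (2 vs. 3), but each pick is
# in the other run's top candidates there, so the outputs count as close.
assert logprobs_close(
    [([1, 2], "ab", [{1: -0.1}, {2: -0.2, 3: -0.4}])],
    [([1, 3], "ac", [{1: -0.1}, {2: -0.3, 3: -0.2}])])

Stopping at the first divergence is the key design choice: greedy decoding compounds, so once one token differs every later token is conditioned on a different prefix and positionwise comparison past that point carries no signal.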
diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 5dc83942632fd..2b8f5e2faa45e 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -17,26 +17,21 @@ MAX_MODEL_LEN = 1024 -# FIXME: Move this to confest -MODELS = [ - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")), - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", - filename="qwen2-1_5b-instruct-q4_k_m.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), -] - @pytest.mark.skipif(not is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ + ("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), + ("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", + "qwen2-1_5b-instruct-q4_k_m.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", + "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), +]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -45,7 +40,9 @@ def test_models( num_gpus_available, vllm_runner, example_prompts, - model, + original_model, + gguf_id, + gguf_path, dtype: str, max_tokens: int, num_logprobs: int, @@ -54,7 +51,7 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - original_model, gguf_model = model + gguf_model = hf_hub_download(gguf_id, filename=gguf_path) tokenizer = AutoTokenizer.from_pretrained(original_model) messages = [[{ diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index 2155e83dbe915..037411a18c19f 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -22,30 +22,18 @@ MAX_MODEL_LEN = 1024 MODELS = [ - # act_order==False, group_size=channelwise - ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), - # act_order==False, group_size=128 - ("TheBloke/Llama-2-7B-GPTQ", "main"), - # act_order==True, group_size=128 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), - # act_order==True, group_size=64 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), - # act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), # 8-bit, act_order==True, group_size=channelwise ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), - # 8-bit, act_order==True, group_size=128 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), - # 8-bit, act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), # 4-bit, act_order==True, group_size=128 ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") ] +@pytest.mark.quant_model 
@pytest.mark.flaky(reruns=3) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index d65be05f141b4..26cb3ec310701 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -25,19 +25,20 @@ class ModelPair: # 4-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), - # 4-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), + # # 4-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), # 8-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), - # 8-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), + # # 8-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), ] +@pytest.mark.quant_model @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), reason="Marlin24 is not supported on this GPU type.") diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 0b71f0d49c70a..5e93842f46164 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -7,7 +7,9 @@ from ...utils import check_logprobs_close MODELS = [ + # TODO(sang): Sliding window should be tested separately. "ibm/PowerLM-3b", + "ibm/PowerMoE-3b", ] @@ -24,7 +26,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - # TODO(sang): Sliding window should be tested separately. with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/decoder_only/language/test_granitemoe.py b/tests/models/decoder_only/language/test_granitemoe.py deleted file mode 100644 index ba73375229eb3..0000000000000 --- a/tests/models/decoder_only/language/test_granitemoe.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. - -Run `pytest tests/models/test_granite.py`. 
-""" -import pytest - -from ...utils import check_logprobs_close - -MODELS = [ - "ibm/PowerMoE-3b", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/decoder_only/language/test_marlin.py b/tests/models/decoder_only/language/test_marlin.py deleted file mode 100644 index c802346dee8af..0000000000000 --- a/tests/models/decoder_only/language/test_marlin.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Compare the outputs of a GPTQ model to a Marlin model. - -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the -Marlin/GPTQ models are in the top 3 selections of each other. - -Note: Marlin internally uses locks to synchronize the threads. This can -result in very slight nondeterminism for Marlin. As a result, we re-run the test -up to 3 times to see if we pass. - -Run `pytest tests/models/test_marlin.py`. -""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 174b905d9cbb9..6ec4b7e7e3f71 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ 
b/tests/models/decoder_only/language/test_mistral.py @@ -4,13 +4,18 @@ """ import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams from ...utils import check_logprobs_close MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", +] + +MISTRAL_FORMAT_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.3", + # uses the v3-Tekken tokenizer + "mistralai/Ministral-8B-Instruct-2410", # Mistral-Nemo is to big for CI, but passes locally # "mistralai/Mistral-Nemo-Instruct-2407" ] @@ -19,6 +24,8 @@ SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese + "ပုံပြင်လေးပြောပြပါ်:\n", # burmese + "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625 ] # for function calling @@ -95,7 +102,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +142,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": prompt} + outputs = vllm_model.model.chat([msg], + sampling_params=SAMPLING_PARAMS) + assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling +@pytest.mark.parametrize("model", + MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling( vllm_runner, model: str, diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index e643b115d0ea8..077e50e3a4dfd 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -39,6 +39,7 @@ @pytest.mark.skip( reason= "Prevent unstable test based on golden strings from breaking the build.") +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 68055cbe29095..beb1ffb18436e 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -1,31 +1,42 @@ """Compare the outputs of HF and vLLM when using greedy sampling. -This test only tests small models. Big models such as 7B should be tested from -test_big_models.py because it could use a larger instance to run tests. - Run `pytest tests/models/test_models.py`. 
""" import pytest -from ...utils import check_outputs_equal +from vllm.platforms import current_platform + +from ...utils import check_logprobs_close MODELS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", # Testing alibi slopes. - "microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", - # "allenai/OLMo-1B", # Broken - "bigcode/starcoder2-3b", - "google/gemma-1.1-2b-it", + "facebook/opt-125m", # opt + "openai-community/gpt2", # gpt2 + # "Milos/slovak-gpt-j-405M", # gptj + # "bigcode/tiny_starcoder_py", # gpt_bigcode + # "EleutherAI/pythia-70m", # gpt_neox + "bigscience/bloom-560m", # bloom - testing alibi slopes + "microsoft/phi-2", # phi + # "stabilityai/stablelm-3b-4e1t", # stablelm + # "bigcode/starcoder2-3b", # starcoder2 + "google/gemma-1.1-2b-it", # gemma + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + "meta-llama/Llama-3.2-1B-Instruct", # llama ] +if not current_platform.is_cpu(): + MODELS += [ + # fused_moe which not supported on CPU + "openbmb/MiniCPM3-4B", + ] + +target_dtype = "half" + +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) def test_models( hf_runner, vllm_runner, @@ -33,33 +44,24 @@ def test_models( model: str, dtype: str, max_tokens: int, + num_logprobs: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_qwen.py b/tests/models/decoder_only/language/test_qwen.py deleted file mode 100644 index 128fe65afbb84..0000000000000 --- a/tests/models/decoder_only/language/test_qwen.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Ensure that a text-only Qwen model can be run without throwing an error. -We explicitly test this because Qwen is implemented as a multimodal and -supports a visual encoder for models like Qwen-VL. 
-""" -from typing import List, Type - -import pytest - -from ....conftest import VllmRunner - -models = [ - "Qwen/Qwen-7B-Chat" # Has no visual encoder -] - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_text_only_qwen_model_can_be_loaded_and_run( - vllm_runner: Type[VllmRunner], - example_prompts: List[str], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, -): - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_model.generate_greedy_logprobs( - example_prompts, - max_tokens, - num_logprobs=num_logprobs, - ) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py new file mode 100644 index 0000000000000..31896bfd13e8c --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py @@ -0,0 +1,187 @@ +"""Tests for Idefics3's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +import transformers +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["HuggingFaceM4/Idefics3-8B-Llama3"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_idefics3(): + from vllm.model_executor.models.idefics3 import ( + input_processor_for_idefics3) + return input_processor_for_idefics3 + + +@pytest.fixture() +def dummy_data_for_idefics3(): + from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3 + return dummy_data_for_idefics3 + + +@pytest.fixture() +def get_max_idefics3_image_tokens(): + from vllm.model_executor.models.idefics3 import ( + get_max_idefics3_image_tokens) + return get_max_idefics3_image_tokens + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + longest_edge: Optional[int]): + """Ensure that the [default] input mapper handles size properly.""" + + mm_processor_kwargs = { + "size": { + "longest_edge": longest_edge + } + } if longest_edge is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge, expected_max_tokens", [ + (None, 2873), + (168, 169), + (336, 169), + (400, 338), + (672, 338), +]) +def 
test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
+                             longest_edge: Optional[int],
+                             expected_max_tokens: int):
+    """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
+    size = {"longest_edge": longest_edge} if longest_edge is not None else None
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+    )
+
+    actual_max_tokens = get_max_idefics3_image_tokens(
+        ctx=InputContext(ctx.model_config),
+        size=size,
+    )
+
+    assert expected_max_tokens == actual_max_tokens
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.46.0",
+                    reason="Model introduced in HF >= 4.46.0")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
+    (168, 169, 1),
+    (168, 169, 2),
+    (400, 338, 1),
+    (400, 338, 2),
+])
+def test_dummy_data_override(dummy_data_for_idefics3, model: str,
+                             longest_edge: int, toks_per_img: int,
+                             num_imgs: int):
+    """Ensure dummy_data_for_idefics3 handles longest_edge properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the dummy data func.
+    size = {"longest_edge": longest_edge} if longest_edge is not None else None
+    ctx = build_model_context(
+        model_name=model,
+        tokenizer_name=model,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+    )
+
+    dummy_data = dummy_data_for_idefics3(
+        ctx=ctx,
+        seq_len=8192,  # Should be bigger than num_imgs * toks_per_img
+        mm_counts={"image": num_imgs},
+        size=size)
+    sequence_data = dummy_data.seq_data
+    # Ensure we have the right number of placeholders per size
+    image_token_id = ctx.get_hf_config().image_token_id
+    img_tok_count = sequence_data.get_token_ids().count(image_token_id)
+    assert img_tok_count == toks_per_img * num_imgs
+
+
+@pytest.mark.skipif(transformers.__version__ < "4.46.0",
+                    reason="Model introduced in HF >= 4.46.0")
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
+    (336, 169 * (1**2 + 1), 1),
+    (336, 169 * (1**2 + 1), 2),
+    (400, 169 * (2**2 + 1), 1),
+    (400, 169 * (2**2 + 1), 2),
+])
+def test_input_processor_override(input_processor_for_idefics3,
+                                  image_assets: _ImageAssets, model: str,
+                                  longest_edge: int,
+                                  expected_toks_per_img: int, num_imgs: int):
+    """Ensure input_processor_for_idefics3 handles longest_edge properly."""
+    # Same as the previous test - don't initialize mm_processor_kwargs
+    # in this test and assume that the kwargs will be correctly expanded by
+    # the partial when calling the custom input processor.
+ size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + # Build the image str / prompt based on the number of images we pass + tokenizer = AutoTokenizer.from_pretrained(model) + placeholders = "" if num_imgs == 1 else "\n".join( + f"Image-{i}: \n" for i in range(1, num_imgs + 1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n\nAssistant:" # noqa: E501 + images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = ctx.get_hf_config().image_token_id + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py index c2d3fda6994f6..51c0085101dd0 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py @@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, ctx.model_config.hf_config.image_grid_pinpoints = gridpoints seq_len = 5000 # bigger than the max feature size for any image - seq_data, mm_data = dummy_data_for_llava_next( + dummy_data = dummy_data_for_llava_next( ctx, seq_len=seq_len, mm_counts={"image": 1}, ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data # The dummy data dims should match the gridpoint with the biggest feat size assert mm_data["image"].height == expected_size[0] diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index d6a7b34fdde9f..60a8f63eb5faa 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, mm_processor_kwargs=None, ) - sequence_data, _, = dummy_data_for_phi3v( + dummy_data = dummy_data_for_phi3v( ctx=ctx, seq_len=8192, # Should be bigger than num_imgs * toks_per_img mm_counts={"image": num_imgs}, num_crops=num_crops, ) + sequence_data = dummy_data.seq_data # Ensure we have the right number of placeholders per num_crops size img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) assert img_tok_count == toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index a01651b171d60..163220c91a27d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import 
cached_get_tokenizer from .....conftest import IMAGE_ASSETS @@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) # Ensure that we get the appropriately shaped pixel_values # for images and image embeddings, respectively. - assert isinstance(mapped_img_data, MultiModalInputs) + assert isinstance(mapped_img_data, MultiModalKwargs) assert "pixel_values" in mapped_img_data assert mapped_img_data["pixel_values"].shape == expected_shape @@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, # Sad path tests for the multimodal input processor and mapper, respectively @pytest.mark.parametrize("mm_data", [ { - "image": torch.rand((5)) + "image": torch.rand(5) }, { "image": torch.rand((5, 5, 5, 5, 5)) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index 5c90e7f7a267c..7e2bea130583e 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -17,7 +17,7 @@ # Fixtures lazy import to avoid initializing CUDA during test collection -# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple +# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple # input mappers. @pytest.fixture() def image_input_mapper_for_qwen2_vl(): @@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, # NOTE: video value is required, but isn't actually used # when making the dummy data except for error handling currently - seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, { - "image": 1, - "video": 0 - }, **mm_processor_kwargs) + dummy_data = dummy_data_for_qwen2_vl( + ctx=qwen2_vl_context, + seq_len=seq_len, + mm_counts={ + "image": 1, + "video": 0 + }, + **mm_processor_kwargs, + ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data # Ensure we have the right number of placeholders for min/max pixel values assert seq_data.get_token_ids().count(image_token_id) == token_count diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_awq.py similarity index 90% rename from tests/models/decoder_only/vision_language/test_internvl.py rename to tests/models/decoder_only/vision_language/test_awq.py index 2fd1ac4bb08f7..6e6e5b40d6a35 100644 --- a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type +from typing import List, Optional, Type import pytest import torch @@ -19,7 +19,8 @@ def run_awq_test( vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, - models: Tuple[str, str], + source_model: str, + quant_model: str, *, size_factors: List[float], dtype: str, @@ -28,8 +29,6 @@ def run_awq_test( tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ): - source_model, quant_model = models - images = [asset.pil_image for asset in image_assets] inputs_per_image = [( @@ -84,8 +83,11 @@ def run_awq_test( ) +@pytest.mark.quant_model @pytest.mark.parametrize( - "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) + ("source_model", "quant_model"), + [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")], +) @pytest.mark.parametrize( "size_factors", [ @@ 
-103,12 +105,13 @@ def run_awq_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @torch.inference_mode() -def test_awq_models(vllm_runner, image_assets, models, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: +def test_awq_models(vllm_runner, image_assets, source_model, quant_model, + size_factors, dtype, max_tokens, num_logprobs) -> None: run_awq_test( vllm_runner, image_assets, - models, + source_model, + quant_model, size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py new file mode 100644 index 0000000000000..45a7365204403 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -0,0 +1,129 @@ +from typing import Optional, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoConfig + +# Import the functions to test +from vllm.model_executor.models.h2ovl import (calculate_num_blocks, + image_to_pixel_values_wrapper) +from vllm.multimodal.utils import rescale_image_size + +models = [ + "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names + "h2oai/h2ovl-mississippi-2b", +] + + +def run_preprocessing_test( + image: Image, + config, + max_dynamic_patch: Optional[int] = None, +) -> Tuple[torch.Tensor, int]: + """Test the image preprocessing and calculate expected blocks.""" + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + + width, height = image.size + use_MSAC = config.use_msac + + # Create the mapper function with the provided configuration + mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) + pixel_values = mapper(image) + + # Calculate the expected number of blocks + if use_MSAC: + # First pass + blocks1, _, _, aspect_ratio = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + prior_aspect_ratio=None, + ) + + # Second pass + blocks2, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=aspect_ratio, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_blocks = total_blocks + + else: + blocks, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=None, + ) + expected_blocks = blocks + + if config.use_thumbnail and expected_blocks > 1: + expected_blocks += 1 + + return pixel_values, expected_blocks + + +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) +def test_image_preprocessing(image_assets, model_name, size_factors, + max_dynamic_patch): + """Test image preprocessing pipeline with different configurations.""" + # Load the configuration from the model + config = AutoConfig.from_pretrained(model_name, 
trust_remote_code=True) + + for asset in image_assets: + image = asset.pil_image + for factor in size_factors: + scaled_image = rescale_image_size(image, factor) + + # Test preprocessing and get expected number of blocks + pixel_values, expected_blocks = run_preprocessing_test( + scaled_image, config, max_dynamic_patch) + + # Verify output shapes and properties + actual_blocks = pixel_values.shape[0] + assert actual_blocks == expected_blocks, ( + f"Expected {expected_blocks} blocks, got {actual_blocks}") + + # Check image dimensions + expected_size = ( + 3, # Number of channels (C, H, W) + config.vision_config.image_size, + config.vision_config.image_size, + ) + for img in pixel_values: + assert img.shape == expected_size, ( + f"Expected image size {expected_size}, got {img.shape}") diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 98f313eb9b9af..32fcb0bbc42f1 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -11,21 +11,17 @@ # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] -models = [ - snapshot_download("OpenGVLab/InternViT-300M-448px", - allow_patterns=DOWNLOAD_PATTERN), - snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5", - allow_patterns=DOWNLOAD_PATTERN), -] def run_intern_vit_test( image_assets: _ImageAssets, - model: str, + model_id: str, *, dtype: str, distributed_executor_backend: Optional[str] = None, ): + model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) + img_processor = CLIPImageProcessor.from_pretrained(model) images = [asset.pil_image for asset in image_assets] pixel_values = [ @@ -67,12 +63,15 @@ def run_intern_vit_test( assert cos_similar(vllm_output, hf_output).mean() > 0.99 -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", [ + "OpenGVLab/InternViT-300M-448px", + "OpenGVLab/InternViT-6B-448px-V1-5", +]) @pytest.mark.parametrize("dtype", [torch.half]) @torch.inference_mode() -def test_models(dist_init, image_assets, model, dtype: str) -> None: +def test_models(dist_init, image_assets, model_id, dtype: str) -> None: run_intern_vit_test( image_assets, - model, + model_id, dtype=dtype, ) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 9370527e3cd57..3f6d8ef42cd5f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -34,6 +34,7 @@ "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, + "model_kwargs": {"device_map": "auto"}, "image_size_factors": [(.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", @@ -75,6 +76,64 @@ # this is a good idea for checking your command first, since tests are slow. 
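# Illustrative sketch only (not part of this patch): the VLMTestInfo entries
# below pair a `prompt_formatter` (model-specific chat template) with an
# `img_idx_to_prompt` callable (model-specific image placeholder). A rough,
# assumed composition of the two into a single-image prompt; the helper name
# and the question text here are made up for illustration:
def _example_single_image_prompt(prompt_formatter, img_idx_to_prompt,
                                 question: str) -> str:
    # Placeholder first, then the question, wrapped in the chat template.
    return prompt_formatter(img_idx_to_prompt(1) + question)

# e.g. with the "qwen2_vl" entry's callables this yields
# "<|im_start|>User\n<|vision_start|><|image_pad|><|vision_end|>What is shown?<|im_end|>\n<|im_start|>assistant\n"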
VLM_TEST_SETTINGS = { + #### Core tests to always run in the CI + "llava": VLMTestInfo( + models=["llava-hf/llava-1.5-7b-hf"], + test_type=( + VLMTestType.EMBEDDING, + VLMTestType.IMAGE, + VLMTestType.CUSTOM_INPUTS + ), + prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", + convert_assets_to_embeddings=model_utils.get_llava_embeddings, + max_model_len=4096, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, + custom_test_opts=[CustomTestOptions( + inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( + formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" + ), + limit_mm_per_prompt={"image": 4}, + )], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + "paligemma": VLMTestInfo( + models=["google/paligemma-3b-mix-224"], + test_type=VLMTestType.IMAGE, + prompt_formatter=identity, + img_idx_to_prompt = lambda idx: "", + # Paligemma uses its own sample prompts because the default one fails + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "caption es", + "cherry_blossom": "What is in the picture?", + }), + auto_cls=AutoModelForVision2Seq, + postprocess_inputs=model_utils.get_key_type_post_processor( + "pixel_values" + ), + vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, + dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm() + else ("half", "float")), + marks=[pytest.mark.core_model], + ), + "qwen2_vl": VLMTestInfo( + models=["Qwen/Qwen2-VL-2B-Instruct"], + test_type=( + VLMTestType.IMAGE, + VLMTestType.MULTI_IMAGE, + VLMTestType.VIDEO + ), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + #### Extended model tests "blip2": VLMTestInfo( models=["Salesforce/blip2-opt-2.7b"], test_type=VLMTestType.IMAGE, @@ -100,9 +159,9 @@ dtype="bfloat16", marks=[ pytest.mark.skipif( - transformers.__version__.startswith("4.46"), + transformers.__version__ < "4.46.2", reason="Model broken in HF, see huggingface/transformers#34379" - ) + ), ] ), "fuyu": VLMTestInfo( @@ -115,7 +174,6 @@ use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, - dtype="bfloat16" if current_platform.is_cpu() else "half", image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), "glm4": VLMTestInfo( @@ -127,8 +185,41 @@ max_num_seqs=2, dtype="bfloat16", get_stop_token_ids=lambda tok: [151329, 151336, 151338], - marks=[large_gpu_mark(min_gb=48)], patch_hf_runner=model_utils.glm_patch_hf_runner, + marks=[large_gpu_mark(min_gb=48)], + ), + "h2ovl": VLMTestInfo( + models = [ + "h2oai/h2ovl-mississippi-800m", + "h2oai/h2ovl-mississippi-2b", + ], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 + single_image_prompts=IMAGE_ASSETS.prompts({ + "stop_sign": "\nWhat's the content in the center of the image?", # noqa: E501 + "cherry_blossom": "\nWhat is the season?", + }), + multi_image_prompt="Image-1: \nImage-2: 
\nDescribe the two images in short.", # noqa: E501 + max_model_len=8192, + dtype="bfloat16", + use_tokenizer_eos=True, + patch_hf_runner=model_utils.h2ovl_patch_hf_runner, + ), + "idefics3": VLMTestInfo( + models=["HuggingFaceM4/Idefics3-8B-Llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[ + pytest.mark.skipif( + transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0" + ), + large_gpu_mark(min_gb=48), + ], ), "intern_vl": VLMTestInfo( models=[ @@ -151,25 +242,6 @@ use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, ), - "llava": VLMTestInfo( - models=["llava-hf/llava-1.5-7b-hf"], - test_type=( - VLMTestType.EMBEDDING, - VLMTestType.IMAGE, - VLMTestType.CUSTOM_INPUTS - ), - prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", - convert_assets_to_embeddings=model_utils.get_llava_embeddings, - max_model_len=4096, - auto_cls=AutoModelForVision2Seq, - vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, - custom_test_opts=[CustomTestOptions( - inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs( - formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:" - ), - limit_mm_per_prompt={"image": 4}, - )], - ), "llava_next": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), @@ -190,7 +262,6 @@ models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - dtype="half", num_video_frames=16, max_model_len=16384, postprocess_inputs=model_utils.get_key_type_post_processor( @@ -200,15 +271,14 @@ vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, # Llava-one-vision tests fixed sizes & the default size factors image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - runner_mm_key="videos", custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 ), limit_mm_per_prompt={"video": 4}, + runner_mm_key="videos", )], ), - # FIXME "llava_next_video": VLMTestInfo( models=["llava-hf/LLaVA-NeXT-Video-7B-hf"], test_type=VLMTestType.VIDEO, @@ -218,9 +288,11 @@ auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], - runner_mm_key="videos", marks=[ - pytest.mark.skip(reason="LLava next video tests currently fail.") + pytest.mark.skipif( + transformers.__version__ < "4.46.2", + reason="Model broken with changes in transformers 4.46" + ) ], ), "minicpmv": VLMTestInfo( @@ -234,23 +306,6 @@ postprocess_inputs=model_utils.wrap_inputs_post_processor, hf_output_post_proc=model_utils.minicmpv_trunc_hf_output, ), - "paligemma": VLMTestInfo( - models=["google/paligemma-3b-mix-224"], - test_type=VLMTestType.IMAGE, - prompt_formatter=identity, - img_idx_to_prompt = lambda idx: "", - # Paligemma uses its own sample prompts because the default one fails - single_image_prompts=IMAGE_ASSETS.prompts({ - "stop_sign": "caption es", - "cherry_blossom": "What is in the picture?", - }), - 
auto_cls=AutoModelForVision2Seq, - postprocess_inputs=model_utils.get_key_type_post_processor( - "pixel_values" - ), - vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, - dtype="half" if current_platform.is_rocm() else ("half", "float"), - ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. # https://github.com/huggingface/transformers/issues/34307 @@ -268,6 +323,16 @@ # vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output, # num_logprobs=10, # ), + "pixtral_hf": VLMTestInfo( + models=["nm-testing/pixtral-12b-FP8-dynamic"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"[INST]{img_prompt}[/INST]", + img_idx_to_prompt=lambda idx: "[IMG]", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[large_gpu_mark(min_gb=48)], + ), "qwen": VLMTestInfo( models=["Qwen/Qwen-VL"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), @@ -297,7 +362,7 @@ reason="Need at least 2 GPUs to run the test.", ), pytest.mark.skipif( - transformers.__version__.startswith("4.46"), + transformers.__version__ < "4.46.2", reason="Model broken in HF, see huggingface/transformers#34379" ) ], @@ -339,7 +404,6 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, - dtype="bfloat16" if current_platform.is_cpu() else "half", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, custom_test_opts=[ @@ -354,7 +418,6 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - dtype="half", postprocess_inputs=model_utils.get_key_type_post_processor( "pixel_values" ), diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index b9c20ddb2d746..82eae0705c9ba 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" # ROCm Triton FA can run into shared memory issues with these models, # use other backends in the meantime diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..718c675b86fb4 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,428 @@ +from typing import Any, List, Optional, Tuple, Type, TypedDict, Union + +import numpy.typing as npt +import pytest +import torch +from PIL import Image + +from vllm.entrypoints.llm import LLM +from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, + sample_frames_from_video) + +from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, + PromptVideoInput, VllmRunner) +from ...utils import check_logprobs_close + +models = ["Qwen/Qwen2-VL-2B-Instruct"] +target_dtype = "half" + +IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" +VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" + + +def qwen2_vl_chat_template(*query): + return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + 
"stop_sign": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), +}) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ + "sample_demo_1": + qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), +}) + +MULTIIMAGE_PROMPT = qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + IMAGE_PLACEHOLDER, + "Describe these two images separately. ", + "For each image, reply with a short sentence ", + "(no more than 10 words).", +) + + +class Qwen2VLPromptImageEmbeddingInput(TypedDict): + image_embeds: torch.Tensor + image_grid_thw: torch.Tensor + + +class Qwen2VLPromptVideoEmbeddingInput(TypedDict): + video_embeds: torch.Tensor + video_grid_thw: torch.Tensor + + +def batch_make_image_embeddings( + image_batches: List[Union[Image.Image, List[Image.Image]]], processor, + llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + """batched image embeddings for Qwen2-VL + + This will infer all images' embeddings in a single batch, + and split the result according to input batches. + + image_batches: + - Single-image batches: `List[Image.Image]` + - Multiple-image batches: `List[List[Image.Image]]]` + + returns: `List[Qwen2VLPromptImageEmbeddingInput]` + """ + + image_batches_: List[Any] = image_batches[:] + + # convert single-image batches to multiple-image batches + for idx in range(len(image_batches_)): + if not isinstance(image_batches_[idx], list): + image_batches_[idx] = [image_batches_[idx]] + + assert isinstance(image_batches_[idx], list) + + # append all images into a list (as a batch) + images: List[Image.Image] = [] + for image_batch in image_batches_: + images += image_batch + + # image to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=images, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values"] + image_grid_thw = preprocess_result["image_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker. 
\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + image_embeds = visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptImageEmbeddingInput] = [] + image_counter = 0 + embed_counter = 0 + for image_batch in image_batches_: + cur_batch_image_count = len(image_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in image_grid_thw[image_counter:image_counter + + cur_batch_image_count] + ]) + + result.append({ + "image_embeds": + image_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "image_grid_thw": + image_grid_thw[image_counter:image_counter + + cur_batch_image_count], + }) + + embed_counter += cur_batch_embed_len + image_counter += cur_batch_image_count + + # ensure we don't lost any images or embeddings + assert embed_counter == image_embeds.size(0) + assert image_counter == image_grid_thw.size(0) + assert len(image_batches) == len(result) + + return result + + +def batch_make_video_embeddings( + video_batches: PromptVideoInput, processor, + llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + """batched video embeddings for Qwen2-VL + + A NDArray represents a single video's all frames. + + This will infer all videos' embeddings in a single batch, + and split the result according to input batches. + + video_batches: + - Single-video batches: `List[NDArray]` + - Multiple-video batches: `List[List[NDArray]]` + """ + + video_batches_: List[Any] = video_batches[:] + + for idx in range(len(video_batches_)): + if not isinstance(video_batches_[idx], list): + single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + video_batches_[idx] = single_video_batch + + assert isinstance(video_batches_[idx], list) + + # append all videos into a list (as a batch) + videos: List[npt.NDArray] = [] + for video_batch in video_batches_: + videos += video_batch + + # video to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=None, videos=videos, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values_videos"] + video_grid_thw = preprocess_result["video_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker.\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + video_embeds = visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + video_counter = 0 + embed_counter = 0 + for video_batch in video_batches_: + cur_batch_video_count = len(video_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in video_grid_thw[video_counter:video_counter + + cur_batch_video_count] + ]) + + result.append({ + "video_embeds": + video_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "video_grid_thw": + video_grid_thw[video_counter:video_counter + + cur_batch_video_count], + }) + + embed_counter += cur_batch_embed_len + video_counter += cur_batch_video_count + + # ensure we don't lost any videos or 
embeddings + assert embed_counter == video_embeds.size(0) + assert video_counter == video_grid_thw.size(0) + assert len(video_batches) == len(result) + + return result + + +def run_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between + original image/video input and image/video embeddings input. + """ + from transformers import AutoProcessor # noqa: F401 + + processor = AutoProcessor.from_pretrained(model) + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case_for_original_input = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + outputs_per_case_for_embeddings_input = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=batch_make_image_embeddings( + images, processor, vllm_model.model) if images else None, + videos=batch_make_video_embeddings( + videos, processor, vllm_model.model) if videos else None) + for prompts, images, videos in inputs + ] + + for outputs_for_original_input, \ + outputs_for_embeddings_input \ + in zip(outputs_per_case_for_original_input, + outputs_per_case_for_embeddings_input): + check_logprobs_close( + outputs_0_lst=outputs_for_original_input, + outputs_1_lst=outputs_for_embeddings_input, + name_0="original_input", + name_1="embeddings_input", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + [], + ) for image, prompt in zip(images, IMAGE_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + [], + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, + model, size_factors, + dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images 
= [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [( + [MULTIIMAGE_PROMPT for _ in size_factors], + [[ + rescale_image_size(image, factor) + for image in images + ] for factor in size_factors], + [], + )] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.25, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + num_frames = 4 + sampled_vids = [ + sample_frames_from_video(asset.np_ndarrays, num_frames) + for asset in video_assets + ] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [], + [rescale_video_size(video, factor) for factor in size_factors], + ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 6856e8df81a13..849857b4232e7 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -56,6 +56,17 @@ def qwen_vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +def qwen2_vllm_to_hf_output( + vllm_output: RunnerOutput, + model: str) -> Tuple[List[int], str, Optional[SampleLogprobs]]: + """Sanitize vllm output [qwen2 models] to be comparable with hf output.""" + output_ids, output_str, out_logprobs = vllm_output + + hf_output_str = output_str + "<|im_end|>" + + return output_ids, hf_output_str, out_logprobs + + def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput: config = AutoConfig.from_pretrained(model) @@ -248,6 +259,66 @@ def processor(*args, text="", images=None, **kwargs): return hf_model +def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for H2OVL.""" + + class H2OVLProcessor: + """A simple processor for H2OVL models.""" + + def __init__(self, hf_runner: HfRunner): + self.num_image_token = hf_runner.model.num_image_token + self.tokenizer = hf_runner.tokenizer + self.dtype = hf_runner.model.dtype + + self.config = AutoConfig.from_pretrained(hf_runner.model_name, + trust_remote_code=True) + self.vision_config = self.config.vision_config + self.use_thumbnail = self.config.use_thumbnail + self.min_num = self.config.min_dynamic_patch + self.max_num = self.config.max_dynamic_patch + self.image_size = self.vision_config.image_size + + def __call__(self, text: str, images: Union[Image, List[Image]], + **kwargs): + # yapf: disable + from vllm.model_executor.models.h2ovl import ( + IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) + + # yapf: enable + images = [images] if isinstance(images, Image) else images + 
pixel_values = [ + image_to_pixel_values(image, + self.image_size, + self.min_num, + self.max_num, + self.use_thumbnail, + use_MSAC=self.config.use_msac).to( + self.dtype) for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) + for num_patches in num_patches_list: + context_tokens = IMG_CONTEXT * self.num_image_token \ + * num_patches + image_tokens = IMG_START + context_tokens + IMG_END + text = text.replace('', image_tokens, 1) + prompt = self.tokenizer(text, return_tensors="pt") + prompt.update({"pixel_values": pixel_values}) + return prompt + + img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( + "") + hf_model.model.img_context_token_id = img_context_token_id + hf_model.processor = H2OVLProcessor(hf_model) + hf_model.model.get_output_embeddings = lambda: \ + hf_model.model.language_model.get_output_embeddings() + hf_model.model.generate = types.MethodType(_internvl_generate, + hf_model.model) + return hf_model + + def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner: """Patches and returns an instance of the HfRunner to use for InternVL.""" diff --git a/tests/models/decoder_only/vision_language/vlm_utils/runners.py b/tests/models/decoder_only/vision_language/vlm_utils/runners.py index 5a3f9e820dad0..2d3b39fe3594e 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/runners.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/runners.py @@ -29,6 +29,7 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"image": 1}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -51,6 +52,7 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"image": len(image_assets)}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -74,6 +76,7 @@ def run_embedding_test(*, model_test_info: VLMTestInfo, limit_mm_per_prompt={"image": 1}, vllm_embeddings=vllm_embeddings, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="images", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -101,6 +104,7 @@ def run_video_test( num_logprobs=test_case.num_logprobs, limit_mm_per_prompt={"video": len(video_assets)}, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key="videos", **model_test_info.get_non_parametrized_runner_kwargs()) @@ -115,7 +119,11 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, inputs = test_case.custom_test_opts.inputs limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt - assert inputs is not None and limit_mm_per_prompt is not None + runner_mm_key = test_case.custom_test_opts.runner_mm_key + # Inputs, limit_mm_per_prompt, and runner_mm_key should all be set + assert inputs is not None + assert limit_mm_per_prompt is not None + assert runner_mm_key is not None core.run_test( hf_runner=hf_runner, @@ -127,4 +135,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo, num_logprobs=test_case.num_logprobs, limit_mm_per_prompt=limit_mm_per_prompt, distributed_executor_backend=test_case.distributed_executor_backend, + runner_mm_key=runner_mm_key, 
**model_test_info.get_non_parametrized_runner_kwargs()) diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index 4d18d53af30fa..8459476dc2d07 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -52,6 +52,8 @@ class SizeType(Enum): class CustomTestOptions(NamedTuple): inputs: List[Tuple[List[str], List[Union[List[Image], Image]]]] limit_mm_per_prompt: Dict[str, int] + # kwarg to pass multimodal data in as to vllm/hf runner instances. + runner_mm_key: str = "images" class ImageSizeWrapper(NamedTuple): @@ -141,9 +143,6 @@ class VLMTestInfo(NamedTuple): Callable[[PosixPath, str, Union[List[ImageAsset], _ImageAssets]], str]] = None # noqa: E501 - # kwarg to pass multimodal data in as to vllm/hf runner instances - runner_mm_key: str = "images" - # Allows configuring a test to run with custom inputs custom_test_opts: Optional[List[CustomTestOptions]] = None @@ -159,6 +158,7 @@ def get_non_parametrized_runner_kwargs(self): "max_model_len": self.max_model_len, "max_num_seqs": self.max_num_seqs, "task": self.task, + "tensor_parallel_size": self.tensor_parallel_size, "hf_output_post_proc": self.hf_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc, "auto_cls": self.auto_cls, @@ -168,7 +168,6 @@ def get_non_parametrized_runner_kwargs(self): "get_stop_token_ids": self.get_stop_token_ids, "model_kwargs": self.model_kwargs, "patch_hf_runner": self.patch_hf_runner, - "runner_mm_key": self.runner_mm_key, } diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 39b6bbaf43180..cd920aec6502e 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,6 +4,8 @@ """ import pytest +from vllm.utils import current_platform + from ..utils import check_embeddings_close # Model, Guard @@ -21,15 +23,14 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models( - monkeypatch, hf_runner, vllm_runner, example_prompts, model, dtype: str, ) -> None: - if model in ENCODER_ONLY: - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + if model not in ENCODER_ONLY and current_platform.is_cpu(): + pytest.skip("Skip large embedding models test on CPU.") # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py new file mode 100644 index 0000000000000..3dd8cb729f8a6 --- /dev/null +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -0,0 +1,209 @@ +from functools import partial +from typing import Callable, Dict, List, Type + +import pytest +import torch +from PIL import Image +from transformers import BatchEncoding, Qwen2VLForConditionalGeneration + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ..utils import check_embeddings_close + +HF_TEXT_PROMPTS = [ + # T -> X + ( + "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501, + Image.new("RGB", (56, 56))), + # T -> X + ("Query: Retrieve an image of this caption: cherry blossom", + Image.new("RGB", (56, 56))), +] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ 
+ "stop_sign": + "What is shown in this image?", + "cherry_blossom": + "What is shown in this image?" +}) + +MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"] + + +def get_messages(image: Image.Image, text: str, embed_text: bool): + # assert False, 'remember to use outer [] as required' + if embed_text: + messages = [{ + "role": + "user", + "content": [ + { + "type": "image", + "image": Image.new("RGB", (56, 56)), + "resized_height": 1, + "resized_width": 1 + }, # need a dummy image here for an easier process. + { + "type": "text", + "text": text + }, + ] + }] + else: + messages = [{ + "role": + "user", + "content": [{ + "type": "image", + "image": image + }, { + "type": "text", + "text": text + }] + }] + return messages + + +def apply_chat_template_and_add_eos( + messages: List[Dict], + apply_chat_template_fn: Callable, +): + prompt = apply_chat_template_fn( + messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>" + return prompt + + +def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): + return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs) + + +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, + embed_texts: List[bool], + model: str, + *, + dtype: str, +) -> None: + '''SET PYTHONPATH''' + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner(model, + task="embedding", + dtype=dtype, + enforce_eager=True, + max_model_len=8192) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + texts = [ + # this is necessary because vllm_model.encode will not apply any + # templating to the prompt, and therefore lacks an image_pad + # token unless one is inserted beforehand (the (28,28) image + # above is converted to an image pad token by the chat template). + apply_chat_template_and_add_eos( + get_messages(image, text, False), + apply_chat_template_fn=tokenizer.apply_chat_template, + ) for text, image in zip(input_texts, input_images) + # vllm will replace the pad token with the actual image, + # which may be a placeholder image, later. 
+ ] + vllm_outputs = vllm_model.encode(texts, images=input_images) + + hf_outputs = [] + with hf_runner(model, + dtype=dtype, + auto_cls=Qwen2VLForConditionalGeneration) as hf_model: + hf_model.postprocess_inputs = partial( + postprocess_inputs, + hf_model, + cache_position=torch.arange( + 0, + 1, # 1 for batch size + requires_grad=False), + use_cache=False) + for text, image, embed_text in zip(input_texts, input_images, + embed_texts): + # dse requires non-standard input processing + # because it needs an image_pad token + messages = get_messages(image, text, embed_text) + prompt = apply_chat_template_and_add_eos( + messages, hf_model.processor.apply_chat_template) + inputs = hf_model.get_inputs( + prompts=[[prompt]], + images=[[image]], + ) + with torch.no_grad(): + outputs = hf_model.model( + **hf_model.wrap_device(inputs[0], + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = torch.nn.functional.normalize( + outputs.hidden_states[-1][0, -1], p=2, dim=-1) + hf_outputs.append(pooled_output.tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, image_placeholder) + for text, image_placeholder in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [True] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + embed_texts, + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [False] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + embed_texts, + model, + dtype=dtype, + ) diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index a8d0ac4fc160d..9fab5898a06ba 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -2,6 +2,7 @@ import pytest import torch.nn.functional as F +import transformers from transformers import AutoModelForVision2Seq from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner @@ -85,8 +86,8 @@ def _run_test( ) -# FIXME -@pytest.mark.skip(reason="LLava next embedding tests currently fail") +@pytest.mark.skipif(transformers.__version__.startswith("4.46"), + reason="Model broken with changes in transformers 4.46") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py index 483773f069133..d686f1da3fa17 100644 --- a/tests/models/encoder_decoder/vision_language/test_florence2.py +++ 
b/tests/models/encoder_decoder/vision_language/test_florence2.py @@ -85,7 +85,7 @@ def run_test( @pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models(hf_runner, vllm_runner, model, dtype, max_tokens, diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 7f82347841cdb..a3b1c0950d9a2 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -4,6 +4,8 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs @@ -14,6 +16,8 @@ _LIMIT_IMAGE_PER_PROMPT = 3 +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is", @@ -221,6 +225,13 @@ def process(hf_inputs: BatchEncoding, **kwargs): ) +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @large_gpu_test(min_gb=48) @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( @@ -244,20 +255,26 @@ def process(hf_inputs: BatchEncoding, **kwargs): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, - num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + num_logprobs, + attn_backend: _Backend) -> None: + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + sizes=sizes, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -265,9 +282,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: + model, dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -291,17 +309,20 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, cherry_blossom.resize((512, 1024)), ], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - 
tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -309,8 +330,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs) -> None: + dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -325,14 +348,17 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, [stop_sign], [stop_sign, cherry_blossom], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/models/utils.py b/tests/models/utils.py index f7802d98ad678..0eb3f61f1f047 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -5,7 +5,6 @@ from vllm.config import ModelConfig, TaskOption from vllm.inputs import InputContext -from vllm.platforms import current_platform from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs TokensText = Tuple[List[int], str] @@ -270,7 +269,7 @@ def build_model_context(model_name: str, if tokenizer_name is None: tokenizer_name = model_name if dtype is None: - dtype = "bfloat16" if current_platform.is_cpu() else "half" + dtype = "half" model_config = ModelConfig( model_name, diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_inputs.py similarity index 79% rename from tests/multimodal/test_base.py rename to tests/multimodal/test_inputs.py index 68d05de904ba8..678bbb52b8c2f 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_inputs.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, @@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors, assert_nested_tensors_equal(expected_item, actual_item) -def assert_multimodal_inputs_equal(expected: MultiModalInputs, - actual: MultiModalInputs): +def assert_multimodal_inputs_equal(expected: MultiModalKwargs, + actual: MultiModalKwargs): assert set(expected.keys()) == set(actual.keys()) for key in expected: assert_nested_tensors_equal(expected[key], actual[key]) @@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs, def test_multimodal_input_batch_single_tensor(): t = torch.rand([1, 2]) - result = MultiModalInputs.batch([{"image": t}]) + result = MultiModalKwargs.batch([{"image": t}]) assert_multimodal_inputs_equal(result, {"image": 
t.unsqueeze(0)}) @@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors(): a = torch.rand([1, 1, 2]) b = torch.rand([1, 1, 2]) c = torch.rand([1, 1, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) @@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors(): a = torch.rand([1, 2, 2]) b = torch.rand([1, 3, 2]) c = torch.rand([1, 4, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) @@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors(): a = torch.rand([2, 3]) b = torch.rand([2, 3]) c = torch.rand([2, 3]) - result = MultiModalInputs.batch([{ + result = MultiModalKwargs.batch([{ "image": [a] }, { "image": [b] @@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists(): a = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal( result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}) @@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists(): b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) d = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}]) assert_multimodal_inputs_equal( result, {"image": torch.stack([torch.stack([a, b]), @@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths(): b = torch.rand([1, 3, 3]) c = torch.rand([1, 4, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) - result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}]) + result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}]) assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 5044740c3e734..e6c8793989e13 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -1,12 +1,12 @@ from array import array -from typing import Mapping +from typing import Callable, Dict, Mapping, Optional from unittest.mock import patch import pytest import torch -from vllm.inputs import DecoderOnlyInputs, InputContext, token_inputs -from vllm.inputs.registry import InputRegistry +from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, + InputRegistry, ProcessorInputs, token_inputs) from vllm.multimodal import MultiModalRegistry from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -34,10 +34,9 @@ def custom_processor(ctx: InputContext, inputs: DecoderOnlyInputs, *, num_crops=DEFAULT_NUM_CROPS): - # For testing purposes, we don't worry about the llm inputs / return - # type validation, and just return the value of the kwarg that we - # clobber. 
- return num_crops + # For testing purposes, we don't worry about the prompt + return token_inputs(prompt_token_ids=[], + mm_processor_kwargs={"num_crops": num_crops}) with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", return_value=custom_processor): @@ -56,7 +55,7 @@ def custom_dummy_data_factory(self, num_crops=DEFAULT_NUM_CROPS): seq_data = SequenceData( array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * num_crops)) - return seq_data, None + return DummyData(seq_data, None) with patch( "vllm.inputs.registry.InputRegistry._default_dummy_data_factory", @@ -109,6 +108,21 @@ def _get_num_crops_info(init_num_crops: int, inference_num_crops: int): return init_kwargs, inference_kwargs, expected_seq_count +def _get_processed_num_crops( + processor: Callable[[ProcessorInputs], ProcessorInputs], + inference_kwargs: Optional[Dict[str, int]], +) -> int: + processed_inputs = processor( + token_inputs(prompt_token_ids=[], + prompt="", + mm_processor_kwargs=inference_kwargs)) + + assert "type" in processed_inputs + assert processed_inputs["type"] == "token" + assert "mm_processor_kwargs" in processed_inputs + return processed_inputs["mm_processor_kwargs"]["num_crops"] + + @pytest.mark.parametrize("init_num_crops,inference_num_crops", [ (None, None), (NUM_CROPS_OVERRIDE, None), @@ -124,10 +138,8 @@ def test_input_processor_kwargs(use_processor_mock, init_num_crops, ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) processor = dummy_registry.create_input_processor(ctx.model_config) - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=inference_kwargs)) + num_crops_val = _get_processed_num_crops(processor, inference_kwargs) + assert num_crops_val == expected_seq_count @@ -153,10 +165,7 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, processor = dummy_registry.create_input_processor(ctx.model_config) # Should filter out the inference time kwargs - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=mm_processor_kwargs)) + num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs) assert num_crops_val == DEFAULT_NUM_CROPS @@ -177,9 +186,9 @@ def test_dummy_data_kwarg_overrides(use_dummy_data_mock, num_crops): # NOTE: seq_len is thrown away here since this will leverage the # default dummy data factory that we have patched in, whose seq # len is solely dependent on the value of the mm_processor_kwargs. - seq_data, _ = dummy_registry.dummy_data_for_profiling( + dummy_data = dummy_registry.dummy_data_for_profiling( ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(seq_data.prompt_token_ids) == expected_seq_count + assert len(dummy_data.seq_data.prompt_token_ids) == expected_seq_count @pytest.mark.parametrize( @@ -206,9 +215,9 @@ def test_dummy_data_with_sad_kwarg_overrides(use_dummy_data_mock, # NOTE: seq_len is thrown away here since this will leverage the # default dummy data factory that we have patched in, whose seq # len is solely dependent on the value of the mm_processor_kwargs. 
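# Illustrative sketch only (not part of this patch): dummy-data factories now
# return a DummyData object instead of a bare (seq_data, mm_data) tuple, which
# is why the call sites in this patch read `dummy_data.seq_data` /
# `dummy_data.multi_modal_data`. A minimal stand-in with just the fields these
# tests touch (the real vllm.inputs.DummyData may carry more):
from typing import Any, NamedTuple, Optional

class _DummyDataSketch(NamedTuple):
    seq_data: Any
    multi_modal_data: Optional[dict] = None

# Old call sites:  seq_data, mm_data = dummy_data_for_x(...)
# New call sites:  dummy_data = dummy_data_for_x(...)
#                  seq_data = dummy_data.seq_data
#                  mm_data = dummy_data.multi_modal_data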
- seq_data, _ = dummy_registry.dummy_data_for_profiling( + dummy_data = dummy_registry.dummy_data_for_profiling( ctx.model_config, seq_len=-1, mm_registry=mm_registry) - assert len(seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS + assert len(dummy_data.seq_data.prompt_token_ids) == DEFAULT_NUM_CROPS ### Test overrides for the max token count per multimodal instance diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 38cd48629f903..9869c8123f001 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,11 +1,12 @@ import base64 import mimetypes -from tempfile import NamedTemporaryFile +import os +from tempfile import NamedTemporaryFile, TemporaryDirectory from typing import Dict, Tuple import numpy as np import pytest -from PIL import Image +from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer from vllm.multimodal.utils import (async_fetch_image, fetch_image, @@ -84,6 +85,40 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image], assert _image_equals(data_image_sync, data_image_async) +@pytest.mark.asyncio +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_fetch_image_local_files(image_url: str): + with TemporaryDirectory() as temp_dir: + origin_image = fetch_image(image_url) + origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)), + quality=100, + icc_profile=origin_image.info.get('icc_profile')) + + image_async = await async_fetch_image( + f"file://{temp_dir}/{os.path.basename(image_url)}", + allowed_local_media_path=temp_dir) + + image_sync = fetch_image( + f"file://{temp_dir}/{os.path.basename(image_url)}", + allowed_local_media_path=temp_dir) + # Check that the images are equal + assert not ImageChops.difference(image_sync, image_async).getbbox() + + with pytest.raises(ValueError): + await async_fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}", + allowed_local_media_path=temp_dir) + with pytest.raises(ValueError): + await async_fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}") + + with pytest.raises(ValueError): + fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}", + allowed_local_media_path=temp_dir) + with pytest.raises(ValueError): + fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}") + + @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"]) def test_repeat_and_pad_placeholder_tokens(model): config = AutoConfig.from_pretrained(model) @@ -92,18 +127,50 @@ def test_repeat_and_pad_placeholder_tokens(model): tokenizer = AutoTokenizer.from_pretrained(model) test_cases = [ - ("", 2, "", [32000, 32000]), - ("", 2, "", [32000, 32000, 32000]), - ("", [3, 2], "", - [32000, 32000, 32000, 32000, 32000]), - ("Image:Image:!", [3, 2], - "Image:Image:!", - [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918]), - ("", [3, 2], "", [32000, 32000, 32000]), - ] - - for prompt, repeat_count, expected_prompt, expected_token_ids in test_cases: - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + ( + "", + 2, + "", + [32000, 32000], + [{ "offset": 0, "length": 2 }], + ), + ( + "", + 2, + "", + [32000, 32000, 32000], + [{ "offset": 0, "length": 2 }]), + ( + "", + [3, 2], + "", + [32000, 32000, 32000, 32000, 32000], + [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }], + ), + ( + "Image:Image:!", + [3, 2], + "Image:Image:!", + [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918], + [{ "offset": 2, "length": 3 }, { "offset": 7, 
"length": 2 }], + ), + ( + "", + [3, 2], + "", + [32000, 32000, 32000], + [{ "offset": 0, "length": 3 }], + ), + ] # yapf: disable + + for ( + prompt, + repeat_count, + expected_prompt, + expected_token_ids, + expected_ranges, + ) in test_cases: + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer=tokenizer, prompt=prompt, prompt_token_ids=tokenizer.encode(prompt, @@ -113,3 +180,4 @@ def test_repeat_and_pad_placeholder_tokens(model): ) assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids + assert ranges == expected_ranges diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index 366b030eaa399..50723dbb610ac 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -5,6 +5,7 @@ import pytest from tests.kernels.utils import override_backend_env_variable +from vllm import SamplingParams, TokensPrompt from ..models.utils import check_outputs_equal @@ -12,12 +13,21 @@ "facebook/opt-125m", ] +UNSTABLE_PROMPT_SEQUENCE = [ + ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([3] * 1), + ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([5] * 50), + ([0] * 588) + ([1] * 1332) + ([2] * 30) + ([6] * 95), + ([0] * 588) + ([1] * 1332) + ([4] * 3) + ([7] * 174), + ([0] * 588) + ([8] * 1539), +] + @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("cached_position", [0, 1]) +@pytest.mark.parametrize("block_size", [16]) def test_mixed_requests( hf_runner, vllm_runner, @@ -27,11 +37,12 @@ def test_mixed_requests( dtype: str, max_tokens: int, cached_position: int, + block_size: int, monkeypatch, ) -> None: """ Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where + and the others don't. The cached position determines where the sequence is at among the batch of prefills. 
""" override_backend_env_variable(monkeypatch, backend) @@ -44,12 +55,30 @@ def test_mixed_requests( model, dtype=dtype, enable_prefix_caching=True, + block_size=block_size, ) as vllm_model: # Run the first prompt so the cache is populated vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) # Run all the promopts - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert req_outputs[ + i].num_cached_tokens == expected_num_cached_tokens + + vllm_outputs = [ + (output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text) for output in req_outputs + ] check_outputs_equal( outputs_0_lst=hf_outputs, @@ -57,3 +86,22 @@ def test_mixed_requests( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) +def test_unstable_prompt_sequence( + vllm_runner, + backend: str, + monkeypatch, +) -> None: + override_backend_env_variable(monkeypatch, backend) + + with vllm_runner( + "Qwen/Qwen2.5-0.5B-Instruct", + enable_chunked_prefill=True, + enable_prefix_caching=True, + max_model_len=4096, + ) as vllm_model: + for prompt in UNSTABLE_PROMPT_SEQUENCE: + vllm_model.generate(TokensPrompt(prompt_token_ids=prompt), + SamplingParams(max_tokens=1)) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 0f01f5f819ea4..569fc8dfb6a21 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -9,7 +9,7 @@ import torch from tests.quantization.utils import is_quant_method_supported -from tests.utils import fork_new_process_for_each_test +from tests.utils import compare_two_settings, fork_new_process_for_each_test models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), @@ -82,6 +82,34 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, vllm_tp_size=2) +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason='Test requires at least 2 GPUs.') +@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), + reason='bitsandbytes is not supported on this GPU type.') +@pytest.mark.parametrize("model_name, description", models_4bit_to_test) +@fork_new_process_for_each_test +def test_load_pp_4bit_bnb_model(model_name, description) -> None: + common_args = [ + "--disable-log-stats", + "--disable-log-requests", + "--dtype", + "bfloat16", + "--enable-prefix-caching", + "--quantization", + "bitsandbytes", + "--load-format", + "bitsandbytes", + "--gpu-memory-utilization", + "0.7", + ] + pp_args = [ + *common_args, + "--pipeline-parallel-size", + "2", + ] + compare_two_settings(model_name, common_args, pp_args) + + def log_generated_texts(prompts, outputs, runner_name): logged_texts = [] for i, (_, generated_text) in enumerate(outputs): diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index a8deab3718be1..f5497976faf7a 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -413,12 +413,10 @@ def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): def 
generate_probs_for_test( self, draft_and_target_probs_equal: bool ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = [ - F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2) - ] + draft_probs, target_probs = (F.softmax( + torch.rand(self.vocab_size, dtype=torch.float32), + dim=-1, + ) for _ in range(2)) num_reference_probs = 100 reference_probs = F.softmax( diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 629074188a6c1..a3f0464e79675 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -5,40 +5,6 @@ from .conftest import get_output_from_llm_generator -@pytest.mark.parametrize("common_llm_kwargs", [{ - "model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, -}]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "enable_chunked_prefill": True, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_chunked_prefill(test_llm_generator): - """Verify that speculative decoding with chunked prefill fails. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, - match="Speculative decoding and chunked prefill"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) - - @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "meta-llama/Llama-2-7b-chat-hf", "speculative_model": "JackFram/llama-68m", @@ -84,3 +50,49 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): with pytest.raises(ValueError, match="cannot be larger than"): get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) + + +@pytest.mark.parametrize("common_llm_kwargs", + [{ + "model": "meta-llama/Llama-2-7b-chat-hf", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": "True", + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "tensor_parallel_size": 2, + "speculative_draft_tensor_parallel_size": 2, + }, + { + "tensor_parallel_size": 4, + "speculative_draft_tensor_parallel_size": 4, + }, + { + "tensor_parallel_size": 8, + "speculative_draft_tensor_parallel_size": 8, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_chunked_prefill_draft_model_tp_not_one( + test_llm_generator): + """Verify that speculative decoding fails if chunked prefill is enabled for + draft model with tensor parallelism of more than 1. 
+ """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(ValueError, match="with tensor parallel size 1"): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 5f240d42d9e09..a13cca41f99e5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -62,6 +62,16 @@ { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + # Chunked prefill enabled with small value + # to make sure we get mixed batches. + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, { # Verify the detokenizer assertions in the test work when spec @@ -141,6 +151,14 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, }, ]) @pytest.mark.parametrize( @@ -204,6 +222,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -255,6 +281,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("max_output_len", [ @@ -300,6 +334,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [1]) @@ -347,6 +389,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [32]) @@ -397,6 +447,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -454,6 +512,14 @@ def 
test_spec_decode_e2e_greedy_correctness_with_preemption( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [2]) @@ -503,6 +569,15 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, # Artificially limit the draft model max model len; this forces vLLM # to skip speculation once the sequences grow beyond 32-k tokens. "speculative_max_model_len": 32, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "speculative_max_model_len": 32, }, ]) @pytest.mark.parametrize("batch_size", [8]) @@ -551,6 +626,15 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "speculative_disable_by_batch_size": 2, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_disable_by_batch_size": 2, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, }, ]) @pytest.mark.parametrize("batch_size", [8]) @@ -590,10 +674,17 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, + "enable_chunked_prefill": False, } # Try a range of common k, as well as large speculation. for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] - ]) + ] + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + } for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize( "output_len", @@ -636,11 +727,19 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, - "spec_decoding_acceptance_method": "typical_acceptance_sampler" + "spec_decoding_acceptance_method": "typical_acceptance_sampler", + "enable_chunked_prefill": False } # Try a range of common k. 
for k in [1, 2, 3] - ]) + ] + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + "spec_decoding_acceptance_method": "typical_acceptance_sampler", + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 + } for k in [1, 2, 3]]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize( "output_len", diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 31bedad480283..e53d169a8fcc3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -50,18 +50,33 @@ "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, ]) @pytest.mark.parametrize("output_len", [ 256, ]) @pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" + if prefill_chunk_size > 0: + common_llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + common_llm_kwargs["enable_chunked_prefill"] = False run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -151,6 +166,16 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + "enable_chunked_prefill": True, + "speculative_disable_mqa_scorer": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -251,6 +276,15 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs, "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, "speculative_disable_by_batch_size": 4 + }, { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + "speculative_disable_by_batch_size": 4, + "enable_chunked_prefill": True, + "speculative_disable_mqa_scorer": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }]) @pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize( diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py index e6f7f480eebb2..0b5d82b6610ca 100644 --- a/tests/spec_decode/test_multi_step_worker.py +++ b/tests/spec_decode/test_multi_step_worker.py @@ -5,6 +5,8 @@ import pytest import torch +from vllm.attention.selector import (_Backend, + global_force_attn_backend_context_manager) from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, @@ -303,6 +305,7 @@ def test_multi_step_with_batch_expansion_correct_output(): seed, model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -397,6 +400,7 @@ def test_multi_step_with_batch_expansion_incorrect_output(): seed, 
model_runner_cls=TP1DraftModelRunner, ) + multi_step_worker.set_include_gpu_probs_tensor() worker = create_worker( Worker, model_name, @@ -477,6 +481,109 @@ def test_multi_step_with_batch_expansion_incorrect_output(): assert (num_mismatch > 0) +@torch.inference_mode() +@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) +# The choice of backends forces the multi_step_worker to choose between +# the vanilla model_runner and TP1DraftModelRunner and that we can test +# both code paths. +@pytest.mark.parametrize('attn_backend', + [_Backend.XFORMERS, _Backend.FLASH_ATTN]) +def test_multi_step_correct_kvcache(num_steps, attn_backend): + """Verify that the KV cache of the draft model + is correctly updated for sequences with bonus token. + """ + seed = 100 + model_name = "JackFram/llama-68m" + + block_size = 16 + num_gpu_blocks = 2048 // block_size + batch_size = 1 + + with global_force_attn_backend_context_manager(attn_backend): + dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' + multi_step_worker = create_worker(MultiStepWorker, + model_name, + block_size, + num_gpu_blocks, + seed, + model_runner_cls=TP1DraftModelRunner, + dtype=dtype) + multi_step_worker.set_include_gpu_probs_tensor() + worker = create_worker(Worker, + model_name, + block_size, + num_gpu_blocks, + seed, + dtype=dtype) + + prompts = [[0] for _ in range(batch_size)] + # Already generate two tokens for the sequence + # so that we can simulate the bonus token case + multi_step_continuations = [[ + random.randint(0, 1000), + random.randint(0, 1000) + ] for _ in prompts] + final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] + + seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + # Run multi-step. + zero_kv_cache(multi_step_worker.cache_engine) + multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list), + sample_len=num_steps, + seq_ids_with_bonus_token_in_last_step= + seq_ids_with_bonus_token_in_last_step) + + # Run single-step repeatedly. + zero_kv_cache(worker.cache_engine) + # Generate the kv cache for the bonus token first + single_step_continuations = [c[:1] for c in multi_step_continuations] + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=single_step_continuations, + final_prompt_lens=final_prompt_lens) + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + for _ in range(num_steps): + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, + num_gpu_blocks, + block_size, + continuations=multi_step_continuations, + final_prompt_lens=final_prompt_lens) + + single_step_output = worker.execute_model( + execute_model_req=ExecuteModelRequest( + seq_group_metadata_list=seq_group_metadata_list)) + + for i, seq_group_output in enumerate(single_step_output[-1]): + multi_step_continuations[i].append( + seq_group_output.samples[0].output_token) + + # Verify that the KV cache of the single-step and + # multi-step workers are the same. 
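For context, an illustrative aside rather than part of the diff: the assertions that follow walk the two workers' paged KV caches layer by layer and compare keys and values with a loose tolerance. A minimal standalone sketch of that comparison pattern, assuming a cache laid out as a list of per-layer (key, value) tensor pairs, could look like this.

import torch

def kv_caches_allclose(cache_a, cache_b, rtol=1e-2, atol=1e-2) -> bool:
    """Toy helper (not vLLM's API): compare per-layer (key, value) pairs."""
    assert len(cache_a) == len(cache_b), "layer count mismatch"
    for (key_a, value_a), (key_b, value_b) in zip(cache_a, cache_b):
        if not torch.allclose(key_a, key_b, rtol=rtol, atol=atol):
            return False
        if not torch.allclose(value_a, value_b, rtol=rtol, atol=atol):
            return False
    return True

# Synthetic 2-layer caches that differ only by tiny noise compare as equal.
reference = [(torch.rand(4, 16, 8), torch.rand(4, 16, 8)) for _ in range(2)]
noisy = [(k + 1e-4 * torch.randn_like(k), v + 1e-4 * torch.randn_like(v))
         for k, v in reference]
assert kv_caches_allclose(reference, noisy)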
+ single_step_gpu_cache = worker.cache_engine[0].gpu_cache + multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache + num_layers = len(single_step_gpu_cache) + allclose = lambda a, b: torch.allclose( + a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) + for i in range(num_layers): + assert allclose(single_step_gpu_cache[i][0], + multi_step_gpu_cache[i][0]) + assert allclose(single_step_gpu_cache[i][1], + multi_step_gpu_cache[i][1]) + + @torch.inference_mode() def test_draft_proposals_full_speculation_len(): """Verify Top1Proposer correctly handles case where all sequences diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 3995f87898afb..f66e957186604 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -118,7 +118,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): num_gpu_blocks, block_size, final_prompt_lens=final_prompt_lens) - + for sg in seq_group_metadata_list: + sg.is_prompt = False proposals = proposer.get_spec_proposals( execute_model_req=ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, @@ -147,7 +148,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt - For the scenario find candidate in all batchs + For the scenario find candidate in all batches """ block_size = 32 @@ -192,6 +193,10 @@ def test_ngram_algo_correctness_for_batches_match_all(): block_size, final_prompt_lens=final_prompt_lens) + # Normally drafter is run on decode requests only; here we check the output + # of the ngram worker as it is the sole proposer that has no forward. + for sg in seq_group_metadata_list: + sg.is_prompt = False proposals = proposer.get_spec_proposals( execute_model_req=ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index e579c8b38db91..0b1509d8b7785 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -46,12 +46,14 @@ def assert_score_equal(score1: SpeculativeScores, @pytest.mark.parametrize('max_propose_len', [1, 3, 5]) @pytest.mark.parametrize('mixed_propose_len', [True]) @pytest.mark.parametrize('device', ['cuda']) +@pytest.mark.parametrize('prefill_chunking', [False, True]) def test_scorer(model_name: str, batch_size: int, max_propose_len: int, - mixed_propose_len: bool, device: str) -> None: + mixed_propose_len: bool, device: str, + prefill_chunking: bool) -> None: """ Compare the batch expansion scorer and mqa scorer return the same score. We test for both queries with the same propose length and different - propose length. + propose length, as well as mixed prefill-decode batches. """ seed = 0 block_size = 32 @@ -67,16 +69,37 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, if not mixed_propose_len: propose_lens = [max_propose_len] * batch_size else: - non_zero_cnt = random.randint(0, batch_size) + # There must be at least 1 decode request, otherwise + # we have nothing to score (`_run_no_spec`). 
+ non_zero_cnt = random.randint(1, batch_size) propose_lens = [max_propose_len ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) random.shuffle(propose_lens) - proposals = create_proposal(propose_lens, vocab_size, device) seq_group_metadatalist, _, _ = create_batch(batch_size, max_propose_len, block_size=block_size, num_gpu_blocks=num_gpu_blocks) + + if mixed_propose_len and prefill_chunking and (n_prefills := + batch_size - non_zero_cnt): + prefill, _, _ = create_batch(n_prefills, + None, + prefill_chunk_size=4, + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + seq_ids=list( + range(batch_size, + batch_size + n_prefills))) + # re-order to guarantee prefill|decode order + target_group_metadatalist = [ + seq_group_metadatalist[i] for i, p in enumerate(propose_lens) + if p > 0 + ] + seq_group_metadatalist = prefill + target_group_metadatalist + propose_lens = [0] * n_prefills + [p for p in propose_lens if p > 0] + + proposals = create_proposal(propose_lens, vocab_size, device) requests = ExecuteModelRequest(seq_group_metadatalist, num_lookahead_slots=max_propose_len) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e0b7b7d47f1f1..8df143104c279 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from vllm.sequence import ExecuteModelRequest, SequenceOutput +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -819,3 +820,84 @@ def test_handle_finished_requests(): # and 'request-3' are removed from seq_with_bonus_token_in_last_step. assert worker._seq_with_bonus_token_in_last_step == \ {4,5,10} + + +@pytest.mark.parametrize('k', [3]) +@pytest.mark.parametrize('batch_size', [2, 32]) +@pytest.mark.parametrize("batch_composition", + ["prefill_only", "decode_only", "mixed"]) +@torch.inference_mode() +def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): + """ + Verify SpecDecodeWorker calls match the expected flow. + """ + vocab_size = 32_000 + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, + target_worker, + mock_spec_decode_sampler("rejection_sampler"), + disable_logprobs=False, + metrics_collector=metrics_collector) + exception_secret = 'artificial stop' + worker.scorer = mock_worker(BatchExpansionTop1Scorer) + worker.scorer.score_proposals.side_effect = ValueError(exception_secret) + + # Create batch with combination of terminal/non-terminal prefill chunks + # and decodes (different seq_ids). + decodes, _, _ = create_batch(batch_size, k) + # Pre-chunking here, get 'batch_size' chunks. 
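As an aside that is not part of the diff: "pre-chunking" a prompt simply means splitting it into fixed-size pieces up front, with only the terminal chunk allowed to sample; the create_chunked_seq_group_metadata_from_prompt helper added later in tests/spec_decode/utils.py builds real SequenceGroupMetadata this way. A toy sketch of the splitting step alone, using a hypothetical chunk_prompt helper, is:

from typing import List, Tuple

def chunk_prompt(prompt_token_ids: List[int],
                 chunk_size: int) -> List[Tuple[List[int], bool]]:
    """Hypothetical helper: split a prompt into fixed-size chunks and flag
    the terminal chunk, which is the only one that samples."""
    chunks = []
    for start in range(0, len(prompt_token_ids), chunk_size):
        chunk = prompt_token_ids[start:start + chunk_size]
        is_terminal = start + chunk_size >= len(prompt_token_ids)
        chunks.append((chunk, is_terminal))
    return chunks

# A 10-token prompt with chunk_size=4 yields chunks of 4, 4 and 2 tokens;
# only the last chunk is marked as the sampling (terminal) chunk.
assert chunk_prompt(list(range(10)), 4) == [
    ([0, 1, 2, 3], False),
    ([4, 5, 6, 7], False),
    ([8, 9], True),
]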
+ prefill, _, _ = create_batch(batch_size, + k, + prefill_chunk_size=4, + seq_ids=list(range(batch_size, + batch_size * 2))) + + if batch_composition == "prefill_only": + n_prefills = batch_size + elif batch_composition == "decode_only": + n_prefills = 0 + else: + n_prefills = random.randint(1, batch_size - 1) + n_decodes = batch_size - n_prefills + + prefill = random.sample(prefill, n_prefills) + decodes = random.sample(decodes, n_decodes) + target_group_metadata_list = prefill + decodes + execute_model_req = ExecuteModelRequest( + seq_group_metadata_list=target_group_metadata_list, + num_lookahead_slots=k) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_token_logprobs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs, + target_token_logprobs) + + target_worker.execute_model.return_value = [target_output[0]] + + if not len(decodes): + worker.execute_model(execute_model_req=execute_model_req) + # no spec run (prefill only) + draft_worker.execute_model.assert_called_once_with(execute_model_req) + target_worker.execute_model.assert_called_once_with(execute_model_req) + else: + # Decode-only run OR mixed batch, scorer call fails (it's mocked) + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(execute_model_req=execute_model_req) + # but first draft still counted + assert draft_worker.get_spec_proposals.call_count == 1 diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index f683942a5854b..a4bfa6b2f384b 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -68,12 +68,14 @@ def create_worker(cls: Callable[..., T], seed: int, is_driver_worker: bool = True, enforce_eager: bool = True, - model_runner_cls: Optional[ModelRunner] = None) -> T: + model_runner_cls: Optional[ModelRunner] = None, + dtype: Optional[str] = "auto") -> T: engine_args = EngineArgs( model=model_name, seed=seed, block_size=block_size, enforce_eager=enforce_eager, + dtype=dtype, ) engine_config = engine_args.create_engine_config() @@ -81,12 +83,7 @@ def create_worker(cls: Callable[..., T], get_ip(), get_open_port()) worker = cls( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, @@ -149,6 +146,41 @@ def create_seq_group_metadata_from_prompts( return seq_grou_metadata_list +def create_chunked_seq_group_metadata_from_prompt( + prompt: List[int], + num_gpu_blocks: int, + chunk_size: int, + block_size: int, + seq_id: Optional[int] = None) -> List[SequenceGroupMetadata]: + + if seq_id is None: + seq_id = 0 + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(len(prompt), block_size)) + ] + + seq_group_metadata_list = [] + for i, idx in enumerate(range(0, len(prompt), chunk_size)): + chunk_ids = prompt[idx:idx + chunk_size] + data = SequenceData.from_seqs(prompt) + data.update_num_computed_tokens(idx) + seq_data = {i: data} + 
seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + do_sample=idx + chunk_size >= len(prompt), # terminal chunk + seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations}, + token_chunk_size=len(chunk_ids))) + return seq_group_metadata_list + + def assert_logprobs_dict_allclose( actual_logprobs: List[Dict[int, Logprob]], expected_logprobs: List[Dict[int, Logprob]]) -> None: @@ -201,7 +233,8 @@ def create_batch(batch_size, prev_output_token_len: int = 10, seq_ids: Optional[List[int]] = None, num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None): + block_size: Optional[int] = None, + prefill_chunk_size: Optional[int] = None): if block_size is None: block_size = 8 @@ -216,15 +249,28 @@ def create_batch(batch_size, prompt_lens = prompt_len prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) + if prefill_chunk_size: + # Create a batch of chunked prompts. + if not seq_ids: + seq_ids = list(range(len(prompts))) + seq_group_metadata_list = [] + for p, sid in zip(prompts, seq_ids): + seq_group_metadata_list += \ + create_chunked_seq_group_metadata_from_prompt( + p, num_gpu_blocks, prefill_chunk_size, block_size, sid) + seq_group_metadata_list = seq_group_metadata_list[:batch_size] + prev_output_tokens = [] + else: + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_prompt_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, num_gpu_blocks, block_size, final_prompt_lens, + prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 3576a4834ebc3..e8f8499aa88ca 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -6,6 +6,7 @@ import pytest +from vllm.inputs import token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Sequence from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -70,10 +71,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, - inputs={ - "prompt": prompt, - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, + prompt=prompt), block_size=block_size, eos_token_id=tokenizer.tokenizer.eos_token_id, lora_request=lora_request) diff --git a/tests/test_config.py b/tests/test_config.py index 69918b67607d9..df382d22d83ec 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,8 @@ import pytest from vllm.config import ModelConfig +from vllm.model_executor.layers.pooler import PoolingType +from vllm.platforms import current_platform @pytest.mark.parametrize(("model_id", "expected_task"), [ @@ -102,6 +104,76 @@ def test_get_sliding_window(): assert 
mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_pooling_config(): + model_id = "sentence-transformers/all-MiniLM-L12-v2" + minilm_model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + minilm_pooling_config = minilm_model_config._init_pooler_config( + pooling_type=None, + pooling_norm=None, + pooling_returned_token_ids=None, + pooling_softmax=None, + pooling_step_tag_id=None) + + assert minilm_pooling_config.pooling_norm + assert minilm_pooling_config.pooling_type == PoolingType.MEAN.name + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_pooling_config_from_args(): + model_id = "sentence-transformers/all-MiniLM-L12-v2" + minilm_model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None) + + minilm_pooling_config = minilm_model_config._init_pooler_config( + pooling_type='CLS', + pooling_norm=True, + pooling_returned_token_ids=None, + pooling_softmax=None, + pooling_step_tag_id=None) + + assert minilm_pooling_config.pooling_norm + assert minilm_pooling_config.pooling_type == PoolingType.CLS.name + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_bert_tokenization_sentence_transformer_config(): + bge_model_config = ModelConfig( + model="BAAI/bge-base-en-v1.5", + task="auto", + tokenizer="BAAI/bge-base-en-v1.5", + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + bert_bge_model_config = bge_model_config._get_encoder_config() + + assert bert_bge_model_config["max_seq_length"] == 512 + assert bert_bge_model_config["do_lower_case"] + + def test_rope_customization(): TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 @@ -128,8 +200,10 @@ def test_rope_customization(): trust_remote_code=False, dtype="float16", seed=0, - rope_scaling=TEST_ROPE_SCALING, - rope_theta=TEST_ROPE_THETA, + hf_overrides={ + "rope_scaling": TEST_ROPE_SCALING, + "rope_theta": TEST_ROPE_THETA, + }, ) assert getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING @@ -160,8 +234,50 @@ def test_rope_customization(): trust_remote_code=False, dtype="float16", seed=0, - rope_scaling=TEST_ROPE_SCALING, + hf_overrides={ + "rope_scaling": TEST_ROPE_SCALING, + }, ) assert getattr(longchat_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING assert longchat_model_config.max_model_len == 4096 + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Encoder Decoder models not supported on ROCm.") +@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ + ("facebook/opt-125m", False), + ("facebook/bart-base", True), + ("meta-llama/Llama-3.2-1B", False), + ("meta-llama/Llama-3.2-11B-Vision", True), +]) +def test_is_encoder_decoder(model_id, is_encoder_decoder): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.is_encoder_decoder == is_encoder_decoder + + +@pytest.mark.parametrize(("model_id", "uses_mrope"), [ + ("facebook/opt-125m", False), + 
("Qwen/Qwen2-VL-2B-Instruct", True), +]) +def test_uses_mrope(model_id, uses_mrope): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.uses_mrope == uses_mrope diff --git a/tests/test_logger.py b/tests/test_logger.py index fadf66f2b61d4..e3749616d4203 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -13,7 +13,7 @@ from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) -from vllm.logging import NewLineFormatter +from vllm.logging_utils import NewLineFormatter def f1(x): @@ -29,7 +29,7 @@ def test_trace_function_call(): cur_dir = os.path.dirname(__file__) enable_trace_function_call(path, cur_dir) f1(1) - with open(path, 'r') as f: + with open(path) as f: content = f.read() assert "f1" in content diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f4551ed42efb8..84348cbc0bced 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -1,17 +1,24 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Generator, List, Optional import pytest from transformers import AutoTokenizer +from vllm.inputs import token_inputs from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) from vllm.transformers_utils.tokenizer_group import get_tokenizer_group +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer TRUTH = [ "Hello here, this is a simple test", "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving", # noqa - "我很感谢你的热情" + "我很感谢你的热情", + # Burmese text triggers an edge-case for Mistral's V3-Tekken tokenizer (eg. + # for mistralai/Pixtral-12B-2409) where tokens may map to bytes with + # incomplete UTF-8 characters + # see https://github.com/vllm-project/vllm/pull/9625 + "ပုံပြင်လေးပြောပြပါ်", ] TOKENIZERS = [ "facebook/opt-125m", @@ -24,6 +31,7 @@ "tiiuae/falcon-7b", "meta-llama/Llama-2-7b-hf", "codellama/CodeLlama-7b-hf", + "mistralai/Pixtral-12B-2409", ] @@ -49,15 +57,55 @@ def _run_incremental_decode(tokenizer, all_input_ids, return decoded_text +@pytest.fixture +def tokenizer(tokenizer_name): + return (MistralTokenizer.from_pretrained(tokenizer_name) + if "mistral" in tokenizer_name else + AutoTokenizer.from_pretrained(tokenizer_name)) + + +@pytest.mark.parametrize("tokenizer_name", ["mistralai/Pixtral-12B-2409"]) +@pytest.mark.parametrize( + "truth", + [ + # Burmese text triggers an edge-case where tokens may map to bytes with + # incomplete UTF-8 characters + "ပုံပြင်လေးပြောပြပါ", + # Using "URGENCY" since "CY" has token id 130282 + "URGENCY🌶️", + ]) +def test_mistral_edge_case(tokenizer, truth): + """Test for a specific edge cases with V3-Tekken MistralTokenizer. 
+ + See https://github.com/vllm-project/vllm/pull/9625 + """ + starting_index = 0 + all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids + + decoded_text = _run_incremental_decode(tokenizer, + all_input_ids, + skip_special_tokens=True, + starting_index=starting_index) + assert decoded_text == truth + + +@pytest.fixture +def skip_special_tokens(request, tokenizer_name) -> Generator[bool, Any, None]: + if "mistral" in tokenizer_name: + yield ( + True if request.param else + pytest.skip("mistral doesn't support skip_special_tokens=False")) + else: + yield bool(request.param) + + @pytest.mark.parametrize("truth", TRUTH) @pytest.mark.parametrize("with_prompt", [True, False]) -@pytest.mark.parametrize("tokenizer_id", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", (True, False)) -def test_decode_streaming(tokenizer_id, truth, with_prompt, - skip_special_tokens): - tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) +@pytest.mark.parametrize("tokenizer_name", TOKENIZERS) +@pytest.mark.parametrize("skip_special_tokens", (True, False), indirect=True) +def test_decode_streaming(tokenizer, truth, with_prompt, skip_special_tokens): if with_prompt: - truth_tokens = tokenizer(truth, add_special_tokens=False)["input_ids"] + truth_tokens = tokenizer(truth, add_special_tokens=False).input_ids prompt_input_ids = truth_tokens[:len(truth) // 2] generated_input_ids = truth_tokens[len(truth) // 2:] all_input_ids = prompt_input_ids + generated_input_ids @@ -68,7 +116,7 @@ def test_decode_streaming(tokenizer_id, truth, with_prompt, else: generated = truth starting_index = 0 - all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] + all_input_ids = tokenizer(truth, add_special_tokens=False).input_ids if skip_special_tokens: if tokenizer.bos_token_id is not None: all_input_ids = [tokenizer.bos_token_id] + all_input_ids @@ -98,7 +146,7 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: enable_lora=False, max_num_seqs=100, max_input_length=None, - tokenizer_mode="auto", + tokenizer_mode="mistral" if "mistral" in tokenizer_name else "auto", trust_remote_code=False, revision=None, ) @@ -113,9 +161,8 @@ def detokenizer(tokenizer_name: str) -> Detokenizer: @pytest.fixture(name="complete_sequence_token_ids") def create_complete_sequence_token_ids(complete_sequence: str, - tokenizer_name: str) -> List[int]: - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - complete_sequence_token_ids = tokenizer(complete_sequence)["input_ids"] + tokenizer) -> List[int]: + complete_sequence_token_ids = tokenizer(complete_sequence).input_ids return complete_sequence_token_ids @@ -123,10 +170,7 @@ def create_sequence(prompt_token_ids=None): prompt_token_ids = prompt_token_ids or [1] return Sequence( seq_id=0, - inputs={ - "prompt": "", - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, prompt=""), block_size=16, ) @@ -150,7 +194,7 @@ def create_dummy_prompt_logprobs( @pytest.mark.parametrize("complete_sequence", TRUTH) @pytest.mark.parametrize("tokenizer_name", TOKENIZERS) -@pytest.mark.parametrize("skip_special_tokens", [True, False]) +@pytest.mark.parametrize("skip_special_tokens", [True, False], indirect=True) def test_decode_sequence_logprobs(complete_sequence: str, complete_sequence_token_ids: List[int], detokenizer: Detokenizer, @@ -208,9 +252,9 @@ def test_decode_prompt_logprobs(complete_sequence_token_ids: List[int], # decoded_prompt_logprobs doesn't contain the first token. 
token_ids = complete_sequence_token_ids - tokenzier = detokenizer.get_tokenizer_for_seq(seq) - text_full = tokenzier.decode(token_ids, skip_special_tokens=True) - text_first = tokenzier.decode(token_ids[0], skip_special_tokens=True) + tokenizer = detokenizer.get_tokenizer_for_seq(seq) + text_full = tokenizer.decode(token_ids, skip_special_tokens=True) + text_first = tokenizer.decode(token_ids[0], skip_special_tokens=True) text = text_full[len(text_first):] # Text for logprobs for the chosen token should be the same as the diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index ab6a29eba1b3f..294acf202a232 100644 --- a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -3,6 +3,7 @@ from huggingface_hub import snapshot_download from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform from .utils import ARGS, CONFIGS, ServerConfig @@ -11,6 +12,11 @@ @pytest.fixture(scope="session", params=CONFIGS.keys()) def server_config(request): config = CONFIGS[request.param] + + if current_platform.is_rocm() and not config.get("supports_rocm", True): + pytest.skip("The {} model can't be tested on the ROCm platform".format( + config["model"])) + # download model and tokenizer using transformers snapshot_download(config["model"]) yield CONFIGS[request.param] diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d9ee0b1d54b0a..576555b368afe 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -13,6 +13,7 @@ class ServerConfig(TypedDict, total=False): arguments: List[str] system_prompt: Optional[str] supports_parallel: Optional[bool] + supports_rocm: Optional[bool] def patch_system_prompt(messages: List[Dict[str, Any]], @@ -36,7 +37,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], # universal args for all models go here. also good if you need to test locally # and change type or KV cache quantization or something. -ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"] +ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] CONFIGS: Dict[str, ServerConfig] = { "hermes": { @@ -88,18 +89,28 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " "to the user's question - just respond to it normally." 
}, - ## FIXME: temporary disabled due to lack of hardware specification - ## for individual runs - #"granite20b": { - # "model": - # "ibm-granite/granite-20b-functioncalling", - # "arguments": [ - # "--tool-call-parser", "granite-20b-fc", "--chat-template", - # str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja") - # ], - # "supports_parallel": - # False, - #}, + "granite20b": { + "model": + "mbayser/granite-20b-functioncalling-FP8-KV", + "arguments": [ + "--tool-call-parser", "granite-20b-fc", "--chat-template", + str(VLLM_PATH / + "examples/tool_chat_template_granite_20b_fc.jinja"), + "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20" + ], + "supports_parallel": + False, + "supports_rocm": + False, + }, + "granite8b": { + "model": + "ibm-granite/granite-3.0-8b-instruct", + "arguments": [ + "--tool-call-parser", "granite", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_granite.jinja") + ], + }, "internlm": { "model": "internlm/internlm2_5-7b-chat", diff --git a/tests/utils.py b/tests/utils.py index e8aad9cb3268f..a893667e144a6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,4 +1,5 @@ import asyncio +import copy import functools import os import signal @@ -8,13 +9,15 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Literal, Optional, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type, Union import openai import pytest import requests +import torch +import torch.nn.functional as F from openai.types.completion import Completion -from typing_extensions import ParamSpec, assert_never +from typing_extensions import ParamSpec import vllm.envs as envs from tests.models.utils import TextTextLogprobs @@ -272,6 +275,31 @@ def _test_completion( return results +def _test_completion_close( + client: openai.OpenAI, + model: str, + prompt: str, +): + results = [] + + # test with text prompt + completion = client.completions.create(model=model, + prompt=prompt, + max_tokens=1, + logprobs=5, + temperature=0.0) + + logporbs = completion.choices[0].logprobs.top_logprobs[0] + logporbs = {k: round(v, 2) for k, v in logporbs.items()} + + results.append({ + "test": "completion_close", + "logprobs": logporbs, + }) + + return results + + def _test_embeddings( client: openai.OpenAI, model: str, @@ -295,13 +323,81 @@ def _test_embeddings( return results +def _test_image_text( + client: openai.OpenAI, + model_name: str, + image_url: str, +): + results = [] + + # test pure text input + messages = [{ + "role": + "user", + "content": [ + { + "type": "text", + "text": "How do you feel today?" + }, + ], + }] + + chat_completion = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.0, + max_tokens=1, + logprobs=True, + top_logprobs=5) + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + + for x in top_logprobs: + x.logprob = round(x.logprob, 2) + + results.append({ + "test": "pure_text", + "logprobs": top_logprobs, + }) + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" 
+ }, + ], + }] + + chat_completion = client.chat.completions.create(model=model_name, + messages=messages, + temperature=0.0, + max_tokens=1, + logprobs=True, + top_logprobs=5) + top_logprobs = chat_completion.choices[0].logprobs.content[0].top_logprobs + + results.append({ + "test": "text_image", + "logprobs": top_logprobs, + }) + + return results + + def compare_two_settings(model: str, arg1: List[str], arg2: List[str], env1: Optional[Dict[str, str]] = None, env2: Optional[Dict[str, str]] = None, *, - method: Literal["generate", "encode"] = "generate", + method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: """ Launch API server with two different sets of arguments/environments @@ -328,7 +424,7 @@ def compare_all_settings(model: str, all_args: List[List[str]], all_envs: List[Optional[Dict[str, str]]], *, - method: Literal["generate", "encode"] = "generate", + method: str = "generate", max_wait_seconds: Optional[float] = None) -> None: """ Launch API server with several different sets of arguments/environments @@ -397,10 +493,17 @@ def compare_all_settings(model: str, if method == "generate": results += _test_completion(client, model, prompt, token_ids) + elif method == "generate_close": + results += _test_completion_close(client, model, prompt) + elif method == "generate_with_image": + results += _test_image_text( + client, model, + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png" + ) elif method == "encode": results += _test_embeddings(client, model, prompt) else: - assert_never(method) + raise ValueError(f"Unknown method: {method}") if i > 0: # if any setting fails, raise an error early @@ -410,6 +513,19 @@ def compare_all_settings(model: str, compare_envs = all_envs[i] for ref_result, compare_result in zip(ref_results, compare_results): + ref_result = copy.deepcopy(ref_result) + compare_result = copy.deepcopy(compare_result) + if "embedding" in ref_result and method == "encode": + sim = F.cosine_similarity( + torch.tensor(ref_result["embedding"]), + torch.tensor(compare_result["embedding"]), + dim=0, + ) + assert sim >= 0.999, ( + f"Embedding for {model=} are not the same.\n" + f"cosine_similarity={sim}\n") + del ref_result["embedding"] + del compare_result["embedding"] assert ref_result == compare_result, ( f"Results for {model=} are not the same.\n" f"{ref_args=} {ref_envs=}\n" @@ -585,7 +701,7 @@ def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator: return pytest.mark.skipif( memory_gb < min_gb, - reason=f"Need at least {memory_gb}GB GPU memory to run the test.", + reason=f"Need at least {min_gb}GB GPU memory to run the test.", ) diff --git a/vllm/v1/tokenizer/__init__.py b/tests/v1/__init__.py similarity index 100% rename from vllm/v1/tokenizer/__init__.py rename to tests/v1/__init__.py diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py new file mode 100644 index 0000000000000..d614d3e67460f --- /dev/null +++ b/tests/v1/core/test_prefix_caching.py @@ -0,0 +1,219 @@ +"""Compare the with and without prefix caching.""" +from vllm.inputs import token_inputs +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_manager import KVCacheManager, Request +from vllm.v1.core.kv_cache_utils import hash_block_tokens + + +def make_request(request_id, prompt_token_ids): + return Request( + request_id=request_id, + inputs=token_inputs(prompt_token_ids=prompt_token_ids), + sampling_params=SamplingParams(max_tokens=17), + eos_token_id=100, + arrival_time=0, + lora_request=None, + ) + + +def 
test_prefill(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=16, + ) + + # Complete 3 blocks (48 tokens) + common_token_ids = [i for i in range(3) for _ in range(16)] + + # Fully cache miss + # Incomplete 1 block (7 tokens) + unique_token_ids = [3] * 7 + req0 = make_request("0", common_token_ids + unique_token_ids) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + blocks = manager.allocate_slots(req0, 55, computed_blocks) + assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] + + # Check full block metadata + parent_block_hash = None + for block_id in (0, 1, 2): + block_hash = hash_block_tokens(parent_block_hash, + manager.block_pool[block_id].token_ids) + assert manager.block_pool[block_id].block_hash == block_hash + assert manager.block_pool[block_id].ref_cnt == 1 + assert manager.block_pool[block_id].num_hashed_tokens == 16 * ( + block_id + 1) + assert manager.block_pool[block_id].token_ids == tuple([block_id] * 16) + parent_block_hash = block_hash + + # Check partial/preallocated block metadata + for block_id in (3, 4): + assert manager.block_pool[block_id].block_hash is None + assert manager.block_pool[block_id].ref_cnt == 1 + assert manager.block_pool[block_id].num_hashed_tokens == 0 + if block_id == 3: + assert manager.block_pool[block_id].token_ids == [3] * 7 + else: + assert not manager.block_pool[block_id].token_ids + + # Cache hit in the common prefix when the original block is still in use. + # Incomplete 1 block (5 tokens) + unique_token_ids = [3] * 5 + req1 = make_request("1", common_token_ids + unique_token_ids) + computed_blocks = manager.get_computed_blocks(req1) + assert [b.block_id for b in computed_blocks] == [0, 1, 2] + num_new_tokens = 53 - 3 * 16 + blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks) + assert [b.block_id for b in blocks] == [5, 6] + for block in computed_blocks: + assert block.ref_cnt == 2 + + # At this point, we should have 3 free blocks left. + assert manager.free_block_queue.num_free_blocks == 3 + + manager.free(req0) + manager.free(req1) + + # All blocks should be available. + assert manager.free_block_queue.num_free_blocks == 10 + # The order should be + # [unallocated (7, 8)] + # [unique_req0 (4, 3)] + # [unique_req1 (6, 5)] + # [common (2, 1, 0)] + assert [ + b.block_id for b in manager.free_block_queue.get_all_free_blocks() + ] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0] + + # Cache hit in the common prefix when the original block is already free. + # Incomplete 1 block (6 tokens) + unique_token_ids = [3] * 6 + req2 = make_request("2", common_token_ids + unique_token_ids) + computed_block = manager.get_computed_blocks(req2) + assert [b.block_id for b in computed_block] == [0, 1, 2] + num_new_tokens = 53 - 3 * 16 + blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks) + assert [b.block_id for b in blocks] == [7, 8] + + # Although we only have 5 free blocks, we have 8 blocks in + # the free block queue due to lazy removal. + assert manager.free_block_queue.num_free_blocks == 5 + assert all([ + b.ref_cnt == 0 for b in manager.free_block_queue.get_all_free_blocks() + ]) + assert len([b + for b in manager.free_block_queue.get_all_free_blocks()]) == 5 + + manager.free(req2) + + # Cache miss and eviction. 
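A short aside, outside the diff: the eviction-order assertion below relies on a free-block queue in which freed blocks are appended at the tail and new allocations evict from the head, so blocks are reused in the order they were freed. A toy model of that queue discipline, independent of vLLM's actual KVCacheManager, is:

from collections import deque

class ToyFreeBlockQueue:
    """Toy free list: freed blocks join the tail, allocation evicts the head."""

    def __init__(self, num_blocks: int):
        self.free = deque(range(num_blocks))

    def allocate(self, n: int):
        return [self.free.popleft() for _ in range(n)]

    def free_blocks(self, block_ids):
        self.free.extend(block_ids)

queue = ToyFreeBlockQueue(4)
req_a = queue.allocate(2)      # blocks 0, 1
req_b = queue.allocate(2)      # blocks 2, 3
queue.free_blocks(req_b)       # 2 and 3 become reusable first...
queue.free_blocks(req_a)       # ...then 0 and 1
assert queue.allocate(4) == [2, 3, 0, 1]   # eviction follows the free order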
+    req3 = make_request("3", [99] * (16 * 9))
+    computed_blocks = manager.get_computed_blocks(req3)
+    assert not computed_blocks
+    blocks = manager.allocate_slots(req3, 16 * 9, computed_blocks)
+    # This block ID order also checks the eviction order.
+    assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0]
+    assert manager.free_block_queue.num_free_blocks == 0
+    assert manager.free_block_queue.free_list_head is None
+    assert manager.free_block_queue.free_list_tail is None
+
+
+def test_decode():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        sliding_window=False,
+        enable_caching=True,
+        num_preallocate_tokens=16,
+    )
+
+    # Complete 3 blocks (48 tokens)
+    common_token_ids = [i for i in range(3) for _ in range(16)]
+
+    # Fully cache miss
+    # Incomplete 1 block (7 tokens)
+    unique_token_ids = [3] * 7
+    req0 = make_request("0", common_token_ids + unique_token_ids)
+    computed_blocks = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    blocks = manager.allocate_slots(req0, 55, computed_blocks)
+    assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4]
+
+    # Append slots without allocating a new block.
+    req0.num_computed_tokens = 55
+    for _ in range(4):
+        req0.append_output_token_ids(8)
+    new_blocks = manager.append_slots(req0, 4)
+    assert new_blocks is not None and len(new_blocks) == 0
+    assert len(manager.block_pool[3].token_ids) == 11
+
+    # Append slots without allocating a new block, but start using the
+    # preallocated block.
+    req0.num_computed_tokens = 59
+    # 5 tokens to fill the previous block, and 10 tokens to fill
+    # the preallocated block.
+    for _ in range(5 + 10):
+        req0.append_output_token_ids(7)
+    new_blocks = manager.append_slots(req0, 15)
+    assert new_blocks is not None and len(new_blocks) == 0
+    assert len(manager.block_pool[3].token_ids) == 16
+    assert len(manager.block_pool[4].token_ids) == 10
+
+    # Append slots and allocate a new block.
+    req0.num_computed_tokens = 74
+    # 6 tokens to fill the previous block, and 11 tokens to fill
+    # the preallocated block.
+    for _ in range(6 + 11):
+        req0.append_output_token_ids(12)
+    new_blocks = manager.append_slots(req0, 17)
+    # Plus one preallocated block.
+    assert new_blocks is not None and len(new_blocks) == 2
+    assert len(manager.block_pool[4].token_ids) == 16
+    assert len(manager.block_pool[5].token_ids) == 11
+    assert len(manager.block_pool[6].token_ids) == 0
+
+
+def test_evict():
+    manager = KVCacheManager(
+        block_size=16,
+        num_gpu_blocks=10,
+        sliding_window=False,
+        enable_caching=True,
+        num_preallocate_tokens=16,
+    )
+
+    last_token_id = 5 * 16 + 7
+    req0 = make_request("0", list(range(last_token_id)))
+    computed_blocks = manager.get_computed_blocks(req0)
+    assert not computed_blocks
+    blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks)
+    assert len(blocks) == 7  # 5 full + 1 partial + 1 preallocated
+
+    # 3 blocks.
+    req1 = make_request("1", list(range(last_token_id,
+                                        last_token_id + 3 * 16)))
+    computed_blocks = manager.get_computed_blocks(req1)
+    assert not computed_blocks
+    blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks)
+    assert len(blocks) == 3  # 3 full blocks
+    last_token_id += 3 * 16
+
+    assert manager.free_block_queue.num_free_blocks == 0
+
+    manager.free(req0)
+    manager.free(req1)
+    assert manager.free_block_queue.num_free_blocks == 10
+    assert [
+        b.block_id for b in manager.free_block_queue.get_all_free_blocks()
+    ] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7]
+
+    # Touch the first 2 blocks.
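+    # req2 shares its first two full blocks (0 and 1) with req0's prompt, so
+    # only the 3-token tail needs fresh slots taken from the evicted blocks.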
+ req2 = make_request("2", list(range(2 * 16 + 3))) + computed_blocks = manager.get_computed_blocks(req2) + assert [b.block_id for b in computed_blocks] == [0, 1] + blocks = manager.allocate_slots(req2, 3, computed_blocks) + assert [b.block_id for b in blocks] == [6, 5] + assert manager.free_block_queue.num_free_blocks == 6 diff --git a/tests/v1/engine/__init__.py b/tests/v1/engine/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py new file mode 100644 index 0000000000000..1f26fe0fc892f --- /dev/null +++ b/tests/v1/engine/test_async_llm.py @@ -0,0 +1,66 @@ +import asyncio +from typing import Tuple + +import pytest + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.platforms import current_platform +from vllm.v1.engine.async_llm import AsyncLLM + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + disable_log_requests=True) + + +async def generate(engine: AsyncLLM, request_id: str, + max_tokens: int) -> Tuple[int, str]: + count = 0 + async for _ in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=max_tokens, temperature=0)): + + count += 1 + await asyncio.sleep(0.) + + return count, request_id + + +@pytest.mark.asyncio +async def test_load(monkeypatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 10000 + NUM_EXPECTED_TOKENS = 10 + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # Confirm that we got all the EXPECTED tokens from the requests. 
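+        # Each request should stream exactly NUM_EXPECTED_TOKENS outputs;
+        # remember the first request that deviates so the assert below can
+        # report it.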
+ failed_request_id = None + tokens = None + for task in tasks: + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS + and failed_request_id is None): + failed_request_id = request_id + tokens = num_generated_tokens + + assert failed_request_id is None, ( + f"{failed_request_id} generated {tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + engine.shutdown() diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py new file mode 100644 index 0000000000000..07f343666cb5e --- /dev/null +++ b/tests/v1/engine/test_detokenizer.py @@ -0,0 +1,205 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] + +STOP_STRINGS = ["I love working on", "company by far", "brother in"] + +FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] +PROMPT_LEN = 5 +PROMPT_TOKENS = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS +] +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. 
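+        # step() turns the mocked EngineCoreOutputs into RequestOutputs; no
+        # stop strings are configured here, so nothing should be marked for
+        # abort.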
+ request_outputs, requests_to_abort = detokenizer.step(outputs) + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(GENERATION_STRINGS, GENERATION_TOKENS)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +def test_stop_string(include_stop_str_in_output: bool): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. + for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, + stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. 
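+        # Split the reference generation at the first stop-string match to
+        # get the expected text with and without the stop string included.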
+ stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py new file mode 100644 index 0000000000000..b3692b594326a --- /dev/null +++ b/tests/v1/engine/test_engine_core.py @@ -0,0 +1,140 @@ +import time +import uuid + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core import EngineCore + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request() -> EngineCoreRequest: + return EngineCoreRequest( + request_id=uuid.uuid4(), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, + sampling_params=SamplingParams(), + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def test_engine_core(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + usage_context=UsageContext.UNKNOWN_CONTEXT) + """Test basic request lifecycle.""" + + # First request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + # Second request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 1 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Add two requests in a row. + engine_core.add_request(make_request()) + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 4 + + # Loop through until they are all done. + while len(engine_core.step()) > 0: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + """Test abort cycle.""" + + # Basic abort. 
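+        # Add one request, let the scheduler pick it up, then abort it by ID;
+        # it should disappear from both the waiting and running queues.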
+ req = make_request() + request_id = req.request_id + + engine_core.add_request(req) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + engine_core.abort_requests([request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + # Add, step, abort 1 of the 3. + req0 = make_request() + req1 = make_request() + req2 = make_request() + + engine_core.add_request(req0) + engine_core.add_request(req1) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + engine_core.add_request(req2) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 3 + + # Abort just one. + engine_core.abort_requests([req1.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Abort the other requests at the same time. + engine_core.abort_requests([req2.request_id, req0.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py new file mode 100644 index 0000000000000..7b241bf836a0e --- /dev/null +++ b/tests/v1/engine/test_engine_core_client.py @@ -0,0 +1,205 @@ +import asyncio +import time +import uuid +from typing import Dict, List + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core_client import EngineCoreClient + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request(params: SamplingParams) -> EngineCoreRequest: + return EngineCoreRequest( + request_id=str(uuid.uuid4()), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, + sampling_params=params, + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def loop_until_done(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = client.get_output() + + if len(engine_core_outputs) == 0: + break + + all_finished = True + for out in engine_core_outputs: + outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = await client.get_output_async() + + if len(engine_core_outputs) == 0: + break + + all_finished = 
True + for out in engine_core_outputs: + outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +@pytest.mark.parametrize("multiprocessing_mode", [True, False]) +def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + client.add_request(request) + time.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + # Add requests to the engine. + for idx, request in enumerate(requests): + client.add_request(request) + time.sleep(0.01) + if idx % 2 == 0: + client.abort_requests([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + """Abort after request is finished.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + request = requests[0] + client.add_request(request) + time.sleep(10.) + + client.abort_requests([request.request_id]) + + # Shutdown the client. + client.shutdown() + + +@pytest.mark.asyncio +async def test_engine_core_client_asyncio(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=True, + asyncio_mode=True, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + await client.add_request_async(request) + await asyncio.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Add requests to the engine. 
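+        # Mirror the sync abort test: stagger submissions and abort every
+        # other request right away, so those requests finish with fewer than
+        # MAX_TOKENS outputs.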
+ for idx, request in enumerate(requests): + await client.add_request_async(request) + await asyncio.sleep(0.01) + if idx % 2 == 0: + await client.abort_requests_async([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + + # Shutdown the client. + client.shutdown() diff --git a/tests/weight_loading/run_model_weight_loading_test.sh b/tests/weight_loading/run_model_weight_loading_test.sh index e80c1d6c5849c..a4d0c44c22b51 100755 --- a/tests/weight_loading/run_model_weight_loading_test.sh +++ b/tests/weight_loading/run_model_weight_loading_test.sh @@ -14,7 +14,7 @@ while getopts "c:" OPT; do done -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/tests/worker/test_encoder_decoder_model_runner.py b/tests/worker/test_encoder_decoder_model_runner.py index e75884a7395e2..9e166ae64dbfb 100644 --- a/tests/worker/test_encoder_decoder_model_runner.py +++ b/tests/worker/test_encoder_decoder_model_runner.py @@ -19,14 +19,7 @@ def _create_model_runner(model: str, *args, engine_args = EngineArgs(model, *args, **kwargs) engine_config = engine_args.create_engine_config() model_runner = EncoderDecoderModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, - prompt_adapter_config=engine_config.prompt_adapter_config, + vllm_config=engine_config, is_driver_worker=True, ) return model_runner diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 1e7f560fc68cc..b36e8bfe73ff3 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -73,6 +73,7 @@ def test_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), @@ -124,6 +125,7 @@ def test_embedding_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) model_input = ModelInputForGPUWithPoolingMetadata( input_tokens=torch.ones(10), @@ -174,6 +176,7 @@ def test_multi_step_model_runner_input(): num_prefill_tokens=2, num_decode_tokens=3, slot_mapping=torch.zeros(1), + multi_modal_placeholder_index_maps=None, ) frozen_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py index fe97199bac62d..433a9b30ba57a 100644 --- a/tests/worker/test_model_runner.py +++ b/tests/worker/test_model_runner.py @@ -16,15 +16,7 @@ def _create_model_runner(model: str, *args, **kwargs) -> ModelRunner: engine_args = EngineArgs(model, *args, **kwargs) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - 
device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, - prompt_adapter_config=engine_config.prompt_adapter_config, - observability_config=engine_config.observability_config, + vllm_config=engine_config, is_driver_worker=True, ) return model_runner diff --git a/tests/worker/test_profile.py b/tests/worker/test_profile.py index acd2ed6836365..194ea2aa506f4 100644 --- a/tests/worker/test_profile.py +++ b/tests/worker/test_profile.py @@ -24,12 +24,7 @@ def test_gpu_memory_profiling(): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) worker = Worker( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py index 7aa439ba0a154..acede959f59f8 100644 --- a/tests/worker/test_swap.py +++ b/tests/worker/test_swap.py @@ -19,12 +19,7 @@ def test_swap() -> None: distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) worker = Worker( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, + vllm_config=engine_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, diff --git a/tools/mypy.sh b/tools/mypy.sh index 14b0976a27da5..e984e739d70cf 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -1,18 +1,19 @@ #!/bin/bash CI=${1:-0} +PYTHON_VERSION=${2:-3.9} -if [ $CI -eq 1 ]; then +if [ "$CI" -eq 1 ]; then set -e fi run_mypy() { echo "Running mypy on $1" - if [ $CI -eq 1 ] && [ -z "$1" ]; then - mypy "$@" + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then + mypy --python-version "${PYTHON_VERSION}" "$@" return fi - mypy --follow-imports skip "$@" + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" } run_mypy # Note that this is less strict than CI diff --git a/tools/profiler/print_layerwise_table.py b/tools/profiler/print_layerwise_table.py index bbd24b085e3a7..081076ad7dbdc 100644 --- a/tools/profiler/print_layerwise_table.py +++ b/tools/profiler/print_layerwise_table.py @@ -46,7 +46,7 @@ def get_entries(node, curr_depth=0): args = parser.parse_args() - with open(args.json_trace, "r") as f: + with open(args.json_trace) as f: profile_data = json.load(f) if args.table == "summary": diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index 65ee3ae108ae1..adc44474aa4c1 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -196,8 +196,8 @@ def is_cross_device_reduce_1stage(op_name: str): def is_cross_device_reduce_2stage(op_name: str): return "cross_device_reduce_2stage" in op_name - def is_custom_ar_all_reduce_unreg(op_name: str): - return "_C_custom_ar::all_reduce_unreg" in op_name + def is_custom_ar_all_reduce(op_name: str): + return "_C_custom_ar::all_reduce" in op_name def is_reduce_kernel(op_name: str): return "reduce_kernel" in op_name @@ -246,9 +246,9 @@ def is_reduce_kernel(op_name: str): filter(lambda 
x: is_cross_device_reduce_2stage(x), ops)) ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops)) - custom_ar_all_reduce_unreg_ops = list( - filter(lambda x: is_custom_ar_all_reduce_unreg(x), ops)) - ops = list(filter(lambda x: x not in custom_ar_all_reduce_unreg_ops, ops)) + custom_ar_all_reduce_ops = list( + filter(lambda x: is_custom_ar_all_reduce(x), ops)) + ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops)) reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops)) ops = list(filter(lambda x: x not in reduce_kernel_ops, ops)) @@ -289,21 +289,21 @@ def is_reduce_kernel(op_name: str): if len(cross_device_reduce_2stage_ops): trace_df['cross_device_reduce_2stage_ops'] = trace_df[ cross_device_reduce_2stage_ops].agg("sum", axis=1) - if len(custom_ar_all_reduce_unreg_ops): - trace_df['custom_ar_all_reduce_unreg_ops'] = trace_df[ - custom_ar_all_reduce_unreg_ops].agg("sum", axis=1) + if len(custom_ar_all_reduce_ops): + trace_df['custom_ar_all_reduce_ops'] = trace_df[ + custom_ar_all_reduce_ops].agg("sum", axis=1) if len(reduce_kernel_ops): trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum", axis=1) - trace_df.drop( - attention_ops + quant_ops + gemm_ops + rms_norm_ops + vocab_embed_ops + - mem_ops + elementwise_ops + nccl_all_reduce_ops + nccl_gather_ops + - nccl_broadcast_ops + nccl_other_ops + cross_device_reduce_1stage_ops + - cross_device_reduce_2stage_ops + custom_ar_all_reduce_unreg_ops + - reduce_kernel_ops, - axis=1, - inplace=True) + trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops + + vocab_embed_ops + mem_ops + elementwise_ops + + nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops + + nccl_other_ops + cross_device_reduce_1stage_ops + + cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops + + reduce_kernel_ops, + axis=1, + inplace=True) return trace_df @@ -434,7 +434,7 @@ def make_plot_title_suffix(profile_json: dict) -> str: f"{', Sparsity ' + sparsity if sparsity else ''}") profile_json = None - with open(json_trace, "r") as f: + with open(json_trace) as f: profile_json = json.load(f) assert profile_json is not None diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 33431a33ac837..51ad2adc74fe1 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -81,7 +81,7 @@ def WeightedDuration(self): # Allow for modest floating-point errors epsilon = 0.000002 if (self.weighted_duration > self.Duration() + epsilon): - print('%s > %s?' % (self.weighted_duration, self.Duration())) + print('{} > {}?'.format(self.weighted_duration, self.Duration())) assert (self.weighted_duration <= self.Duration() + epsilon) return self.weighted_duration @@ -104,7 +104,7 @@ def ReadTargets(log, show_all): The result is a list of Target objects.""" header = log.readline() assert header == '# ninja log v5\n', \ - 'unrecognized ninja log version %r' % header + 'unrecognized ninja log version {!r}'.format(header) targets_dict = {} last_end_seen = 0.0 for line in log: @@ -254,8 +254,8 @@ def SummarizeEntries(entries, extra_step_types): # Warn if the sum of weighted times is off by more than half a second. if abs(length - weighted_total) > 500: print('Warning: Possible corrupt ninja log, results may be ' - 'untrustworthy. Length = %.3f, weighted total = %.3f' % - (length, weighted_total)) + 'untrustworthy. 
Length = {:.3f}, weighted total = {:.3f}'.format( + length, weighted_total)) entries_by_ext = defaultdict(list) for target in entries: @@ -263,16 +263,17 @@ def SummarizeEntries(entries, extra_step_types): entries_by_ext[extension].append(target) for key, values in entries_by_ext.items(): - print(' Longest build steps for %s:' % key) + print(' Longest build steps for {}:'.format(key)) values.sort(key=lambda x: x.WeightedDuration()) for target in values[-long_count:]: - print(' %8.1f weighted s to build %s (%.1f s elapsed time)' % - (target.WeightedDuration(), target.DescribeTargets(), - target.Duration())) - - print(' %.1f s weighted time (%.1f s elapsed time sum, %1.1fx ' - 'parallelism)' % - (length, total_cpu_time, total_cpu_time * 1.0 / length)) + print( + ' {:8.1f} weighted s to build {} ({:.1f} s elapsed time)'. + format(target.WeightedDuration(), target.DescribeTargets(), + target.Duration())) + + print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' + 'parallelism)'.format(length, total_cpu_time, + total_cpu_time * 1.0 / length)) print(' %d build steps completed, average of %1.2f/s' % (len(entries), len(entries) / (length))) @@ -298,11 +299,12 @@ def main(): long_ext_count += len(args.step_types.split(';')) try: - with open(log_file, 'r') as log: + with open(log_file) as log: entries = ReadTargets(log, False) SummarizeEntries(entries, args.step_types) - except IOError: - print('Log file %r not found, no build summary created.' % log_file) + except OSError: + print('Log file {!r} not found, no build summary created.'.format( + log_file)) return errno.ENOENT diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 0000000000000..0bb6fd2eafa14 --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -exec sh -c 'git check-ignore -q $1 || shellcheck $1' _ {} \; diff --git a/use_existing_torch.py b/use_existing_torch.py index e11746459908b..319d262898fe3 100644 --- a/use_existing_torch.py +++ b/use_existing_torch.py @@ -4,7 +4,7 @@ requires_files += ["pyproject.toml"] for file in requires_files: print(f">>> cleaning {file}") - with open(file, 'r') as f: + with open(file) as f: lines = f.readlines() if "torch" in "".join(lines).lower(): print("removed:") diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 46a2fb8bc80a2..8f331a27a20de 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,5 +1,6 @@ import contextlib import functools +import importlib from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch @@ -12,7 +13,7 @@ logger = init_logger(__name__) -if not current_platform.is_tpu(): +if not current_platform.is_tpu() and not current_platform.is_hpu(): try: import vllm._C except ImportError as e: @@ -486,6 +487,14 @@ def cutlass_scaled_mm(a: torch.Tensor, m = a.shape[0] n = b.shape[1] + + if current_platform.is_rocm(): + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + out = torch.empty((m, n), dtype=out_dtype, device=a.device) torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) @@ -912,20 +921,16 @@ def get_max_shared_memory_per_block_device_attribute(device: int) -> int: # custom ar -def init_custom_ar(meta: torch.Tensor, rank_data: torch.Tensor, - handles: List[str], offsets: List[int], rank: int, - full_nvlink: bool) -> int: - return torch.ops._C_custom_ar.init_custom_ar(meta, rank_data, handles, - offsets, rank, full_nvlink) - - -def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None: - torch.ops._C_custom_ar.all_reduce_reg(fa, inp, out) +def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor, + rank: int, full_nvlink: bool) -> int: + return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, + full_nvlink) -def all_reduce_unreg(fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, - out: torch.Tensor) -> None: - torch.ops._C_custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out) +def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int, + reg_buffer_sz_bytes: int) -> None: + torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, + reg_buffer_sz_bytes) def dispose(fa: int) -> None: @@ -936,16 +941,15 @@ def meta_size() -> int: return torch.ops._C_custom_ar.meta_size() -def register_buffer(fa: int, t: torch.Tensor, handles: List[str], - offsets: List[int]) -> None: - return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets) +def register_buffer(fa: int, ipc_tensors: List[int]) -> None: + return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors) -def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]: +def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) -def register_graph_buffers(fa: int, handles: List[str], +def register_graph_buffers(fa: int, handles: List[List[int]], offsets: List[List[int]]) -> None: torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 31fcc4c3256a8..28b804f765a3a 100644 --- 
a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -74,20 +74,12 @@ def paged_attention_v1( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - device=query.device, - dtype=torch.int32, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v1( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, scale, block_tables, context_lens, @@ -124,26 +116,15 @@ def paged_attention_v2( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - dtype=torch.int32, - device=query.device, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v2( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, - exp_sum, - max_logits, - tmp_out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, + scale, block_tables, context_lens, - scale, block_size, max_context_len, alibi_slopes, @@ -202,6 +183,7 @@ def varlen_attention( is_causal: bool, return_softmax: bool, gen_: torch.Generator, + logits_soft_cap: float, ) -> None: ipex.llm.functional.varlen_attention(query.contiguous(), key.contiguous(), @@ -210,7 +192,8 @@ def varlen_attention( max_seqlen_q, max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal, - return_softmax, gen_) + return_softmax, gen_, + logits_soft_cap) @staticmethod def reshape_and_cache( diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 5eec78c328903..389ecd5c869bc 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path) + return torch.load(image_path, map_location="cpu") diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 05e031affabae..e4dcab10466db 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -35,7 +35,7 @@ def download_video_asset(filename: str) -> str: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: - cv2 = try_import_video_packages() + cv2, _ = try_import_video_packages() cap = cv2.VideoCapture(path) if not cap.isOpened(): @@ -59,7 +59,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: str, num_frames: int = -1) -> List[Image.Image]: - cv2 = try_import_video_packages() + cv2, _ = try_import_video_packages() frames = video_to_ndarrays(path, num_frames) return [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 9ea89eca01f5b..a504cb1f7e318 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -7,6 +7,8 @@ import torch +from vllm.multimodal import MultiModalPlaceholderMap + if TYPE_CHECKING: from vllm.worker.model_runner_base import (ModelRunnerBase, ModelRunnerInputBase, @@ -108,6 +110,15 @@ class AttentionMetadata: # in block 0, and 1st slot in block 1, respectively. 
slot_mapping: torch.Tensor + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. + multi_modal_placeholder_index_maps: Optional[Dict[ + str, MultiModalPlaceholderMap.IndexMap]] + @property @abstractmethod def prefill_metadata(self) -> Optional["AttentionMetadata"]: diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index c216d195c9e7e..409a42187f46c 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -215,6 +215,8 @@ def prefill_metadata( num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -243,6 +245,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ffa05e80623ac..314822b695722 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -1,5 +1,7 @@ """Attention layer with FlashAttention.""" +from collections import defaultdict from dataclasses import dataclass +from itertools import accumulate from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type import torch @@ -9,12 +11,15 @@ AttentionMetadata, AttentionMetadataBuilder, AttentionType) -from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, - compute_slot_mapping, - compute_slot_mapping_start_idx, - is_block_tables_empty) +from vllm.attention.backends.utils import ( + PAD_SLOT_ID, CommonAttentionState, compute_slot_mapping, + compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, + get_seq_len_block_table_args, is_all_cross_attn_metadata_set, + is_all_encoder_attn_metadata_set, is_block_tables_empty) from vllm.forward_context import get_forward_context -from vllm.utils import async_tensor_h2d, make_tensor_with_pad +from vllm.multimodal import MultiModalPlaceholderMap +from vllm.utils import (async_tensor_h2d, direct_register_custom_op, + make_tensor_with_pad) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -70,7 +75,6 @@ def swap_blocks( src_key_cache = src_kv_cache[0] dst_key_cache = dst_kv_cache[0] ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) - src_value_cache = src_kv_cache[1] dst_value_cache = dst_kv_cache[1] ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) @@ -82,6 +86,7 @@ def copy_blocks( ) -> None: key_caches = [kv_cache[0] for kv_cache in kv_caches] value_caches = [kv_cache[1] for kv_cache in kv_caches] + ops.copy_blocks(key_caches, value_caches, src_to_dists) @@ -108,26 +113,12 @@ class FlashAttentionMetadata(AttentionMetadata): # |-------------------- seq_len ---------------------| # |-- query_len ---| - # Maximum query length in the batch. 
- max_query_len: Optional[int] - - # Max number of query tokens among request in the batch. - max_decode_query_len: Optional[int] - # Maximum sequence length among prefill batch. 0 if there are decoding # requests only. max_prefill_seq_len: int # Maximum sequence length among decode batch. 0 if there are prefill # requests only. max_decode_seq_len: int - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] # (batch_size,) A tensor of context lengths (tokens that are computed # so far). context_lens_tensor: Optional[torch.Tensor] @@ -143,11 +134,62 @@ class FlashAttentionMetadata(AttentionMetadata): # Whether or not if cuda graph is enabled. # Cuda-graph is currently enabled for decoding only. # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + # Maximum query length in the batch. + max_query_len: Optional[int] = None + + # Max number of query tokens among request in the batch. + max_decode_query_len: Optional[int] = None + + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] = None + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] = None + _cached_prefill_metadata: Optional["FlashAttentionMetadata"] = None _cached_decode_metadata: Optional["FlashAttentionMetadata"] = None + # Begin encoder attn & enc/dec cross-attn fields... + + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + encoder_seq_start_loc: Optional[torch.Tensor] = None + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + @property + def is_all_encoder_attn_metadata_set(self): + ''' + All attention metadata required for encoder attention is set. + ''' + return is_all_encoder_attn_metadata_set(self) + + @property + def is_all_cross_attn_metadata_set(self): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. 
+ ''' + return is_all_cross_attn_metadata_set(self) + @property def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: if self.num_prefills == 0: @@ -156,30 +198,52 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: if self._cached_prefill_metadata is not None: return self._cached_prefill_metadata - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.query_start_loc is not None - assert self.context_lens_tensor is not None - assert self.block_tables is not None - assert self.seq_start_loc is not None + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) self._cached_prefill_metadata = FlashAttentionMetadata( num_prefills=self.num_prefills, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, - slot_mapping=self.slot_mapping[:self.num_prefill_tokens], - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, max_decode_query_len=0, max_decode_seq_len=0, - query_start_loc=self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=self.context_lens_tensor[:self.num_prefills], - block_tables=self.block_tables[:self.num_prefills], + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, use_cuda_graph=False, - ) + # Begin encoder & cross attn fields below... 
+ encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) return self._cached_prefill_metadata @property @@ -189,28 +253,47 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: if self._cached_decode_metadata is not None: return self._cached_decode_metadata - assert self.block_tables is not None - assert self.seq_lens_tensor is not None + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[self.num_prefill_tokens:]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) + block_tables = (None if self.block_tables is None else + self.block_tables[self.num_prefills:]) self._cached_decode_metadata = FlashAttentionMetadata( num_prefills=0, num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, - slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], + seq_lens_tensor=seq_lens_tensor, max_decode_query_len=self.max_decode_query_len, max_query_len=self.max_query_len, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=self.query_start_loc[self.num_prefills:] + # Batch may be composed of prefill|decodes, adjust query start + # indices to refer to the start of decodes. E.g. + # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + query_start_loc=(self.query_start_loc[self.num_prefills:] - + self.query_start_loc[self.num_prefills]) if self.query_start_loc is not None else None, seq_start_loc=self.seq_start_loc[self.num_prefills:] if self.seq_start_loc is not None else None, context_lens_tensor=None, - block_tables=self.block_tables[self.num_prefills:], + block_tables=block_tables, use_cuda_graph=self.use_cuda_graph, - ) + # Begin encoder & cross attn fields below... 
+ encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) return self._cached_decode_metadata def advance_step(self, @@ -296,6 +379,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -326,6 +412,12 @@ def _add_seq_group( self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -416,6 +508,8 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + query_start_loc = list(accumulate(query_lens, initial=0)) + seq_start_loc = list(accumulate(seq_lens, initial=0)) num_seqs = len(seq_lens) if use_captured_graph: @@ -438,24 +532,18 @@ def build(self, seq_lens: List[int], query_lens: List[int], device, self.runner.pin_memory) seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, self.runner.pin_memory) - query_lens_tensor = async_tensor_h2d(query_lens, torch.long, device, - self.runner.pin_memory) slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, device, self.runner.pin_memory) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) + query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, + device, + self.runner.pin_memory) + seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, + device, self.runner.pin_memory) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } return FlashAttentionMetadata( num_prefills=self.num_prefills, @@ -463,13 +551,14 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, + multi_modal_placeholder_index_maps=placeholder_index_maps, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_decode_query_len=max_decode_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, + query_start_loc=query_start_loc_tensor, + seq_start_loc=seq_start_loc_tensor, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=use_captured_graph, @@ -565,16 +654,20 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - 
raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashAttentionImpl") - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") + if (attn_type == AttentionType.ENCODER + and (not attn_metadata.is_all_encoder_attn_metadata_set)): + raise AttributeError("Encoder attention requires setting " + "encoder metadata attributes.") + elif (attn_type == AttentionType.ENCODER_DECODER + and (not attn_metadata.is_all_cross_attn_metadata_set)): + raise AttributeError("Encoder/decoder cross-attention " + "requires setting cross-attention " + "metadata attributes.") + output = torch.ops.vllm.unified_flash_attention( query, key, @@ -587,6 +680,7 @@ def forward( k_scale, v_scale, self.scale, + attn_type.value, self.sliding_window, self.alibi_slopes, self.logits_soft_cap, @@ -595,8 +689,89 @@ def forward( return output -@torch.library.custom_op("vllm::unified_flash_attention", - mutates_args=["kv_cache"]) +def _get_query_key_seq_metadata( + attn_metadata, + is_prompt: bool, + attn_type: AttentionType, +) -> tuple: + """ + Returns sequence metadata for key and query based on the specified + attention type and whether input is a prompt. + + This function computes the starting locations and maximum sequence lengths + for key and query sequences for different attention types. + + Args: + attn_metadata: The attention metadata object + is_prompt (bool): A flag indicating if the input is a prompt + attn_type (AttentionType): The type of attention being used. + + Returns: + tuple: A tuple containing four integers: + - Starting location for the query sequence. + - Maximum sequence length for the query sequence. + - Starting location for the key sequence. + - Maximum sequence length for the key sequence. + + Raises: + AttributeError: If an invalid attention type is provided. + """ + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_start_loc, max_seq_len, + attn_metadata.seq_start_loc, max_seq_len) + + elif attn_type == AttentionType.ENCODER_DECODER: + # This is cross attention between the where the key + # is the precomputed encoder attention and query + # is the input sequence. + # Choose query max length based on whether it is prompt + # or not. + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_start_loc, max_seq_len, + attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len) + elif attn_type == AttentionType.ENCODER: + # For encoder attention both the query and the key are same i.e the + # encoder sequence. + return (attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len, + attn_metadata.encoder_seq_start_loc, + attn_metadata.max_encoder_seq_len) + elif attn_type == AttentionType.ENCODER_ONLY: + assert is_prompt, "Should not have decode for encoder only model." 
+ return (attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len, + attn_metadata.seq_start_loc, attn_metadata.max_prefill_seq_len) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +def _get_causal_option(attn_type: AttentionType) -> bool: + """ + Determine whether the given attention type is suitable for causal + attention mechanisms. + + Args: + attn_type (AttentionType): The type of attention being evaluated + + Returns: + bool: Returns `True` if the attention type is suitable for causal + attention (i.e., not encoder, encoder-only, or encoder-decoder), + otherwise returns `False`. + """ + return not (attn_type == AttentionType.ENCODER + or attn_type == AttentionType.ENCODER_ONLY + or attn_type == AttentionType.ENCODER_DECODER) + + def unified_flash_attention( query: torch.Tensor, key: torch.Tensor, @@ -609,60 +784,76 @@ def unified_flash_attention( k_scale: float, v_scale: float, softmax_scale: float, + attn_type_int_val: int, window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: + # Convert integer attn_type to enum + try: + attn_type = AttentionType(attn_type_int_val) + except ValueError as err: + raise AttributeError( + f"Invalid attention type {str(attn_type_int_val)}") from err + current_metadata = get_forward_context() assert current_metadata is not None assert isinstance(current_metadata, FlashAttentionMetadata) attn_metadata: FlashAttentionMetadata = current_metadata num_tokens, hidden_size = query.shape + # Reshape the query, key, and value tensors. query = query.view(-1, num_heads, head_size) - key = key.view(-1, num_kv_heads, head_size) - value = value.view(-1, num_kv_heads, head_size) + if (key is not None) and (value is not None): + key = key.view(-1, num_kv_heads, head_size) + value = value.view(-1, num_kv_heads, head_size) if kv_cache.numel() > 0: key_cache = kv_cache[0] value_cache = kv_cache[1] + # We skip updating the KV cache under two conditions: + # a. When the Attention Type is ENCODER. In this phase, we compute + # only the encoder attention without updating the cache. + # b. When both Key and Value are None. This occurs during + # cross-attention computation in the decoding phase, where the KV + # cache is already populated with the cross-attention tensor. + # Thus, we skip cache updates during this time. + if (attn_type != AttentionType.ENCODER) and (key is not None) and ( + value is not None): + if attn_type == AttentionType.ENCODER_DECODER: + # Update cross-attention KV cache (prefill-only) + updated_slot_mapping = attn_metadata.cross_slot_mapping + else: + # Update self-attention KV cache (prefill/decode) + updated_slot_mapping = attn_metadata.slot_mapping + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + kv_cache[0], + kv_cache[1], + updated_slot_mapping.flatten(), # type: ignore[union-attr] + kv_cache_dtype, + k_scale, + v_scale, + ) - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. 
- torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], - attn_metadata.slot_mapping.flatten(), - kv_cache_dtype, - k_scale, - v_scale, - ) - - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa - assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ - f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa - - # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] + (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) = \ + get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) + decode_query = query[num_prefill_query_tokens:] # QKV for prefill. - query = query[:num_prefill_tokens] - key = key[:num_prefill_tokens] - value = value[:num_prefill_tokens] - - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens + query = query[:num_prefill_query_tokens] + assert query.shape[0] == num_prefill_query_tokens + assert decode_query.shape[0] == num_decode_query_tokens prefill_output: Optional[torch.Tensor] = None decode_output: Optional[torch.Tensor] = None - if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if (kv_cache.numel() == 0 or prefill_meta.block_tables is None @@ -670,22 +861,30 @@ def unified_flash_attention( # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. + q_seq_start_loc, q_seq_len, k_seq_start_loc, k_seq_len = \ + _get_query_key_seq_metadata(prefill_meta, True, attn_type) + + key = key[:num_prefill_kv_tokens] + value = value[:num_prefill_kv_tokens] + prefill_output = flash_attn_varlen_func( q=query, k=key, v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_q=prefill_meta.max_prefill_seq_len, - max_seqlen_k=prefill_meta.max_prefill_seq_len, + cu_seqlens_q=q_seq_start_loc, + cu_seqlens_k=k_seq_start_loc, + max_seqlen_q=q_seq_len, + max_seqlen_k=k_seq_len, softmax_scale=softmax_scale, - causal=True, + causal=_get_causal_option(attn_type), window_size=window_size, alibi_slopes=alibi_slopes, softcap=logits_soft_cap, ) else: # prefix-enabled attention + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support prefix caching") assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) prefill_output = flash_attn_varlen_func( # noqa @@ -708,8 +907,12 @@ def unified_flash_attention( # Decoding run. # Use flash_attn_varlen_func kernel for speculative decoding # because different queries might have different lengths. + assert decode_meta.max_decode_query_len is not None + # use only for actual varlen decoding if decode_meta.max_decode_query_len > 1: + assert attn_type == AttentionType.DECODER, ( + "Only decoder-only models support max_decode_query_len > 1") decode_output = flash_attn_varlen_func( q=decode_query, k=key_cache, @@ -727,12 +930,17 @@ def unified_flash_attention( ) else: # Use flash_attn_with_kvcache for normal decoding. 
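+            # Illustrative sketch (not part of the diff): the tuple unpacked
+            # below comes from get_seq_len_block_table_args(), the shared
+            # helper this diff adds to vllm/attention/backends/utils.py.
+            # What it hands back for the two attention types that reach this
+            # decode path (decode_meta refers to the metadata in scope here):
+            #
+            #   get_seq_len_block_table_args(
+            #       decode_meta, False, AttentionType.DECODER)
+            #   # -> (decode_meta.seq_lens_tensor,
+            #   #     decode_meta.max_decode_seq_len,
+            #   #     decode_meta.block_tables)
+            #
+            #   get_seq_len_block_table_args(
+            #       decode_meta, False, AttentionType.ENCODER_DECODER)
+            #   # -> (decode_meta.encoder_seq_lens_tensor,
+            #   #     decode_meta.max_encoder_seq_len,
+            #   #     decode_meta.cross_block_tables)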
+ ( + seq_lens_arg, + _, + block_tables_arg, + ) = get_seq_len_block_table_args(decode_meta, False, attn_type) decode_output = flash_attn_with_kvcache( q=decode_query.unsqueeze(1), k_cache=key_cache, v_cache=value_cache, - block_table=decode_meta.block_tables, - cache_seqlens=decode_meta.seq_lens_tensor, + block_table=block_tables_arg, + cache_seqlens=seq_lens_arg, softmax_scale=softmax_scale, causal=True, window_size=window_size, @@ -742,21 +950,18 @@ def unified_flash_attention( if prefill_output is None: assert decode_output is not None - return decode_output.view(num_decode_tokens, hidden_size) + return decode_output.view(num_decode_query_tokens, hidden_size) if decode_output is None: assert prefill_output is not None - return prefill_output.view(num_prefill_tokens, hidden_size) + return prefill_output.view(num_prefill_query_tokens, hidden_size) - # Chunked prefill does not work with speculative decoding. - # Therefore, the query length for decode should be 1 in chunked prefill. assert decode_meta is not None decode_output = decode_output.squeeze(1) output = torch.cat([prefill_output, decode_output], dim=0) return output.view(num_tokens, hidden_size) -@unified_flash_attention.register_fake -def _( +def unified_flash_attention_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -768,8 +973,17 @@ def _( k_scale: float, v_scale: float, softmax_scale: float, + attn_type_int_val: int, window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: return torch.empty_like(query) + + +direct_register_custom_op( + op_name="unified_flash_attention", + op_func=unified_flash_attention, + mutates_args=["kv_cache"], + fake_impl=unified_flash_attention_fake, +) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 5ea101ae0432f..107e3bbf79666 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,7 +1,10 @@ +from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Type +from vllm.multimodal import MultiModalPlaceholderMap + try: from flashinfer import BatchDecodeWithPagedKVCacheWrapper from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper @@ -28,8 +31,8 @@ is_block_tables_empty) from vllm.attention.ops.paged_attn import PagedAttention from vllm.forward_context import get_forward_context -from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, - make_tensor_with_pad) +from vllm.utils import (async_tensor_h2d, direct_register_custom_op, + get_kv_cache_torch_dtype, make_tensor_with_pad) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, @@ -215,6 +218,7 @@ def graph_capture_get_metadata_for_batch( attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], + multi_modal_placeholder_index_maps=None, num_prefill_tokens=0, num_decode_tokens=batch_size, max_prefill_seq_len=0, @@ -470,6 +474,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -519,6 +526,11 @@ 
def _add_seq_group( inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -651,6 +663,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -694,6 +711,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], decode_query_len=decode_query_len, num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, max_prefill_seq_len=max_prefill_seq_len, @@ -759,8 +777,6 @@ def forward( v_scale: float = 1.0, attn_type: AttentionType = AttentionType.DECODER, ) -> torch.Tensor: - assert k_scale == 1.0 and v_scale == 1.0, ( - "key/v_scale is not supported in FlashInfer.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " "encoder/decoder cross-attention " @@ -785,8 +801,6 @@ def forward( ) -@torch.library.custom_op("vllm::unified_flash_infer", - mutates_args=["kv_cache"]) def unified_flash_infer( query: torch.Tensor, key: torch.Tensor, @@ -876,7 +890,12 @@ def unified_flash_infer( assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None prefill_output = prefill_meta.prefill_wrapper.forward( - query, kv_cache, logits_soft_cap=logits_soft_cap, causal=True) + query, + kv_cache, + logits_soft_cap=logits_soft_cap, + causal=True, + k_scale=k_scale, + v_scale=v_scale) if decode_meta := attn_metadata.decode_metadata: assert attn_metadata.decode_metadata is not None assert attn_metadata.decode_metadata.decode_wrapper is not None @@ -906,8 +925,7 @@ def unified_flash_infer( return output.view(num_tokens, hidden_size) -@unified_flash_infer.register_fake -def _( +def unified_flash_infer_fake( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -924,3 +942,11 @@ def _( logits_soft_cap: Optional[float] = None, ) -> torch.Tensor: return torch.empty_like(query).contiguous() + + +direct_register_custom_op( + op_name="unified_flash_infer", + op_func=unified_flash_infer, + mutates_args=["kv_cache"], + fake_impl=unified_flash_infer_fake, +) diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py new file mode 100644 index 0000000000000..a8f4b09b67274 --- /dev/null +++ b/vllm/attention/backends/hpu_attn.py @@ -0,0 +1,264 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import os +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import torch +import vllm_hpu_extension.ops as ops +from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.attention.backends.utils import CommonAttentionState +from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, + HPUPagedAttentionMetadata) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class HPUAttentionBackend(AttentionBackend): + + @staticmethod + def get_impl_cls() -> Type["HPUAttentionImpl"]: + return HPUAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return HPUAttentionMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size, + num_kv_heads, head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + HPUPagedAttention.copy_blocks(kv_caches, src_to_dists) + + +@dataclass +class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata): + """Metadata for HPUAttentionbackend.""" + # Currently, input sequences can only contain all prompts + # or all decoding. True if all sequences are prompts. + is_prompt: bool + attn_bias: Optional[torch.Tensor] + seq_lens_tensor: Optional[torch.Tensor] + + +class HPUAttentionImpl(AttentionImpl, torch.nn.Module): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| + + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. 
+ """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + max_seq_len: int = 4096, + ) -> None: + super(AttentionImpl, self).__init__() + self.kv_cache_dtype = kv_cache_dtype + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.matmul_qk = Matmul() + self.softmax = Softmax() + self.matmul_av = Matmul() + self.k_cache = VLLMKVCache() + self.v_cache = VLLMKVCache() + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.sliding_window = sliding_window + self.alibi_slopes = alibi_slopes + if alibi_slopes is not None: + alibi_slopes_tensor = torch.tensor(alibi_slopes, + dtype=torch.bfloat16) + self.alibi_slopes = alibi_slopes_tensor + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + if self.prefill_usefusedsdpa: + assert alibi_slopes is None, \ + 'Prefill with FusedSDPA not supported with alibi slopes!' + + suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes() + if head_size not in suppored_head_sizes: + raise ValueError( + f"Head size {head_size} is not supported by PagedAttention. " + f"Supported head sizes are: {suppored_head_sizes}.") + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: HPUAttentionMetadata, + k_scale: float = 1.0, + v_scale: float = 1.0, + attn_type: AttentionType = AttentionType.DECODER, + ) -> torch.Tensor: + """Forward pass with xFormers and PagedAttention. + + Args: + query: shape = [num_tokens, num_heads * head_size] + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + attn_metadata: Metadata for attention. + Returns: + shape = [num_tokens, num_heads * head_size] + """ + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HPUAttentionImpl") + batch_size, seq_len, hidden_size = query.shape + _, seq_len_kv, _ = key.shape + + query = query.view(-1, self.num_heads, self.head_size) + key = key.view(-1, self.num_kv_heads, self.head_size) + value = value.view(-1, self.num_kv_heads, self.head_size) + block_indices = attn_metadata.block_indices + block_offsets = attn_metadata.block_offsets + if attn_metadata.is_prompt: + key = key.unflatten(0, (block_indices.size(0), -1)) + value = value.unflatten(0, (block_indices.size(0), -1)) + if kv_cache is not None: + key_cache, value_cache = HPUPagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + # If kv_cache is not provided, the new key and value tensors are + # not cached. This happens during the initial memory profiling run. + key_cache = self.k_cache(key, key_cache, block_indices, + block_offsets) + value_cache = self.v_cache(value, value_cache, block_indices, + block_offsets) + + if attn_metadata.is_prompt: + # Prompt run. 
+ if not self.prefill_usefusedsdpa: + # TODO: move this outside of model + assert attn_metadata.attn_bias is not None, \ + 'attn_bias must be set before calling model.forward!' + attn_bias = attn_metadata.attn_bias + if self.alibi_slopes is not None: + position_bias = _make_alibi_bias(self.alibi_slopes, + self.num_kv_heads, + attn_bias.dtype, + attn_bias.shape[-1]) + attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1)) + attn_bias.add_(position_bias) + else: + attn_bias = None + + query_shape = (batch_size, seq_len, self.num_heads, self.head_size) + kv_shape = (batch_size, seq_len_kv, self.num_kv_heads, + self.head_size) + out = ops.prompt_attention( + query.view(query_shape), + key.view(kv_shape), + value.view(kv_shape), + attn_bias=attn_bias, + p=0.0, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + softmax_op=self.softmax, + matmul_av_op=self.matmul_av, + ) + output = out.reshape(batch_size, seq_len, hidden_size) + else: + # Decoding run. + output = HPUPagedAttention.forward_decode( + query=query, + key_cache=key_cache, + value_cache=value_cache, + block_list=attn_metadata.block_list, + block_mapping=attn_metadata.block_mapping, + block_bias=attn_metadata.attn_bias, + block_scales=attn_metadata.block_scales, + scale=self.scale, + matmul_qk_op=self.matmul_qk, + matmul_av_op=self.matmul_av, + keys_fetch_func=self.k_cache.fetch_from_cache, + values_fetch_func=self.v_cache.fetch_from_cache) + # Reshape the output tensor. + return output.view(batch_size, seq_len, hidden_size) + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_len: int, +) -> torch.Tensor: + bias = torch.arange(seq_len, dtype=dtype) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. 
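+    # Illustrative sketch (not part of the diff): the subtraction on the next
+    # line builds the (j - i) relative-position matrix that the ALiBi slopes
+    # then scale per head. A tiny standalone check of that identity:
+    #
+    #     import torch
+    #     pos = torch.arange(3, dtype=torch.float32)
+    #     print(pos[None, :] - pos[:, None])
+    #     # tensor([[ 0.,  1.,  2.],
+    #     #         [-1.,  0.,  1.],
+    #     #         [-2., -1.,  0.]])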
+ bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, # batch size + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + return bias diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 1eb5fe10d76db..87bdb1e0e6565 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -119,8 +119,6 @@ def __init__( if blocksparse_params is not None: raise ValueError( "IPEX backend does not support block-sparse attention.") - if logits_soft_cap is not None: - raise ValueError("IPEX backend does not support logits_soft_cap.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -135,6 +133,9 @@ def __init__( self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = (self.alibi_slopes is not None or self.sliding_window is not None) + if logits_soft_cap is None: + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap supported_head_sizes = PagedAttention.get_supported_head_sizes() if head_size not in supported_head_sizes: @@ -239,20 +240,23 @@ def forward( (num_tokens, self.num_heads, self.head_size), dtype=query.dtype, device=query.device) - ipex_ops.varlen_attention(query, - key, - value, - output, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None) + ipex_ops.varlen_attention( + query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None, + logits_soft_cap=self.logits_soft_cap, + ) else: # prefix-enabled attention raise RuntimeError( diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py index 6fddfc2002120..be06d16009988 100644 --- a/vllm/attention/backends/openvino.py +++ b/vllm/attention/backends/openvino.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import openvino as ov import torch @@ -7,6 +7,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionMetadata) from vllm.attention.backends.utils import CommonAttentionState +from vllm.multimodal import MultiModalPlaceholderMap def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor, @@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata: # Shape: scalar # Type: i32 max_context_len: torch.Tensor + + # The index maps that relate multi-modal embeddings to the corresponding + # placeholders. + # + # N.B. These aren't really related to attention and don't belong on this + # type -- this is just a temporary solution to make them available to + # `model_executable`. 
+ multi_modal_placeholder_index_maps: Optional[Dict[ + str, MultiModalPlaceholderMap.IndexMap]] diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 4116fbf00020c..888adbffb8578 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -1,5 +1,6 @@ +from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type import torch @@ -7,6 +8,7 @@ AttentionMetadata, AttentionMetadataBuilder) from vllm.attention.backends.utils import CommonAttentionState +from vllm.multimodal import MultiModalPlaceholderMap if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUBuilder @@ -135,6 +137,8 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_decode_query_len=0, @@ -167,6 +171,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_decode_query_len=self.max_decode_query_len, @@ -189,6 +194,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -213,6 +221,12 @@ def _add_seq_group( self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -280,6 +294,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, dtype=torch.int32, device=device) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } torch.cumsum(seq_lens_tensor, dim=0, dtype=seq_start_loc.dtype, @@ -296,6 +315,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], return PlaceholderAttentionMetadata( num_prefills=self.num_prefills, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 30859dfa60634..2bae370eaa90f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -150,6 +150,8 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=self.slot_mapping[:self.num_prefill_tokens], + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -178,6 +180,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, @@ -189,6 +192,12 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: block_tables=self.block_tables[self.num_prefills:], use_cuda_graph=self.use_cuda_graph, ) + # Batch may be composed of prefill|decodes, adjust query start indices + # to refer to the start of decodes when the two are split apart. + # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + if self._cached_decode_metadata.query_start_loc is not None: + qs = self._cached_decode_metadata.query_start_loc + self._cached_decode_metadata.query_start_loc = qs - qs[0] return self._cached_decode_metadata def advance_step(self, diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index f985f70728a60..563178d3ab60d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -158,7 +158,8 @@ def get_seq_lens( * Appropriate sequence lengths tensor for key & value ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): seq_lens_q = self.seq_lens seq_lens_kv = self.seq_lens elif attn_type == AttentionType.ENCODER: @@ -189,7 +190,8 @@ def get_attn_bias( * Appropriate attention bias value given the attention type ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): return self.attn_bias elif attn_type == AttentionType.ENCODER: return self.encoder_attn_bias @@ -215,7 +217,8 @@ def set_attn_bias( encoder/decoder cross-attention ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): self.attn_bias = attn_bias elif attn_type == AttentionType.ENCODER: self.encoder_attn_bias = attn_bias @@ -252,7 +255,8 @@ def get_seq_len_block_table_args( * Appropriate block tables (or None) ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): # Decoder self-attention # Choose max_seq_len based on whether we are in prompt_run return (self.seq_lens_tensor, self.max_decode_seq_len, @@ -420,6 +424,8 @@ def forward( "Torch SDPA backend doesn't support prefix decoding.") if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") # Decoding run. 
( seq_lens_arg, diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index d1a44f3e8bfa6..12800668af223 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -1,12 +1,16 @@ """Attention backend utils""" +from collections import defaultdict from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Dict, List, Type, TypeVar, Union +from itertools import accumulate +from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Type, TypeVar, Union import numpy as np import torch from vllm.attention import (AttentionMetadata, AttentionMetadataBuilder, AttentionState) +from vllm.attention.backends.abstract import AttentionType +from vllm.multimodal import MultiModalPlaceholderMap from vllm.utils import async_tensor_h2d, make_tensor_with_pad if TYPE_CHECKING: @@ -123,6 +127,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.context_lens: List[int] = [] self.block_tables: List[List[int]] = [] self.curr_seq_lens: List[int] = [] + self.multimodal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) self.num_prefills = 0 self.num_prefill_tokens = 0 self.num_decode_tokens = 0 @@ -138,7 +145,6 @@ def _add_seq_group( chunked_prefill_enabled: bool): is_prompt = inter_data.is_prompt block_tables = inter_data.block_tables - computed_block_nums = inter_data.computed_block_nums for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, curr_sliding_window_block) in zip( @@ -148,6 +154,12 @@ def _add_seq_group( inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: + mm_maps = inter_data.multi_modal_placeholder_maps + if mm_maps: + for modality, placeholders in mm_maps.items(): + self.multimodal_placeholder_maps[modality].extend( + placeholders) + self.num_prefills += 1 self.num_prefill_tokens += token_len self.prefill_seq_lens.append(seq_len) @@ -164,10 +176,14 @@ def _add_seq_group( # NOTE: This only works for oooooooxxx style attention. block_table = [] if inter_data.prefix_cache_hit: - block_table = computed_block_nums + block_table = block_tables[seq_id] elif ((chunked_prefill_enabled or not is_prompt) and block_tables is not None): - block_table = block_tables[seq_id][-curr_sliding_window_block:] + if curr_sliding_window_block == 0: + block_table = block_tables[seq_id] + else: + block_table = block_tables[seq_id][ + -curr_sliding_window_block:] self.block_tables.append(block_table) # Compute slot mapping. 
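# Illustrative sketch (not part of the diff): the builder hunks in this patch
# (flash_attn above, the build() hunk below) now compute query_start_loc and
# seq_start_loc on the host with itertools.accumulate and copy them over with
# async_tensor_h2d, instead of launching torch.cumsum on freshly transferred
# device tensors. Minimal standalone sketch of the host-side computation:

from itertools import accumulate

seq_lens = [4, 6]
print(list(accumulate(seq_lens, initial=0)))  # [0, 4, 10]
# This list is what async_tensor_h2d then ships to the GPU as an int32 tensor.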
@@ -201,6 +217,8 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + query_start_loc = list(accumulate(query_lens, initial=0)) + seq_start_loc = list(accumulate(seq_lens, initial=0)) if use_captured_graph: self.slot_mapping.extend([PAD_SLOT_ID] * cuda_graph_pad_size) @@ -229,28 +247,23 @@ def build(self, seq_lens: List[int], query_lens: List[int], device, self.runner.pin_memory) seq_lens_tensor = async_tensor_h2d(seq_lens, torch.int, device, self.runner.pin_memory) - query_lens_tensor = async_tensor_h2d(query_lens, torch.long, device, - self.runner.pin_memory) slot_mapping_tensor = async_tensor_h2d(self.slot_mapping, torch.long, device, self.runner.pin_memory) - query_start_loc = torch.zeros(query_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, - dtype=torch.int32, - device=device) - torch.cumsum(seq_lens_tensor, - dim=0, - dtype=seq_start_loc.dtype, - out=seq_start_loc[1:]) - torch.cumsum(query_lens_tensor, - dim=0, - dtype=query_start_loc.dtype, - out=query_start_loc[1:]) + query_start_loc_tensor = async_tensor_h2d(query_start_loc, torch.int32, + device, + self.runner.pin_memory) + seq_start_loc_tensor = async_tensor_h2d(seq_start_loc, torch.int32, + device, self.runner.pin_memory) + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + self.multimodal_placeholder_maps.items() + } return self._metadata_cls( # type: ignore num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -258,8 +271,8 @@ def build(self, seq_lens: List[int], query_lens: List[int], max_query_len=max_query_len, max_prefill_seq_len=max_prefill_seq_len, max_decode_seq_len=max_decode_seq_len, - query_start_loc=query_start_loc, - seq_start_loc=seq_start_loc, + query_start_loc=query_start_loc_tensor, + seq_start_loc=seq_start_loc_tensor, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=use_captured_graph, @@ -302,6 +315,7 @@ def graph_capture_get_metadata_for_batch( num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=self._graph_slot_mapping[:batch_size], + multi_modal_placeholder_index_maps=None, seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], max_query_len=1, @@ -315,11 +329,13 @@ def graph_capture_get_metadata_for_batch( use_cuda_graph=True, ) if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. 
+ assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or " \ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._update_captured_metadata_for_enc_dec_model( batch_size=batch_size, attn_metadata=attn_metadata) @@ -335,11 +351,13 @@ def get_graph_input_buffers( "block_tables": attn_metadata.decode_metadata.block_tables, } if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. + assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or "\ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._add_additonal_input_buffers_for_enc_dec_model( attn_metadata=attn_metadata, input_buffers=input_buffers) return input_buffers @@ -354,11 +372,13 @@ def prepare_graph_input_buffers( input_buffers["block_tables"].copy_( attn_metadata.decode_metadata.block_tables, non_blocking=True) if is_encoder_decoder_model: - # The encoder decoder model works only with XFormers backend. - # Assert the same. - assert self.runner.attn_backend.get_name() == "XFORMERS", \ - f"Expected attn_backend name to be 'XFORMERS', but "\ - f" got '{self.runner.attn_backend.get_name()}'" + # The encoder decoder model works only with XFormers and + # Flash Attention backend. Assert the same. + assert self.runner.attn_backend.get_name() in\ + ["XFORMERS", "FLASH_ATTN"], \ + f"Expected attn_backend name to be either 'XFORMERS' or "\ + f"'FLASH_ATTN', but "\ + f"got '{self.runner.attn_backend.get_name()}'" self._prepare_input_buffers_for_enc_dec_model( attn_metadata, input_buffers) @@ -390,6 +410,7 @@ def _update_captured_metadata_for_enc_dec_model(self, batch_size: int, attn_metadata.encoder_seq_lens_tensor = torch.full( (batch_size, ), 1, dtype=torch.int).cuda() attn_metadata.max_encoder_seq_len = self.runner.max_seq_len_to_capture + attn_metadata.num_encoder_tokens = 0 def _add_additonal_input_buffers_for_enc_dec_model( self, attn_metadata, input_buffers: Dict[str, Any]): @@ -432,3 +453,122 @@ def _prepare_input_buffers_for_enc_dec_model(self, attn_metadata, input_buffers["cross_block_tables"].copy_( attn_metadata.decode_metadata.cross_block_tables, non_blocking=True) + + +def is_all_encoder_attn_metadata_set(attn_metadata): + ''' + All attention metadata required for encoder attention is set. + ''' + return ((attn_metadata.encoder_seq_lens is not None) + and (attn_metadata.encoder_seq_lens_tensor is not None) + and (attn_metadata.max_encoder_seq_len is not None)) + + +def is_all_cross_attn_metadata_set(attn_metadata): + ''' + All attention metadata required for enc/dec cross-attention is set. + + Superset of encoder attention required metadata. 
+ ''' + return (attn_metadata.is_all_encoder_attn_metadata_set + and (attn_metadata.cross_slot_mapping is not None) + and (attn_metadata.cross_block_tables is not None)) + + +def get_seq_len_block_table_args( + attn_metadata, + is_prompt: bool, + attn_type: AttentionType, +) -> tuple: + ''' + The particular choice of sequence-length- and block-table-related + attributes which should be extracted from attn_metadata is dependent + on the type of attention operation. + + Decoder attn -> select entirely decoder self-attention-related fields + Encoder/decoder cross-attn -> select encoder sequence lengths & + cross-attn block-tables fields + Encoder attn -> select encoder sequence lengths fields & no block tables + + Arguments: + + * attn_metadata: Attention metadata structure associated with attention op + * is_prompt: True if prefill, False otherwise + * attn_type: encoder attention, decoder self-attention, + encoder/decoder cross-attention + + Returns: + + * Appropriate sequence-lengths tensor + * Appropriate max sequence-length scalar + * Appropriate block tables (or None) + ''' + + if attn_type == AttentionType.DECODER: + # Decoder self-attention + # Choose max_seq_len based on whether we are in prompt_run + if is_prompt: + max_seq_len = attn_metadata.max_prefill_seq_len + else: + max_seq_len = attn_metadata.max_decode_seq_len + return (attn_metadata.seq_lens_tensor, max_seq_len, + attn_metadata.block_tables) + elif attn_type == AttentionType.ENCODER_DECODER: + # Enc/dec cross-attention KVs match encoder sequence length; + # cross-attention utilizes special "cross" block tables + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, + attn_metadata.cross_block_tables) + elif attn_type == AttentionType.ENCODER: + # No block tables associated with encoder attention + return (attn_metadata.encoder_seq_lens_tensor, + attn_metadata.max_encoder_seq_len, None) + else: + raise AttributeError(f"Invalid attention type {str(attn_type)}") + + +def get_num_prefill_decode_query_kv_tokens( + attn_metadata, + attn_type: AttentionType, +) -> Tuple[int, int, int]: + """ + Calculate the number of prefill and decode tokens for query, key/value + based on the attention metadata and the specified attention type. + + Args: + attn_metadata (FlashAttentionMetadata): Attention Metadata object. + attn_type (AttentionType): The type of attention being used. + Returns: + Tuple[int, int, int]: A tuple containing three integers: + - The number of prefill query tokens. + - The number of prefill key/value tokens. + - The number of decode query tokens. + + Raises: + AssertionError: If the number of encoder tokens in `attn_metadata` + is `None` when required for the calculations. + """ + num_prefill_query_tokens = 0 + num_decode_query_tokens = 0 + num_prefill_kv_tokens = 0 + if attn_type == AttentionType.ENCODER: + # Encoder attention is only invoked during prefill phase. + # The same input servers a both query and key. + assert attn_metadata.num_encoder_tokens is not None + num_prefill_query_tokens = attn_metadata.num_encoder_tokens + num_prefill_kv_tokens = attn_metadata.num_encoder_tokens + num_decode_query_tokens = 0 + elif attn_type == AttentionType.ENCODER_DECODER: + assert attn_metadata.num_encoder_tokens is not None + num_prefill_query_tokens = attn_metadata.num_prefill_tokens + # The key is the encoder/cross-attention. 
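+        # Illustrative sketch (not part of the diff), with hypothetical
+        # counts: for a prefill-only encoder/decoder batch whose decoder
+        # prompt holds 5 tokens and whose encoder output holds 12 tokens
+        # (num_prefill_tokens=5, num_encoder_tokens=12, num_decode_tokens=0),
+        # this branch yields
+        #     num_prefill_query_tokens = 5   # queries come from the decoder
+        #     num_prefill_kv_tokens    = 12  # keys/values from the encoder
+        #     num_decode_query_tokens  = 0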
+ num_prefill_kv_tokens = attn_metadata.num_encoder_tokens + num_decode_query_tokens = attn_metadata.num_decode_tokens + else: # attn_type == AttentionType.DECODER or + # attn_type == AttentionType.ENCODER_ONLY + num_prefill_query_tokens = attn_metadata.num_prefill_tokens + num_prefill_kv_tokens = attn_metadata.num_prefill_tokens + num_decode_query_tokens = attn_metadata.num_decode_tokens + + return (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 5aaf13d8ea744..83d03606524dc 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -11,8 +11,10 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import (CommonAttentionState, - CommonMetadataBuilder) +from vllm.attention.backends.utils import ( + CommonAttentionState, CommonMetadataBuilder, + get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, + is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set) from vllm.attention.ops.paged_attn import (PagedAttention, PagedAttentionMetadata) from vllm.logger import init_logger @@ -135,6 +137,11 @@ class XFormersMetadata(AttentionMetadata, PagedAttentionMetadata): # Encoder sequence lengths representation encoder_seq_lens: Optional[List[int]] = None encoder_seq_lens_tensor: Optional[torch.Tensor] = None + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + encoder_seq_start_loc: Optional[torch.Tensor] = None # Maximum sequence length among encoder sequences max_encoder_seq_len: Optional[int] = None @@ -162,9 +169,7 @@ def is_all_encoder_attn_metadata_set(self): ''' All attention metadata required for encoder attention is set. ''' - return ((self.encoder_seq_lens is not None) - and (self.encoder_seq_lens_tensor is not None) - and (self.max_encoder_seq_len is not None)) + return is_all_encoder_attn_metadata_set(self) @property def is_all_cross_attn_metadata_set(self): @@ -173,9 +178,7 @@ def is_all_cross_attn_metadata_set(self): Superset of encoder attention required metadata. ''' - return (self.is_all_encoder_attn_metadata_set - and (self.cross_slot_mapping is not None) - and (self.cross_block_tables is not None)) + return is_all_cross_attn_metadata_set(self) @property def prefill_metadata(self) -> Optional["XFormersMetadata"]: @@ -212,6 +215,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. 
+ multi_modal_placeholder_index_maps, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, @@ -255,6 +260,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: num_prefill_tokens=0, num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens_tensor=seq_lens_tensor, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, @@ -266,6 +272,13 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: max_encoder_seq_len=self.max_encoder_seq_len, cross_slot_mapping=self.cross_slot_mapping, cross_block_tables=self.cross_block_tables) + + # Batch may be composed of prefill|decodes, adjust query start indices + # to refer to the start of decodes when the two are split apart. + # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + if self._cached_decode_metadata.query_start_loc is not None: + qs = self._cached_decode_metadata.query_start_loc + self._cached_decode_metadata.query_start_loc = qs - qs[0] return self._cached_decode_metadata @@ -326,64 +339,6 @@ def _set_attn_bias( raise AttributeError(f"Invalid attention type {str(attn_type)}") -def _get_seq_len_block_table_args( - attn_metadata: XFormersMetadata, - is_prompt: bool, - attn_type: AttentionType, -) -> tuple: - ''' - The particular choice of sequence-length- and block-table-related - attributes which should be extracted from attn_metadata is dependent - on the type of attention operation. - - Decoder attn -> select entirely decoder self-attention-related fields - Encoder/decoder cross-attn -> select encoder sequence lengths & - cross-attn block-tables fields - Encoder attn -> select encoder sequence lengths fields & no block tables - - Arguments: - - * attn_metadata: Attention metadata structure associated with attention op - * is_prompt: True if prefill, False otherwise - * attn_type: encoder attention, decoder self-attention, - encoder/decoder cross-attention - - Returns: - - * Appropriate sequence-lengths tensor - * Appropriate max sequence-length scalar - * Appropriate block tables (or None) - ''' - - if attn_type == AttentionType.DECODER: - # Decoder self-attention - # Choose max_seq_len based on whether we are in prompt_run - if is_prompt: - max_seq_len = attn_metadata.max_prefill_seq_len - else: - max_seq_len = attn_metadata.max_decode_seq_len - return (attn_metadata.seq_lens_tensor, max_seq_len, - attn_metadata.block_tables) - elif attn_type == AttentionType.ENCODER_DECODER: - # Enc/dec cross-attention KVs match encoder sequence length; - # cross-attention utilizes special "cross" block tables - return (attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, - attn_metadata.cross_block_tables) - elif attn_type == AttentionType.ENCODER: - # No block tables associated with encoder attention - return (attn_metadata.encoder_seq_lens_tensor, - attn_metadata.max_encoder_seq_len, None) - elif attn_type == AttentionType.ENCODER_ONLY: - assert is_prompt, "Should not have decode for encoder only model." 
- - # No block tables associated with encoder attention - return (attn_metadata.seq_lens_tensor, - attn_metadata.max_prefill_seq_len, None) - else: - raise AttributeError(f"Invalid attention type {str(attn_type)}") - - class XFormersMetadataBuilder(CommonMetadataBuilder[XFormersMetadata]): _metadata_cls = XFormersMetadata @@ -571,45 +526,21 @@ def forward( updated_slot_mapping, self.kv_cache_dtype, k_scale, v_scale) - - if attn_type == AttentionType.ENCODER: - # Encoder attention - chunked prefill is not applicable; - # derive token-count from query shape & and treat them - # as 100% prefill tokens - assert attn_metadata.num_encoder_tokens is not None - num_prefill_tokens = attn_metadata.num_encoder_tokens - num_encoder_tokens = attn_metadata.num_encoder_tokens - num_decode_tokens = 0 - elif attn_type == AttentionType.DECODER: - # Decoder self-attention supports chunked prefill. - num_prefill_tokens = attn_metadata.num_prefill_tokens - num_encoder_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens - # Only enforce this shape-constraint for decoder - # self-attention - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens - else: # attn_type == AttentionType.ENCODER_DECODER - # Encoder/decoder cross-attention requires no chunked - # prefill (100% prefill or 100% decode tokens, no mix) - num_prefill_tokens = attn_metadata.num_prefill_tokens - if attn_metadata.num_encoder_tokens is not None: - num_encoder_tokens = attn_metadata.num_encoder_tokens - else: - num_encoder_tokens = attn_metadata.num_prefill_tokens - num_decode_tokens = attn_metadata.num_decode_tokens + (num_prefill_query_tokens, num_prefill_kv_tokens, + num_decode_query_tokens) = \ + get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) output = torch.empty_like(query) # Query for decode. KV is not needed because it is already cached. - decode_query = query[num_prefill_tokens:] + decode_query = query[num_prefill_query_tokens:] # QKV for prefill. - query = query[:num_prefill_tokens] + query = query[:num_prefill_query_tokens] if key is not None and value is not None: - key = key[:num_encoder_tokens] - value = value[:num_encoder_tokens] + key = key[:num_prefill_kv_tokens] + value = value[:num_prefill_kv_tokens] - assert query.shape[0] == num_prefill_tokens - assert decode_query.shape[0] == num_decode_tokens + assert query.shape[0] == num_prefill_query_tokens + assert decode_query.shape[0] == num_decode_query_tokens if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. @@ -619,8 +550,8 @@ def forward( # prefix. 
out = self._run_memory_efficient_xformers_forward( query, key, value, prefill_meta, attn_type=attn_type) - assert out.shape == output[:num_prefill_tokens].shape - output[:num_prefill_tokens] = out + assert out.shape == output[:num_prefill_query_tokens].shape + output[:num_prefill_query_tokens] = out else: assert attn_type != AttentionType.ENCODER_ONLY, ( "Encoder-only models should not have prefix attention.") @@ -649,8 +580,8 @@ def forward( k_scale, v_scale, ) - assert output[:num_prefill_tokens].shape == out.shape - output[:num_prefill_tokens] = out + assert output[:num_prefill_query_tokens].shape == out.shape + output[:num_prefill_query_tokens] = out if decode_meta := attn_metadata.decode_metadata: assert attn_type != AttentionType.ENCODER_ONLY, ( @@ -660,9 +591,9 @@ def forward( seq_lens_arg, max_seq_len_arg, block_tables_arg, - ) = _get_seq_len_block_table_args(decode_meta, False, attn_type) + ) = get_seq_len_block_table_args(decode_meta, False, attn_type) - output[num_prefill_tokens:] = PagedAttention.forward_decode( + output[num_prefill_query_tokens:] = PagedAttention.forward_decode( decode_query, key_cache, value_cache, diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index a98eb431ac7fc..350f88c8f9740 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -192,10 +192,8 @@ def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) - k2, v2 = [ - self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) - for x in [k, v] - ] + k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) + for x in [k, v]) spda_output = torch.nn.functional.scaled_dot_product_attention( q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) return self.transpose_and_unpad(spda_output, cu_seqlens) diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py new file mode 100644 index 0000000000000..4c0fb2a628361 --- /dev/null +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -0,0 +1,103 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple + +import torch +from vllm_hpu_extension import cache_ops, ops + +# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. 
+_PARTITION_SIZE = 512 + + +@dataclass +class HPUPagedAttentionMetadata: + """Metadata for PagedAttention.""" + block_list: Optional[torch.Tensor] + block_mapping: Optional[torch.Tensor] + block_usage: Optional[torch.Tensor] + block_indices: Optional[torch.Tensor] + block_offsets: Optional[torch.Tensor] + block_scales: Optional[torch.Tensor] + + +class HPUPagedAttention: + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [64, 80, 96, 112, 128, 256] + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def split_kv_cache( + kv_cache: torch.Tensor, + num_kv_heads: int, + head_size: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + key_cache = kv_cache[0] + value_cache = kv_cache[1] + return key_cache, value_cache + + @staticmethod + def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_mapping: torch.Tensor, kv_cache_dtype: str, + is_prompt: bool) -> None: + cache_ops.reshape_and_cache(key, value, key_cache, value_cache, + slot_mapping, kv_cache_dtype, is_prompt) + + @staticmethod + def forward_decode(**kwargs) -> torch.Tensor: + return ops.flat_pa(**kwargs) + + @staticmethod + def forward_prefix( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + block_tables: torch.Tensor, + subquery_start_loc: torch.Tensor, + seq_lens_tensor: torch.Tensor, + context_lens: torch.Tensor, + max_query_len: int, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Optional[int], + ) -> torch.Tensor: + raise NotImplementedError( + "forward_prefix is not implemented for HPUPagedAttention") + + @staticmethod + def swap_blocks( + src_kv_cache: torch.Tensor, + dst_kv_cache: torch.Tensor, + src_to_dst: Dict[int, int], + ) -> None: + src_key_cache = src_kv_cache[0] + dst_key_cache = dst_kv_cache[0] + cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) + + src_value_cache = src_kv_cache[1] + dst_value_cache = dst_kv_cache[1] + cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: Dict[int, List[int]], + ) -> None: + key_caches = [kv_cache[0] for kv_cache in kv_caches] + value_caches = [kv_cache[1] for kv_cache in kv_caches] + cache_ops.copy_blocks(key_caches, value_caches, src_to_dists) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 376b3136f0fb8..664707e9dc65d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -23,6 +23,7 @@ class _Backend(enum.Enum): TORCH_SDPA = enum.auto() OPENVINO = enum.auto() FLASHINFER = enum.auto() + HPU_ATTN = enum.auto() PALLAS = enum.auto() IPEX = enum.auto() NO_ATTENTION = enum.auto() @@ -88,7 +89,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: return forced_attn_backend -@lru_cache(maxsize=None) def get_attn_backend( head_size: int, dtype: torch.dtype, @@ -98,7 +98,31 @@ def get_attn_backend( is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" + # Accessing envs.* behind an @lru_cache decorator can cause the wrong + # value to be returned from the cache if the value changes between calls. + # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the + # private function. 
+ return _cached_get_attn_backend( + head_size=head_size, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + block_size=block_size, + is_attention_free=is_attention_free, + is_blocksparse=is_blocksparse, + use_v1=envs.VLLM_USE_V1, + ) + +@lru_cache(maxsize=None) +def _cached_get_attn_backend( + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + is_blocksparse: bool = False, + use_v1: bool = False, +) -> Type[AttentionBackend]: if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( @@ -106,8 +130,9 @@ def get_attn_backend( return BlocksparseFlashAttentionBackend backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, - is_attention_free) + is_attention_free, use_v1) if backend == _Backend.FLASH_ATTN: + logger.info("Using Flash Attention backend.") from vllm.attention.backends.flash_attn import ( # noqa: F401 FlashAttentionBackend) return FlashAttentionBackend @@ -145,6 +170,10 @@ def get_attn_backend( logger.info("Using Flashinfer backend.") from vllm.attention.backends.flashinfer import FlashInferBackend return FlashInferBackend + elif backend == _Backend.HPU_ATTN: + logger.info("Using HPUAttention backend.") + from vllm.attention.backends.hpu_attn import HPUAttentionBackend + return HPUAttentionBackend elif backend == _Backend.PALLAS: logger.info("Using Pallas backend.") from vllm.attention.backends.pallas import PallasAttentionBackend @@ -157,13 +186,12 @@ def get_attn_backend( raise ValueError("Invalid attention backend.") -def which_attn_to_use( - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - is_attention_free: bool, -) -> _Backend: +def which_attn_to_use(head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + use_v1: bool = False) -> _Backend: """Returns which flash attention backend to use.""" # Default case. selected_backend = _Backend.FLASH_ATTN @@ -220,7 +248,10 @@ def which_attn_to_use( logger.info("%s is not supported in AMD GPUs.", selected_backend) return _Backend.ROCM_FLASH - if envs.VLLM_USE_V1: + if current_platform.is_hpu(): + return _Backend.HPU_ATTN + + if use_v1: return _Backend.FLASH_ATTN_VLLM_V1 # FlashAttn in NVIDIA GPUs. 
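# Editor's note: the selector.py hunk above explains why @lru_cache was moved
# off the env-reading wrapper. Below is a minimal standalone sketch of that
# pitfall; the names here (FLAG, select_backend, _select_backend) are
# illustrative only and are not vLLM APIs.
from functools import lru_cache

FLAG = False  # stands in for an env-derived value such as envs.VLLM_USE_V1


@lru_cache(maxsize=None)
def select_backend_stale(head_size: int) -> str:
    # Reads the module-level flag inside the cached function, so the value
    # seen on the first call is baked into the cache entry for this key.
    return "FLASH_ATTN_VLLM_V1" if FLAG else "FLASH_ATTN"


@lru_cache(maxsize=None)
def _select_backend(head_size: int, use_v1: bool) -> str:
    # The flag is part of the cache key, so a changed value gets its own entry.
    return "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"


def select_backend(head_size: int) -> str:
    # Thin uncached wrapper mirroring the patch: read the mutable value here
    # and pass it explicitly to the cached helper.
    return _select_backend(head_size, use_v1=FLAG)


if __name__ == "__main__":
    print(select_backend_stale(64))  # FLASH_ATTN
    FLAG = True
    print(select_backend_stale(64))  # still FLASH_ATTN -- stale cache hit
    print(select_backend(64))        # FLASH_ATTN_VLLM_V1 -- fresh cache entry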
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 10cf49e19eccc..5682faa158069 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -1,17 +1,23 @@ import copy import dataclasses import operator -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from contextlib import ExitStack +from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, + Union) +from unittest.mock import patch import torch import torch.fx as fx +import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import weak_ref_tensors +from vllm.utils import combine_fx_passes, weak_ref_tensors from .config import CompilationConfig from .counter import compilation_counter +from .fusion import FusionPass from .levels import CompilationLevel +from .reshapes import RedundantReshapesPass logger = init_logger(__name__) @@ -96,28 +102,74 @@ def fix_functionalization(graph: fx.Graph): user.replace_all_uses_with(replace_node) nodes_to_remove.append(user) nodes_to_remove.append(node) + elif (node.args[0] == + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default): + # manual replace for fused_add_rms_norm_static_fp8_quant + # this is the most effective optimization for llama + # failing to do this will result in many unnecessary copies + + kwargs = node.kwargs + + result = kwargs['result'] + residual = kwargs['residual'] + + # Create a new call to + # torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + with graph.inserting_before(node): + # just insert the call to the custom op + # NOTE: don't run dead code elimination, + # otherwise this op will be removed + graph.call_function( + torch.ops._C.fused_add_rms_norm_static_fp8_quant. + default, + kwargs=kwargs) + + for user in list(node.users): + if user.op == 'call_function' and user.target == operator.getitem: # noqa + # Remove the getitem node + if user.args[1] == 1: + replace_node = result + elif user.args[1] == 2: + replace_node = residual + user.replace_all_uses_with(replace_node) + nodes_to_remove.append(user) + nodes_to_remove.append(node) elif node.args[0] == torch.ops._C.rms_norm.default: # manual replace for rms_norm kwargs = node.kwargs - input = kwargs['input'] - out = kwargs['out'] - weight = kwargs['weight'] - epsilon = kwargs['epsilon'] - # Create a new call to torch.ops._C.rotary_embedding.default - # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa + replace_node = kwargs['result'] + # Create a new call to torch.ops._C.rms_norm.default with graph.inserting_before(node): # just insert the call to the custom op # NOTE: don't run dead code elimination, # otherwise this op will be removed - graph.call_function( - torch.ops._C.rms_norm.default, - args=(out, input, weight, epsilon), - ) + graph.call_function(torch.ops._C.rms_norm.default, + kwargs=kwargs) - replace_node = out + for user in list(node.users): + if user.op == 'call_function' and user.target == operator.getitem: # noqa + user.replace_all_uses_with(replace_node) + nodes_to_remove.append(user) + nodes_to_remove.append(node) + + elif node.args[ + 0] == torch.ops._C.rms_norm_static_fp8_quant.default: # noqa + # manual replace for rms_norm_static_fp8_quant + + kwargs = node.kwargs + + replace_node = kwargs['result'] + # Create a new call to torch.ops._C.rms_norm_static_fp8_quant.default # noqa + with graph.inserting_before(node): + # just insert the call to the custom op + # 
NOTE: don't run dead code elimination, + # otherwise this op will be removed + graph.call_function( + torch.ops._C.rms_norm_static_fp8_quant.default, + kwargs=kwargs) for user in list(node.users): if user.op == 'call_function' and user.target == operator.getitem: # noqa @@ -133,7 +185,7 @@ def fix_functionalization(graph: fx.Graph): input = kwargs['input'] out = kwargs['out'] - # Create a new call to torch.ops._C.rotary_embedding.default + # Create a new call to torch.ops._C.silu_and_mul.default # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa with graph.inserting_before(node): # just insert the call to the custom op @@ -193,6 +245,7 @@ def wrap_inductor(graph, @dataclasses.dataclass class SplitItem: submod_name: str + graph_id: int is_splitting_graph: bool graph: fx.GraphModule @@ -226,9 +279,7 @@ def split_graph(graph: fx.GraphModule, outputs = [] - # sort the names to make sure the order is deterministic names = [name for (name, module) in split_gm.named_modules()] - names.sort() for name in names: if "." in name or name == "": @@ -238,11 +289,79 @@ def split_graph(graph: fx.GraphModule, module = getattr(split_gm, name) graph_id = int(name.replace("submod_", "")) - outputs.append(SplitItem(name, graph_id in split_op_graphs, module)) + outputs.append( + SplitItem(name, graph_id, (graph_id in split_op_graphs), module)) + + # sort by intetger graph_id, rather than string name + outputs.sort(key=lambda x: x.graph_id) return split_gm, outputs +# we share the global graph pool among all the backends +global_graph_pool = None + + +class PiecewiseCompileInterpreter(torch.fx.Interpreter): + """Code adapted from `torch.fx.passes.shape_prop.ShapeProp`. + It runs the given graph with fake inputs, and compile some + submodules specified by `compile_submod_names` with the given + compilation configs. + + NOTE: the order in `compile_submod_names` matters, because + it will be used to determine the order of the compiled piecewise + graphs. The first graph will handle logging, and the last graph + has some special cudagraph output handling. 
+ """ + + def __init__(self, module: torch.fx.GraphModule, + compile_submod_names: List[str], + compilation_configs: CompilationConfig, graph_pool): + super().__init__(module) + from torch._guards import detect_fake_mode + self.fake_mode = detect_fake_mode() + self.compile_submod_names = compile_submod_names + self.compilation_configs = compilation_configs + self.graph_pool = graph_pool + + def run(self, *args): + fake_args = [ + self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in args + ] + with self.fake_mode: + return super().run(*fake_args) + + def call_module(self, target: torch.fx.node.Target, + args: Tuple[torch.fx.node.Argument, + ...], kwargs: Dict[str, Any]) -> Any: + assert isinstance(target, str) + output = super().call_module(target, args, kwargs) + + if target in self.compile_submod_names: + index = self.compile_submod_names.index(target) + submod = self.fetch_attr(target) + sym_shape_indices = [ + i for i, x in enumerate(args) if isinstance(x, torch.SymInt) + ] + compiled_graph_for_general_shape = wrap_inductor( + submod, + args, + self.compilation_configs.inductor_compile_config, + runtime_shape=None, + do_logging=index == 0, + use_inductor=self.compilation_configs.use_inductor) + + self.module.__dict__[target] = PiecewiseBackend( + submod, self.compilation_configs, self.graph_pool, index, + len(self.compile_submod_names), sym_shape_indices, + compiled_graph_for_general_shape) + + compilation_counter.num_piecewise_capturable_graphs_seen += 1 + + return output + + class VllmBackend: """The compilation backend for `torch.compile` with VLLM. It is used for compilation level of `CompilationLevel.PIECEWISE`, @@ -250,6 +369,13 @@ class VllmBackend: The major work of this backend is to split the graph into piecewise graphs, and pass them to the piecewise backend. + + This backend also handles custom passes and adds them to Inductor config. + The order of the post-grad post-passes is: + 1. post_grad_passes (constructor parameter) + 2. config["post_grad_custom_post_pass"] + 3. fix_functionalization + This way, all passes operate on a functionalized graph. """ compilation_configs: CompilationConfig @@ -261,14 +387,47 @@ class VllmBackend: split_gm: fx.GraphModule piecewise_graphs: List[SplitItem] returned_callable: Callable + # Inductor passes to run on the graph pre-defunctionalization + post_grad_passes: Sequence[Callable] + sym_tensor_indices: List[int] + input_buffers: List[torch.Tensor] + + def __init__(self, post_grad_passes: Sequence[Callable] = ()): + global global_graph_pool + if global_graph_pool is None: + global_graph_pool = torch.cuda.graph_pool_handle() - def __init__(self, ): - # every instance of VllmBackend has its own graph pool - self.graph_pool = torch.cuda.graph_pool_handle() + # TODO: in the future, if we want to use multiple + # streams, it might not be safe to share a global pool. 
+ # only investigate this when we use multiple streams + self.graph_pool = global_graph_pool + self.post_grad_passes = post_grad_passes + + self.sym_tensor_indices = [] + self.input_buffers = [] # `torch.compile` is JIT compiled, so we don't need to # do anything here + def add_passes_to_config(self): + config = self.compilation_configs + passes = list(self.post_grad_passes) + + passes = passes + [RedundantReshapesPass(config)] + + if config.enable_fusion: + passes = passes + [FusionPass.instance(config)] + + inductor_config = config.inductor_compile_config + if "post_grad_custom_post_pass" in inductor_config: + passes = passes + [inductor_config["post_grad_custom_post_pass"]] + + # add the fix_functionalization pass last, so that all other + # passes operate on a functionalized graph + passes = passes + [fix_functionalization] + combined_pass = combine_fx_passes(passes) + inductor_config["post_grad_custom_post_pass"] = combined_pass + def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: compilation_counter.num_graphs_seen += 1 @@ -282,59 +441,71 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # we get the sizes to capture for cudagraph # from compilation context self.compilation_configs = CompilationConfig.select_and_init_config() + self.add_passes_to_config() self.split_gm, self.piecewise_graphs = split_graph( graph, self.compilation_configs.non_cudagraph_ops) - returned_callable: Callable # type: ignore + from torch._dynamo.utils import lazy_format_graph_code + logger.debug("%s", lazy_format_graph_code("before split", self.graph)) + logger.debug("%s", lazy_format_graph_code("after split", + self.split_gm)) - if len(self.piecewise_graphs) == 0: - compilation_counter.num_piecewise_graphs_seen += 1 - compilation_counter.num_piecewise_capturable_graphs_seen += 1 - returned_callable = PiecewiseBackend(graph, - self.compilation_configs, - self.graph_pool, - is_first_graph=True) - else: - from torch._dynamo.utils import lazy_format_graph_code - logger.debug( - "%s", lazy_format_graph_code("stiching module", self.split_gm)) - - is_first_graph = True - - for item in self.piecewise_graphs: - compilation_counter.num_piecewise_graphs_seen += 1 - compilation_counter.num_piecewise_capturable_graphs_seen += not item.is_splitting_graph # noqa - if not item.is_splitting_graph: - # cannot setattr to a module, so we need to set - # the attribute in the __dict__ - self.split_gm.__dict__[ - item.submod_name] = PiecewiseBackend( - item.graph, self.compilation_configs, - self.graph_pool, is_first_graph) - is_first_graph = False - returned_callable = self.split_gm - - self.returned_callable = returned_callable - # trigger the first compilation - # code borrowed from https://github.com/pytorch/pytorch/blob/4e3e08b71171fa34172b2362ff668553fac75f27/torch/_dynamo/backends/distributed.py#L206 # noqa - # to turn the inputs into fake tensors - import torch._guards - from torch._guards import detect_fake_mode - fake_mode = detect_fake_mode(example_inputs) - fake_args = [] - for arg in example_inputs: - if isinstance(arg, torch.Tensor) and not isinstance( - arg, torch._subclasses.FakeTensor): - fake_args.append( - torch._dynamo.utils.to_fake_tensor(arg, fake_mode)) - else: - fake_args.append(arg) - self.returned_callable(*fake_args) + compilation_counter.num_piecewise_graphs_seen += len( + self.piecewise_graphs) + submod_names_to_compile = [ + item.submod_name for item in self.piecewise_graphs + if not item.is_splitting_graph + ] + + # propagate the split graph to the 
piecewise backend, + # compile submodules with symbolic shapes + PiecewiseCompileInterpreter(self.split_gm, submod_names_to_compile, + self.compilation_configs, + self.graph_pool).run(*example_inputs) self._called = True - return self.returned_callable + if not self.compilation_configs.use_cudagraph or \ + not self.compilation_configs.cudagraph_copy_inputs: + return self.split_gm + + # if we need to copy input buffers for cudagraph + from torch._guards import detect_fake_mode + fake_mode = detect_fake_mode() + fake_args = [ + fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in example_inputs + ] + + # index of tensors that have symbolic shapes (batch size) + self.sym_tensor_indices = [ + i for i, x in enumerate(fake_args) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + ] + + # compiler managed cudagraph input buffers + # we assume the first run with symbolic shapes + # has the maximum size among all the tensors + self.input_buffers = [ + example_inputs[x].clone() for x in self.sym_tensor_indices + ] + + def copy_and_call(*args): + list_args = list(args) + for i, index in enumerate(self.sym_tensor_indices): + runtime_tensor = list_args[index] + runtime_shape = runtime_tensor.shape[0] + static_tensor = self.input_buffers[i][:runtime_shape] + + # copy the tensor to the static buffer + static_tensor.copy_(runtime_tensor) + + # replace the tensor in the list_args to the static buffer + list_args[index] = static_tensor + return self.split_gm(*list_args) + + return copy_and_call @dataclasses.dataclass @@ -349,14 +520,18 @@ class ConcreteSizeEntry: cudagraph: Optional[torch.cuda.CUDAGraph] = None output: Optional[Any] = None + # for cudagraph debugging, track the input addresses + # during capture, and check if they are the same during replay + input_addresses: Optional[List[int]] = None + class PiecewiseBackend: - def __init__(self, - graph: fx.GraphModule, - compilation_configs: CompilationConfig, - graph_pool: Any, - is_first_graph: bool = False): + def __init__(self, graph: fx.GraphModule, + compilation_configs: CompilationConfig, graph_pool: Any, + piecewise_compile_index: int, total_piecewise_compiles: int, + sym_shape_indices: List[int], + compiled_graph_for_general_shape: Callable): """ The backend for piecewise compilation. It mainly handles the compilation and cudagraph capturing. 
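# Editor's note: the copy_and_call wrapper in the hunk above copies
# variable-batch inputs into compiler-managed static buffers so that replayed
# cudagraphs always see the same memory addresses. The sketch below shows only
# that copy/slice step with plain tensors and hypothetical names
# (make_copy_and_call, dynamic_indices); graph capture and replay are omitted.
from typing import Callable, List

import torch


def make_copy_and_call(
        fn: Callable[..., torch.Tensor], example_inputs: List[torch.Tensor],
        dynamic_indices: List[int]) -> Callable[..., torch.Tensor]:
    # Assume the example inputs were seen at the maximum batch size; keep one
    # persistent buffer per argument whose first dimension is dynamic.
    buffers = [example_inputs[i].clone() for i in dynamic_indices]

    def copy_and_call(*args: torch.Tensor) -> torch.Tensor:
        call_args = list(args)
        for buf, idx in zip(buffers, dynamic_indices):
            runtime = call_args[idx]
            static_view = buf[:runtime.shape[0]]  # slice of the fixed buffer
            static_view.copy_(runtime)            # copy into stable memory
            call_args[idx] = static_view          # callee sees a fixed address
        return fn(*call_args)

    return copy_and_call


if __name__ == "__main__":

    def matmul(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
        return x @ w

    w = torch.randn(8, 4)
    wrapped = make_copy_and_call(matmul, [torch.randn(16, 8), w],
                                 dynamic_indices=[0])
    out = wrapped(torch.randn(3, 8), w)  # smaller batch reuses the same buffer
    print(out.shape)                     # torch.Size([3, 4])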
@@ -373,7 +548,12 @@ def __init__(self, self.graph = graph self.compilation_configs = compilation_configs self.graph_pool = graph_pool - self.is_first_graph = is_first_graph + self.piecewise_compile_index = piecewise_compile_index + self.total_piecewise_compiles = total_piecewise_compiles + + self.is_first_graph = piecewise_compile_index == 0 + self.is_last_graph = ( + piecewise_compile_index == total_piecewise_compiles - 1) self.compile_sizes: Set[int] = set( self.compilation_configs.compile_sizes) @@ -381,12 +561,13 @@ def __init__(self, self.compilation_configs.capture_sizes ) if self.compilation_configs.use_cudagraph else set() - self.compile_finished = False self.first_run_finished = False - self.compiled_graph_for_general_shape: Callable = None # type: ignore + self.compiled_graph_for_general_shape = compiled_graph_for_general_shape # noqa + + self.sym_shape_indices = sym_shape_indices - self.sym_shape_indices: List[int] = [] + self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG" # the entries for different shapes that we need to either # compile or capture cudagraph @@ -399,27 +580,6 @@ def __init__(self, ) def __call__(self, *args) -> Any: - - if not self.compile_finished: - self.compile_finished = True - - # this is the first compilation, we will compile a graph with - # dynamic shape, as the caller will mark first dimension as dynamic - - self.sym_shape_indices = [ - i for i, x in enumerate(args) if isinstance(x, torch.SymInt) - ] - - self.compiled_graph_for_general_shape = wrap_inductor( - self.graph, - args, - self.compilation_configs.inductor_compile_config, - runtime_shape=None, - do_logging=self.is_first_graph, - use_inductor=self.compilation_configs.use_inductor) - - return self.graph(*args) - if not self.first_run_finished: self.first_run_finished = True return self.compiled_graph_for_general_shape(*args) @@ -460,17 +620,63 @@ def __call__(self, *args) -> Any: return entry.runnable(*args) if self.is_first_graph: - logger.info("Capturing a cudagraph for shape %s", - runtime_shape) - + # Since we capture cudagraph for many different shapes and + # capturing is fast, we don't need to log it for every shape. + # We only log it in the debug mode. + logger.debug("Capturing a cudagraph for shape %s", + runtime_shape) + + input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + entry.input_addresses = input_addresses cudagraph = torch.cuda.CUDAGraph() - with torch.cuda.graph(cudagraph, pool=self.graph_pool): - entry.output = weak_ref_tensors(entry.runnable(*args)) + + with ExitStack() as stack: + if not self.is_first_graph: + # during every model forward, we will capture + # many pieces of cudagraphs (roughly one per layer). + # running gc again and again across layers will + # make the cudagraph capture very slow. + # therefore, we only run gc for the first graph, + # and disable gc for the rest of the graphs. + stack.enter_context(patch("gc.collect", lambda: None)) + stack.enter_context( + patch("torch.cuda.empty_cache", lambda: None)) + + # mind-exploding: carefully manage the reference and memory. + with torch.cuda.graph(cudagraph, pool=self.graph_pool): + # `output` is managed by pytorch's cudagraph pool + output = entry.runnable(*args) + if self.is_last_graph: + # by converting it to weak ref, + # the original `output` will immediately be released + # to save memory. It is only safe to do this for + # the last graph, because the output of the last graph + # will not be used by any other cuda graph. 
+ output = weak_ref_tensors(output) + + # here we always use weak ref for the output + # to save memory + entry.output = weak_ref_tensors(output) + entry.cudagraph = cudagraph compilation_counter.num_cudagraph_caputured += 1 - entry.cudagraph = cudagraph - return entry.output + # important: we need to return the output, rather than + # the weak ref of the output, so that pytorch can correctly + # manage the memory during cuda graph capture + return output + + if self.is_debugging_mode: + # check if the input addresses are the same + new_input_addresses = [ + x.data_ptr() for x in args if isinstance(x, torch.Tensor) + ] + assert new_input_addresses == entry.input_addresses, ( + "Input addresses for cudagraphs are different during replay." + f" Expected {entry.input_addresses}, got {new_input_addresses}" + ) entry.cudagraph.replay() return entry.output diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py index 514f2b93ef64f..3e663505c627d 100644 --- a/vllm/compilation/config.py +++ b/vllm/compilation/config.py @@ -1,4 +1,5 @@ import copy +from pathlib import Path from typing import Any, Dict, List, Optional from pydantic import BaseModel, Field, PrivateAttr @@ -31,6 +32,11 @@ class CompilationConfig(BaseModel): It means the first several runs will be treated as warmup runs. Only after that, the execution will be recorded, and the recorded cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. - Inductor compilation: - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. @@ -50,6 +56,12 @@ class CompilationConfig(BaseModel): name because the config uses json format. If we pass the config from Python, functions can also be passed directly via Python object constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + - Custom inductor passes: + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graph. Default is . + - enable_fusion: whether to enable the custom fusion pass. + TODO better pass enabling system. 
Why we have different sizes for cudagraph and inductor: - cudagraph: a cudagraph captured for a specific size can only be used @@ -71,6 +83,11 @@ class CompilationConfig(BaseModel): non_cudagraph_ops: List[str] = Field(default_factory=list) cudagraph_num_of_warmups: int = 0 cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False + + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True # not configurable, computed after init compile_sizes: List[int] = PrivateAttr @@ -81,7 +98,7 @@ def model_post_init(self, __context: Any) -> None: if not isinstance(v, str): assert callable(v), ( f"pass {k} should be a function or a qualified name") - self.inductor_passes[k] = v + self.inductor_compile_config[k] = v continue # resolve function from qualified name @@ -91,18 +108,6 @@ def model_post_init(self, __context: Any) -> None: func = __import__(module).__dict__[func_name] self.inductor_compile_config[k] = func - from vllm.compilation.backends import fix_functionalization - from vllm.utils import combine_fx_passes - if "post_grad_custom_post_pass" in self.inductor_compile_config: - self.inductor_compile_config[ - "post_grad_custom_post_pass"] = combine_fx_passes( - fix_functionalization, - self.inductor_compile_config["post_grad_custom_post_pass"], - ) - else: - self.inductor_compile_config[ - "post_grad_custom_post_pass"] = fix_functionalization - def init_during_runtime(self): """To complete the initialization of config, we need to know the compile context, which is only available diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 3053e57e0b63b..ca1e96a33c014 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -6,6 +6,7 @@ import vllm.envs as envs from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo @@ -110,26 +111,26 @@ def _support_torch_compile(cls: type, """ A decorator to add support for compiling the forward method of a class. """ - - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner - # will handle the compilation, so we don't need to do anything here. - if envs.VLLM_TORCH_COMPILE_LEVEL in [ - CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS - ] or not supports_dynamo(): + if TorchCompileWrapperWithCustomDispatcher in cls.__bases__: + # support decorating multiple times return cls # take care of method resolution order # make sure super().__init__ is called on the base class # other than TorchCompileWrapperWithCustomDispatcher - if TorchCompileWrapperWithCustomDispatcher not in cls.__bases__: - # support decorating multiple times - cls.__bases__ = cls.__bases__ + ( - TorchCompileWrapperWithCustomDispatcher, ) + cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) old_init = cls.__init__ # type: ignore - def __init__(self, *args, **kwargs): - old_init(self, *args, **kwargs) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): + old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # will handle the compilation, so we don't need to do anything here. 
+ self.do_not_compile = envs.VLLM_TORCH_COMPILE_LEVEL in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS + ] or not supports_dynamo() + if self.do_not_compile: + return TorchCompileWrapperWithCustomDispatcher.__init__(self) cls.__init__ = __init__ # type: ignore @@ -138,7 +139,7 @@ def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. - if torch.compiler.is_compiling(): + if self.do_not_compile or torch.compiler.is_compiling(): return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py new file mode 100644 index 0000000000000..2a0cf0002c9dd --- /dev/null +++ b/vllm/compilation/fusion.py @@ -0,0 +1,291 @@ +import operator +from typing import Iterable, List, Optional + +import torch +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, + fwd_only, register_replacement) + +from vllm.compilation.config import CompilationConfig +from vllm.compilation.inductor_pass import InductorPass +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def rms_pattern_static(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at1 = auto_functionalized(torch.ops._C.rms_norm.default, + result=result_rms, + input=input, + weight=weight, + epsilon=1e-5) + at2 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, + result=result, + input=at1[1], + scale=scale) + + # result + return at2[1] + + +def rms_replacement_static(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(torch.ops._C.rms_norm_static_fp8_quant.default, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=1e-5) + + # result + return at[1] + + +def rms_pattern_residual_static(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(torch.ops._C.fused_add_rms_norm.default, + input=input, + residual=residual, + weight=weight, + epsilon=1e-5) + at1 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, + result=result, + input=at[1], + scale=scale) + + # result, residual + return at1[1], at[2] + + +def rms_replacement_residual_static(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, scale: torch.Tensor): + at = auto_functionalized( + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, + result=result, + input=input, + residual=residual, + weight=weight, + scale=scale, + epsilon=1e-5) + # result, residual + return at[1], at[2] + + +def empty_bf16(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda") + + +def empty_fp8(*args, **kwargs): + fp8 = torch.float8_e4m3fn + return torch.empty(*args, **kwargs, dtype=fp8, device="cuda") + + +def empty_fp32(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda") + + +# Utilities for post-processing multi-output matches +def is_func(node: torch.fx.Node, target) -> bool: + return node.op == "call_function" and node.target == target + + +# Returns the first auto_functionalized node with the given op (if it exists) +def 
find_auto_fn_maybe(nodes: Iterable[torch.fx.Node], + op) -> Optional[torch.fx.Node]: + for node in nodes: + if is_func(node, auto_functionalized) and node.args[0] == op: # noqa + return node + return None + + +# Returns the first auto_functionalized node with the given op +def find_auto_fn(nodes: Iterable[torch.fx.Node], op) -> torch.fx.Node: + node = find_auto_fn_maybe(nodes, op) + assert node is not None, f"Could not find {op} in nodes {nodes}" + return node + + +# Returns the getitem node that extracts the idx-th element from node +# (if it exists) +def find_getitem_maybe(node: torch.fx.Node, + idx: int) -> Optional[torch.fx.Node]: + for user in node.users: + if is_func(user, operator.getitem) and user.args[1] == idx: + return user + return None + + +# Returns the getitem node that extracts the idx-th element from node +def find_getitem(node: torch.fx.Node, idx: int) -> torch.fx.Node: + ret = find_getitem_maybe(node, idx) + assert ret is not None, f"Could not find getitem {idx} in node {node}" + return ret + + +class FusionPass(InductorPass): + """ + This pass fuses a pre-defined set of custom ops into fused ops. + It uses the torch pattern matcher to find the patterns and replace them. + It also manually processes multi-output matches, as those are broken in + the torch pattern matcher. + + Because patterns can only be registered once, the pass is a singleton. + This will be addressed in a future version of PyTorch: + https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + """ + + _instance: 'Optional[FusionPass]' = None + + @classmethod + def instance(cls, config: CompilationConfig): + """ + Get the singleton instance of the FusionPass. + If the instance exists, the config is updated but + initialization is not repeated. + """ + if cls._instance is None: + cls._instance = FusionPass(config) + else: + cls._instance.config = config + return cls._instance + + def __init__(self, config: CompilationConfig): + assert self.__class__._instance is None, \ + "FusionPass singleton instance already exists" + super().__init__(config) + + self.matches: List[Match] = [] + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="fusion_pass") + + # Fuse rms_norm + static_scaled_fp8_quant into + # rms_norm_static_fp8_quant + inputs = [ + empty_fp8(5, 4), + empty_bf16(5, 4), + empty_bf16(5, 4), + empty_bf16(1, 5), + empty_fp32(1, 1) + ] + register_replacement(rms_pattern_static, rms_replacement_static, + inputs, fwd_only, self.patterns) + + # Fuse fused_add_rms_norm + static_scaled_fp8_quant into + # fused_add_rms_norm_static_fp8_quant + # Because pattern has 2 outputs, we need to manually process the match + # (see process_matches) + inputs = [ + empty_fp8(5, 4), + empty_bf16(5, 4), + empty_bf16(5, 4), + empty_bf16(1, 5), + empty_fp32(1, 1) + ] + register_replacement(rms_pattern_residual_static, + rms_replacement_residual_static, + inputs, + fwd_only, + self.patterns, + extra_check=lambda m: self.record_match(m)) + + def record_match(self, match: Match) -> bool: + # Hijack the extra_check to record the match and + # save it for post-processing. + self.matches.append(match) + + # Return False to prevent automatic replacement. + return False + + def process_matches(self, graph: torch.fx.Graph): + """ + Manually process multi-output matches and replace them with fused nodes. 
+ This is necessary because the automatic replacement for multi-output + matches is broken: https://github.com/pytorch/pytorch/issues/137280 + """ + for match in self.matches: + # To avoid use-before-definition errors, insert replacement nodes + # after the last node in the match. + # match.nodes is not guaranteed to be sorted. + # Find the last node in the match. + for last_node_in_match in reversed(graph.nodes): + if last_node_in_match in match.nodes: + break + else: + raise ValueError("No nodes in graph") + + # Insert a new auto_functionalized node for the fused operation, + # as well as getitem nodes to extract the result and residual. + # The auto_functionalized node returns a tuple of + # (None, result, residual) - None is the function return value. + # The resulting graph looks like this: + # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa + # result_node_new = at[1] + # residual_node_new = at[2] + with graph.inserting_after(last_node_in_match): + kwargs = match.kwargs + kwargs["epsilon"] = 1e-5 # Currently hard-coded in RMSNorm + + fused_node = graph.call_function( + auto_functionalized, + (torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, + ), + kwargs=kwargs) + + graph.inserting_after(fused_node) + result_node_new = graph.call_function(operator.getitem, + (fused_node, 1)) + residual_node_new = graph.call_function( + operator.getitem, (fused_node, 2)) + + # Last part of replacement is rebinding the users of nodes in the + # match to use the new nodes. + + # Find the nodes in the match that we need to rebind + rms_node = find_auto_fn(match.nodes, + torch.ops._C.fused_add_rms_norm.default) + quant_node = find_auto_fn( + match.nodes, torch.ops._C.static_scaled_fp8_quant.default) + + assert len(rms_node.users) == 2 + assert len(quant_node.users) == 1 + + # meta["val"] is used by de-functionalization and has to contain the + # value of the node (tuple of tensors) that would be returned by the + # functionalized node during tracing. + + rms_tup = rms_node.meta["val"] + quant_tup = quant_node.meta["val"] + + # The result of fused_node must be a tuple with the first element + # None (the function return value) and the remaining elements + # representing the mutated inputs. + fused_tup = (None, quant_tup[1], rms_tup[1], rms_tup[2]) + fused_node.meta["val"] = fused_tup + + # Find the getitem nodes and replace their uses with the new nodes. + # The old nodes will be removed by DCE at the end of the pass. 
+ find_getitem(rms_node, 2).replace_all_uses_with(residual_node_new) + find_getitem(quant_node, 1).replace_all_uses_with(result_node_new) + + # Finally, remove matched nodes + graph.eliminate_dead_code() + assert all(node not in graph.nodes for match in self.matches + for node in match.nodes) + + def __call__(self, graph: torch.fx.Graph): + self.dump_graph(graph, "before_fusion") + + count = self.patterns.apply(graph) + logger.info("Replaced %s patterns", count) + self.dump_graph(graph, "after_pattern_match") + + # Manually process multi-output matches (and run DCE) + self.process_matches(graph) + logger.info("Post-processed %s matches", len(self.matches)) + self.dump_graph(graph, "after_fusion") + self.matches.clear() diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py new file mode 100644 index 0000000000000..b23351fa19759 --- /dev/null +++ b/vllm/compilation/inductor_pass.py @@ -0,0 +1,38 @@ +from abc import ABC, abstractmethod + +import torch + +from vllm.compilation.config import CompilationConfig +# yapf: disable +from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank +from vllm.distributed import ( + get_tensor_model_parallel_world_size as get_tp_world_size) +from vllm.distributed import model_parallel_is_initialized as p_is_init +# yapf: enable +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class InductorPass(ABC): + + @abstractmethod + def __call__(self, graph: torch.fx.Graph): + raise NotImplementedError + + def __init__(self, config: CompilationConfig): + self.config = config + + def dump_graph(self, graph: torch.fx.Graph, stage: str): + if stage in self.config.dump_graph_stages: + # Make sure filename includes rank in the distributed setting + parallel = p_is_init() and get_tp_world_size() > 1 + rank = f"-{get_tp_rank()}" if parallel else "" + filepath = self.config.dump_graph_dir / f"{stage}{rank}.py" + + logger.info("Printing graph to %s", filepath) + with open(filepath, "w") as f: + src = graph.python_code(root_module="self", verbose=True).src + # Add imports so it's not full of errors + print("import torch; from torch import device", file=f) + print(src, file=f) diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py new file mode 100644 index 0000000000000..0d284246d2576 --- /dev/null +++ b/vllm/compilation/reshapes.py @@ -0,0 +1,85 @@ +from typing import Union + +import torch.fx +from torch import SymInt + +from vllm.compilation.fusion import is_func +from vllm.compilation.inductor_pass import InductorPass +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class RedundantReshapesPass(InductorPass): + """ + This is an inductor pass that removes redundant reshape operations. + It is required for RMSNorm-quant fusion to work properly. + That's because apply_fp8_linear adds a reshape, which is redundant + in the 2D-case. + + Example graph: + + getitem_1: "f16[s0, 4096]" = ... + view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096]) + at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...) + out: "f8e4m3fn[s0, 4096]" = at[1] + + Can be replaced with: + getitem_1: "f16[s0, 4096]" = ... + at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...) 
+ out: "f8e4m3fn[s0, 4096]" = at[1] + """ + + def __call__(self, graph: torch.fx.Graph): + self.dump_graph(graph, "before_reshapes") + count = 0 + # Remove no-op reshapes/views: + for node in graph.nodes: + if is_func(node, torch.ops.aten.reshape.default): + input, shape = node.args[:2] + input_shape = input.meta["val"].shape + if len(shape) != len(input_shape): + # Reshape changing rank, skip + continue + + if shape.count(-1) > 1: + # Invalid reshape args, skip + continue + + if all( + self.dims_equivalent(s, i_s) + for s, i_s in zip(shape, input_shape)): + node.replace_all_uses_with(input) + graph.erase_node(node) + count += 1 + + logger.info("Removed %s no-op reshapes", count) + + self.dump_graph(graph, "after_reshapes") + + def dims_equivalent(self, dim: Union[int, torch.fx.Node], + i_dim: Union[int, SymInt]) -> bool: + """ + This function checks if two dimensions are equivalent. + :param dim: The dimension arg to reshape + :param i_dim: The corresponding dimension in the input tensor + :return: Are the dimensions equivalent? + + There are three cases in which the dimensions are equivalent: + 1. The dimensions are equal (both integers) + 2. The reshape dimension is -1 (i.e. inferred) + 3. The dimensions both correspond to the same SymInt + + While case 2 does not guarantee the dimensions are equal, + they are equal if all other dimensions are equal. + + In case 3, the reshape dimension is a torch.fx.Node, + and its value is a SymInt. That value is equal to the + input dimension. + + """ + # Case 1 and 2 + if dim == i_dim or dim == -1: + return True + # Case 3 + return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim diff --git a/vllm/config.py b/vllm/config.py index c2a8c956b374a..002adb4316969 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,6 +1,8 @@ +import copy import enum import json -from dataclasses import dataclass, field, fields +import warnings +from dataclasses import dataclass, field, replace from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) @@ -13,9 +15,10 @@ from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback -from vllm.transformers_utils.config import (ConfigFormat, get_config, - get_hf_image_processor_config, - get_hf_text_config) +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, print_warning_once) @@ -23,9 +26,13 @@ from ray.util.placement_group import PlacementGroup from vllm.executor.executor_base import ExecutorBase + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) from vllm.model_executor.model_loader.loader import BaseModelLoader from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import ( BaseTokenizerGroup) +else: + QuantizationConfig = None logger = init_logger(__name__) @@ -55,6 +62,10 @@ class ModelConfig: "mistral" will always use the tokenizer from `mistral_common`. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images or + videos from directories specified by the server file system. + This is a security risk. 
Should only be enabled in trusted + environments. dtype: Data type for model weights and activations. The "auto" option will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. @@ -65,9 +76,6 @@ class ModelConfig: code_revision: The specific revision to use for the model code on Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. - rope_scaling: Dictionary containing the scaling configuration for the - RoPE embeddings. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -99,7 +107,7 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data instances per modality + limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. override_neuron_config: Initialize non default neuron config or override default neuron config that are specific to Neuron devices, @@ -107,6 +115,7 @@ class ModelConfig: can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. + hf_overrides: Arguments to be forwarded to the HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. pooling_type: Used to configure the pooling method in the embedding @@ -134,9 +143,10 @@ def __init__( trust_remote_code: bool, dtype: Union[str, torch.dtype], seed: int, + allowed_local_media_path: str = "", revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, + rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, @@ -154,6 +164,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, chat_template_text_format: str = "string", + hf_overrides: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, pooling_type: Optional[str] = None, pooling_norm: Optional[bool] = None, @@ -164,11 +175,26 @@ def __init__( self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code + self.allowed_local_media_path = allowed_local_media_path self.seed = seed self.revision = revision self.code_revision = code_revision - self.rope_scaling = rope_scaling - self.rope_theta = rope_theta + + if hf_overrides is None: + hf_overrides = {} + if rope_scaling is not None: + hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} + hf_overrides.update(hf_override) + msg = ("`--rope-scaling` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if rope_theta is not None: + hf_override = {"rope_theta": rope_theta} + hf_overrides.update(hf_override) + msg = ("`--rope-theta` will be removed in a future release. 
" + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + # The tokenizer version is consistent with the model version by default. if tokenizer_revision is None: self.tokenizer_revision = revision @@ -181,11 +207,11 @@ def __init__( self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, rope_scaling, rope_theta, - config_format) + code_revision, config_format, + **hf_overrides) self.hf_text_config = get_hf_text_config(self.hf_config) + self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -218,7 +244,8 @@ def __init__( max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window_len=self.get_hf_config_sliding_window(), - spec_target_max_model_len=spec_target_max_model_len) + spec_target_max_model_len=spec_target_max_model_len, + encoder_config=self.encoder_config) self.served_model_name = get_served_model_name(model, served_model_name) self.multimodal_config = self._init_multimodal_config( @@ -262,6 +289,10 @@ def _init_multimodal_config( return None + def _get_encoder_config(self): + return get_sentence_transformer_tokenizer_config( + self.model, self.revision) + def _init_pooler_config( self, pooling_type: Optional[str] = None, @@ -271,6 +302,14 @@ def _init_pooler_config( pooling_returned_token_ids: Optional[List[int]] = None ) -> Optional["PoolerConfig"]: if self.task == "embedding": + pooling_config = get_pooling_config(self.model, self.revision) + if pooling_config is not None: + # override if user does not + # specifies pooling_type and/or pooling_norm + if pooling_type is None: + pooling_type = pooling_config["pooling_type"] + if pooling_norm is None: + pooling_norm = pooling_config["normalize"] return PoolerConfig( pooling_type=pooling_type, pooling_norm=pooling_norm, @@ -456,9 +495,10 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid - if device_config.device_type not in ("cuda", "tpu", "xpu"): + if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"): logger.warning( - "Async output processing is only supported for CUDA, TPU, XPU. " + "Async output processing is only supported for CUDA, TPU, XPU " + "and HPU." "Disabling it for other platforms.") self.use_async_output_proc = False return @@ -656,11 +696,13 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config @property - def is_encoder_decoder_model(self) -> bool: + def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr(self.hf_config, "is_encoder_decoder", False) or ( - (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False))) + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) @property def is_multimodal_model(self) -> bool: @@ -849,7 +891,6 @@ class LoadConfig: ignore_patterns: The list of patterns to ignore when loading the model. Default to "original/**/*" to avoid repeated loading of llama's checkpoints. 
- """ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO @@ -910,9 +951,12 @@ class ParallelConfig: https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. placement_group: ray distributed model workers placement group. distributed_executor_backend: Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). If either - pipeline_parallel_size or tensor_parallel_size is greater than 1, - will default to "ray" if Ray is installed or "mp" otherwise. + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + and hpu only support Ray for distributed inference. """ def __init__( @@ -953,6 +997,13 @@ def __init__( raise ValueError( "TPU backend only supports Ray for distributed inference.") + if current_platform.is_hpu() and self.world_size > 1: + if self.distributed_executor_backend is None: + self.distributed_executor_backend = "ray" + if self.distributed_executor_backend != "ray": + raise ValueError( + "HPU backend only supports Ray for distributed inference.") + if self.distributed_executor_backend is None and self.world_size > 1: # We use multiprocessing by default if world_size fits on the # current node and we aren't in a ray placement group. @@ -1155,6 +1206,8 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "cuda" elif current_platform.is_neuron(): self.device_type = "neuron" + elif current_platform.is_hpu(): + self.device_type = "hpu" elif current_platform.is_openvino(): self.device_type = "openvino" elif current_platform.is_tpu(): @@ -1281,13 +1334,6 @@ def maybe_create_spec_config( "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if enable_chunked_prefill: - raise ValueError( - "Speculative decoding and chunked prefill are " - f"currently mutually exclusive ({enable_chunked_prefill=}).") - # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1319,6 +1365,8 @@ def maybe_create_spec_config( tokenizer=target_model_config.tokenizer, tokenizer_mode=target_model_config.tokenizer_mode, trust_remote_code=target_model_config.trust_remote_code, + allowed_local_media_path=target_model_config. 
+ allowed_local_media_path, dtype=target_model_config.dtype, seed=target_model_config.seed, revision=draft_revision, @@ -1352,6 +1400,29 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") + if enable_chunked_prefill and draft_hf_config.model_type in ( + "medusa", "mlp_speculator", "eagle"): + raise ValueError( + "Chunked prefill and hidden-state based draft models are " + "not compatible.") + + speculative_draft_tensor_parallel_size = \ + SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config, + speculative_draft_tensor_parallel_size, + draft_hf_config + ) + + if (enable_chunked_prefill and \ + speculative_draft_tensor_parallel_size != 1): + # TODO - Investigate why the error reported in + # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258 + # is happening and re-enable it. + raise ValueError( + "Chunked prefill and speculative decoding can be enabled " + "simultaneously only for draft models with tensor " + "parallel size 1.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, @@ -1430,15 +1501,16 @@ def _maybe_override_draft_max_model_len( ) @staticmethod - def create_draft_parallel_config( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: Optional[int], - draft_hf_config: PretrainedConfig, - ) -> ParallelConfig: - """Create a parallel config for use by the draft worker. - - This is mostly a copy of the target parallel config, except the tp_size. + def _verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int], + draft_hf_config: PretrainedConfig) -> int: """ + Verifies and adjusts the tensor parallel size for a draft model + specified using speculative_draft_tensor_parallel_size. + """ + # If speculative_draft_tensor_parallel_size is unset then set it + # appropriately else verify that it is set correctly. if speculative_draft_tensor_parallel_size is None: if draft_hf_config.model_type == "mlp_speculator": speculative_draft_tensor_parallel_size = 1 @@ -1454,7 +1526,18 @@ def create_draft_parallel_config( raise ValueError( f"{speculative_draft_tensor_parallel_size=} cannot be " f"other value than 1 or target model tensor_parallel_size") + return speculative_draft_tensor_parallel_size + + @staticmethod + def create_draft_parallel_config( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int, + draft_hf_config: PretrainedConfig, + ) -> ParallelConfig: + """Create a parallel config for use by the draft worker. + This is mostly a copy of the target parallel config, except the tp_size. + """ draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. pipeline_parallel_size, @@ -1604,6 +1687,7 @@ class LoRAConfig: # This is a constant. lora_vocab_padding_size: ClassVar[int] = 256 long_lora_scaling_factors: Optional[Tuple[float]] = None + bias_enabled: bool = False def __post_init__(self): # Setting the maximum rank to 256 should be able to satisfy the vast @@ -1732,6 +1816,13 @@ def _get_and_verify_dtype( torch_dtype = torch.float16 else: torch_dtype = config_dtype + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of" + "using float16 by default. 
Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 else: if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: raise ValueError(f"Unknown dtype: {dtype}") @@ -1764,6 +1855,7 @@ def _get_and_verify_max_len( disable_sliding_window: bool, sliding_window_len: Optional[Union[int, List[Optional[int]]]], spec_target_max_model_len: Optional[int] = None, + encoder_config: Optional[Any] = None, ) -> int: """Get and verify the model's maximum length.""" derived_max_model_len = float("inf") @@ -1846,6 +1938,9 @@ def _get_and_verify_max_len( "original_max_position_embeddings"] derived_max_model_len *= scaling_factor + if encoder_config and "max_seq_length" in encoder_config: + derived_max_model_len = encoder_config["max_seq_length"] + # If the user specified a max length, make sure it is smaller than the # derived length from the HF model config. if max_model_len is None: @@ -1941,32 +2036,73 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") -@dataclass(frozen=True) -class EngineConfig: - """Dataclass which contains all engine-related configuration. This +@dataclass +class VllmConfig: + """Dataclass which contains all vllm-related configuration. This simplifies passing around the distinct configurations in the codebase. """ - model_config: ModelConfig - cache_config: CacheConfig - parallel_config: ParallelConfig - scheduler_config: SchedulerConfig - device_config: DeviceConfig - load_config: LoadConfig - lora_config: Optional[LoRAConfig] - speculative_config: Optional[SpeculativeConfig] - decoding_config: Optional[DecodingConfig] - observability_config: Optional[ObservabilityConfig] - prompt_adapter_config: Optional[PromptAdapterConfig] + model_config: ModelConfig = field(default=None, init=True) # type: ignore + cache_config: CacheConfig = field(default=None, init=True) # type: ignore + parallel_config: ParallelConfig = field(default=None, + init=True) # type: ignore + scheduler_config: SchedulerConfig = field(default=None, + init=True) # type: ignore + device_config: DeviceConfig = field(default=None, + init=True) # type: ignore + load_config: LoadConfig = field(default=None, init=True) # type: ignore + lora_config: Optional[LoRAConfig] = None + speculative_config: Optional[SpeculativeConfig] = None + decoding_config: Optional[DecodingConfig] = None + observability_config: Optional[ObservabilityConfig] = None + prompt_adapter_config: Optional[PromptAdapterConfig] = None + quant_config: Optional[QuantizationConfig] = None + + @staticmethod + def _get_quantization_config( + model_config: ModelConfig, + load_config: LoadConfig) -> Optional[QuantizationConfig]: + """Get the quantization config.""" + if model_config.quantization is not None: + from vllm.model_executor.model_loader.weight_utils import ( + get_quant_config) + quant_config = get_quant_config(model_config, load_config) + capability_tuple = current_platform.get_device_capability() + + if capability_tuple is not None: + capability = capability_tuple.to_int() + if capability < quant_config.get_min_capability(): + raise ValueError( + f"The quantization method {model_config.quantization} " + "is not supported for the current GPU. Minimum " + f"capability: {quant_config.get_min_capability()}. " + f"Current capability: {capability}.") + supported_dtypes = quant_config.get_supported_act_dtypes() + if model_config.dtype not in supported_dtypes: + raise ValueError( + f"{model_config.dtype} is not supported for quantization " + f"method {model_config.quantization}. 
Supported dtypes: " + f"{supported_dtypes}") + return quant_config + return None + + def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig": + model_config = copy.deepcopy(self.model_config) + model_config.hf_config = hf_config + + return replace(self, model_config=model_config) def __post_init__(self): """Verify configs are valid & consistent with each other. """ - self.model_config.verify_async_output_proc(self.parallel_config, - self.speculative_config, - self.device_config) - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) + + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: self.lora_config.verify_with_model_config(self.model_config) @@ -1976,8 +2112,48 @@ def __post_init__(self): self.prompt_adapter_config.verify_with_model_config( self.model_config) - def to_dict(self): - """Return the configs as a dictionary, for use in **kwargs. - """ - return dict( - (field.name, getattr(self, field.name)) for field in fields(self)) + if self.quant_config is None and \ + self.model_config is not None and self.load_config is not None: + self.quant_config = VllmConfig._get_quantization_config( + self.model_config, self.load_config) + + def __str__(self): + return ("model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " + "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " + "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " + "pipeline_parallel_size=%d, " + "disable_custom_all_reduce=%s, quantization=%s, " + "enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s, " + "num_scheduler_steps=%d, enable_prefix_caching=%s, " + "use_async_output_proc=%s, mm_processor_kwargs=%s") % \ + (self.model_config.model, self.speculative_config, + self.model_config.tokenizer, + self.model_config.skip_tokenizer_init, + self.model_config.tokenizer_mode, + self.model_config.revision, + self.model_config.override_neuron_config, + self.model_config.tokenizer_revision, + self.model_config.trust_remote_code, + self.model_config.dtype, + self.model_config.max_model_len, + self.load_config.download_dir, + self.load_config.load_format, + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size, + self.parallel_config.disable_custom_all_reduce, + self.model_config.quantization, + self.model_config.enforce_eager, + self.cache_config.cache_dtype, + self.model_config.quantization_param_path, + self.device_config.device, self.decoding_config, + self.observability_config, self.model_config.seed, + self.model_config.served_model_name, + self.scheduler_config.num_scheduler_steps, + self.cache_config.enable_prefix_caching, + self.model_config.use_async_output_proc, + self.model_config.mm_processor_kwargs) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 6eda5f99aa1c8..9727f6e19b84e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -4,6 +4,7 @@ 
DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator +from vllm.platforms import current_platform from vllm.utils import Device @@ -52,7 +53,11 @@ def create( - The block IDs are assigned contiguously, with GPU block IDs coming before CPU block IDs. """ - block_ids = list(range(num_gpu_blocks + num_cpu_blocks)) + # For HPU, block id 0 is used only for padding + reserved_blocks = 1 if current_platform.is_hpu() else 0 + block_ids = list( + range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) + num_gpu_blocks -= reserved_blocks gpu_block_ids = block_ids[:num_gpu_blocks] cpu_block_ids = block_ids[num_gpu_blocks:] diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 61ed7afba12ed..21f4c63b6572d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -393,7 +393,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - seq_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap out. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -409,7 +409,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -459,7 +459,7 @@ def _can_swap(self, on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap in/out. device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. RUNNING for swap out and SWAPPED for swap in diff --git a/vllm/core/evictor.py b/vllm/core/evictor.py index 0b943e6e65f1c..ed7e06cab2996 100644 --- a/vllm/core/evictor.py +++ b/vllm/core/evictor.py @@ -52,7 +52,7 @@ def num_blocks(self) -> int: pass -class BlockMetaData(): +class BlockMetaData: """Data structure for storing key data describe cached block, so that evitor could use to make its decision which one to choose for eviction diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 88733b8f53b86..af4671ec29be9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -828,8 +828,7 @@ def _schedule_priority_preemption( num_running_seqs) #Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out, - PreemptionMode.RECOMPUTE) + self._preempt(vseq_group, blocks_to_swap_out) waiting_queue.appendleft(vseq_group) force_preemption_count += 1 #Put the sequence back into the waiting queue @@ -1148,6 +1147,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) + # Put prefills first due to Attention backend ordering assumption. return SchedulerOutputs( scheduled_seq_groups=(prefills.seq_groups + running_scheduled.prefill_seq_groups + @@ -1309,6 +1309,8 @@ def schedule( # `multi_modal_data` will be None. 
multi_modal_data=seq_group.multi_modal_data if scheduler_outputs.num_prefill_groups > 0 else None, + multi_modal_placeholders=seq_group.multi_modal_placeholders + if scheduler_outputs.num_prefill_groups > 0 else None, mm_processor_kwargs=seq_group.mm_processor_kwargs, prompt_adapter_request=seq_group.prompt_adapter_request, ) @@ -1451,12 +1453,8 @@ def _append_slots(self, if len(cows) > 0: blocks_to_copy.extend(cows) - def _preempt( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - preemption_mode: Optional[PreemptionMode] = None, - ) -> PreemptionMode: + def _preempt(self, seq_group: SequenceGroup, + blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than # swapping. However, when the sequence group has multiple sequences diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index c3632aee6d11a..62929dc0feaaf 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,5 +1,6 @@ +import ctypes from contextlib import contextmanager -from typing import Any, List, Optional, Union +from typing import List, Optional, Union import torch import torch.distributed as dist @@ -7,6 +8,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary from vllm.distributed.device_communicators.custom_all_reduce_utils import ( gpu_p2p_access_check) from vllm.distributed.parallel_state import in_the_same_node_as @@ -145,18 +147,14 @@ def __init__(self, return self.disabled = False - # buffers memory are owned by this Python class and passed to C++ - # meta data composes of two parts: meta data for synchronization - # (256 bytes) and a temporary buffer for storing intermediate - # allreduce results. - self.meta = torch.zeros(ops.meta_size() + max_size, - dtype=torch.uint8, - device=self.device) + # Buffers memory are owned by this Python class and passed to C++. + # Meta data composes of two parts: meta data for synchronization and a + # temporary buffer for storing intermediate allreduce results. + self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size, + group=group) # This is a pre-registered IPC buffer. In eager mode, input tensors # are first copied into this buffer before allreduce is performed - self.buffer = torch.empty(max_size, - dtype=torch.uint8, - device=self.device) + self.buffer_ptrs = self.create_shared_buffer(max_size, group=group) # This is a buffer for storing the tuples of pointers pointing to # IPC buffers from all ranks. Each registered tuple has size of # 8*world_size bytes where world_size is at most 8. 
Allocating 8MB @@ -168,11 +166,43 @@ def __init__(self, self.max_size = max_size self.rank = rank self.world_size = world_size - handles, offsets = self._get_ipc_meta(self.meta) self.full_nvlink = full_nvlink - self._ptr = ops.init_custom_ar(self.meta, self.rank_data, handles, - offsets, rank, self.full_nvlink) - self.register_buffer(self.buffer) + self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank, + self.full_nvlink) + ops.register_buffer(self._ptr, self.buffer_ptrs) + + @staticmethod + def create_shared_buffer( + size_in_bytes: int, + group: Optional[ProcessGroup] = None) -> List[int]: + """ + Creates a shared buffer and returns a list of pointers + representing the buffer on all processes in the group. + """ + lib = CudaRTLibrary() + pointer = lib.cudaMalloc(size_in_bytes) + handle = lib.cudaIpcGetMemHandle(pointer) + world_size = dist.get_world_size(group=group) + rank = dist.get_rank(group=group) + handles = [None] * world_size + dist.all_gather_object(handles, handle, group=group) + + pointers: List[int] = [] + for i, h in enumerate(handles): + if i == rank: + pointers.append(pointer.value) # type: ignore + else: + pointers.append( + lib.cudaIpcOpenMemHandle(h).value) # type: ignore + + return pointers + + @staticmethod + def free_shared_buffer(pointers: List[int], + group: Optional[ProcessGroup] = None) -> None: + rank = dist.get_rank(group=group) + lib = CudaRTLibrary() + lib.cudaFree(ctypes.c_void_p(pointers[rank])) @contextmanager def capture(self): @@ -189,60 +219,24 @@ def capture(self): if not self.disabled: self.register_graph_buffers() - def _get_ipc_meta(self, inp: torch.Tensor): - data = inp.untyped_storage()._share_cuda_() - handle = data[1] - # https://github.com/pytorch/pytorch/pull/130890 changes - # the binary format of the ipc handle - # it starts from pytorch 2.5 - if len(handle) > 64: - assert len(handle) == 66 - # only support SHAREABLE_HANDLE_VERSION = 1 - assert int(handle[0]) == 1 - # only support SHAREABLE_CUDA_MALLOC = 'c' - assert handle[1] == ord("c") - handle = handle[2:] - # TODO: support expandable segment - shard_data = ( - handle, # ipc handle to base ptr - data[3], # offset of base ptr - ) - return self._gather_ipc_meta(shard_data) - - def _gather_ipc_meta(self, shard_data): - # Note: don't use `[[None]] * self.world_size` here - # because it will create a list of the same reference - all_data: List[Optional[Any]] = [[None] - for i in range(self.world_size)] - all_data[self.rank][0] = shard_data - - ranks = dist.get_process_group_ranks(group=self.group) - ranks.sort() + def register_graph_buffers(self): + handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) + logger.info("Registering %d cuda graph addresses", len(offset)) + # We cannot directly use `dist.all_gather_object` here + # because it is incompatible with `gloo` backend under inference mode. + # see https://github.com/pytorch/pytorch/issues/126032 for details. + all_data = [[None, None] + for _ in range(dist.get_world_size(group=self.group))] + all_data[self.rank] = [handle, offset] + ranks = sorted(dist.get_process_group_ranks(group=self.group)) for i, rank in enumerate(ranks): dist.broadcast_object_list(all_data[i], src=rank, group=self.group, device="cpu") - - # we cannot directly use `dist.all_gather_object` here - # because it is incompatible with `gloo` backend under inference mode. - # see https://github.com/pytorch/pytorch/issues/126032 for details. 
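
The CUDA IPC plumbing introduced above (`create_shared_buffer` / `free_shared_buffer`) can be exercised on its own. A minimal sketch, assuming `torch.distributed` is already initialized, `group` is a non-NCCL (e.g. gloo) group, one CUDA device is bound per rank, and the enclosing class is `CustomAllreduce` as in this module; the buffer size and helper name are illustrative only:

import torch.distributed as dist
from torch.distributed import ProcessGroup

from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce


def demo_shared_buffer(group: ProcessGroup, nbytes: int = 1 << 20) -> None:
    # Each rank cudaMalloc's `nbytes`, exports a CUDA IPC handle, and opens the
    # handles of its peers; pointers[i] addresses rank i's allocation.
    pointers = CustomAllreduce.create_shared_buffer(nbytes, group=group)
    try:
        assert len(pointers) == dist.get_world_size(group=group)
        # In the class above, such pointer lists are what init_custom_ar and
        # register_buffer consume.
    finally:
        # Only the locally owned allocation is cudaFree'd by this call.
        CustomAllreduce.free_shared_buffer(pointers, group=group)
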
- - handles = [] - offsets = [] - for i in range(len(all_data)): - handles.append(all_data[i][0][0]) # type: ignore - offsets.append(all_data[i][0][1]) # type: ignore - return handles, offsets - - def register_buffer(self, inp: torch.Tensor): - handles, offsets = self._get_ipc_meta(inp) - ops.register_buffer(self._ptr, inp, handles, offsets) - - def register_graph_buffers(self): - handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) - handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) - logger.info("Registering %d cuda graph addresses", len(offset)) + # Unpack list of tuples to tuple of lists. + handles = [d[0] for d in all_data] # type: ignore + offsets = [d[1] for d in all_data] # type: ignore ops.register_graph_buffers(self._ptr, handles, offsets) def should_custom_ar(self, inp: torch.Tensor): @@ -260,45 +254,50 @@ def should_custom_ar(self, inp: torch.Tensor): return inp_size < self.max_size return False - # all reduce, assuming inp tensor is IPC registered with register_buffer, - # or, in the context of cuda graphs, register_graph_buffers - def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): - if out is None: - out = torch.empty_like(inp) - ops.all_reduce_reg(self._ptr, inp, out) - return out - - # all reduce, assuming inp tensor is NOT IPC registered - def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): + def all_reduce(self, + inp: torch.Tensor, + *, + out: torch.Tensor = None, + registered: bool = False): + """Performs an out-of-place all reduce. + + If registered is True, this assumes inp's pointer is already + IPC-registered. Otherwise, inp is first copied into a pre-registered + buffer. + """ if out is None: out = torch.empty_like(inp) - ops.all_reduce_unreg(self._ptr, inp, self.buffer, out) + if registered: + ops.all_reduce(self._ptr, inp, out, 0, 0) + else: + ops.all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank], + self.max_size) return out def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: - # when custom allreduce is disabled, this will be None + """The main allreduce API that provides support for cuda graph.""" + # When custom allreduce is disabled, this will be None. if self.disabled or not self.should_custom_ar(input): return None if self._IS_CAPTURING: if torch.cuda.is_current_stream_capturing(): - return self.all_reduce_reg(input) + return self.all_reduce(input, registered=True) else: - # if warm up, mimic the allocation pattern - # since custom allreduce is out-of-place + # If warm up, mimic the allocation pattern since custom + # allreduce is out-of-place. 
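
The broadcast-based gather in `register_graph_buffers` above is a general workaround for `dist.all_gather_object` being incompatible with the gloo backend under inference mode (pytorch/pytorch#126032). The same pattern in isolation, as a sketch assuming an already-initialized process group whose group rank i owns the i-th smallest global rank (as in the caller above):

import torch.distributed as dist


def all_gather_via_broadcast(obj, group):
    # Emulate dist.all_gather_object with one broadcast_object_list per rank.
    world_size = dist.get_world_size(group=group)
    rank = dist.get_rank(group=group)
    all_data = [[None] for _ in range(world_size)]
    all_data[rank][0] = obj
    # `src` in broadcast_object_list is the global rank, hence
    # get_process_group_ranks; sorting mirrors the usage above.
    for i, src in enumerate(sorted(dist.get_process_group_ranks(group))):
        dist.broadcast_object_list(all_data[i],
                                   src=src,
                                   group=group,
                                   device="cpu")
    return [entry[0] for entry in all_data]
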
return torch.empty_like(input) else: - # note: outside of cuda graph context, - # custom allreduce incurs a cost of cudaMemcpy, which should - # be small(<=1% of overall latency) compared to the performance - # gains of using custom kernels - return self.all_reduce_unreg(input) - - return None + # Note: outside of cuda graph context, custom allreduce incurs a + # cost of cudaMemcpy, which should be small (<=1% of overall + # latency) compared to the performance gain of using custom kernels + return self.all_reduce(input, registered=False) def close(self): if not self.disabled and self._ptr: ops.dispose(self._ptr) self._ptr = 0 + self.free_shared_buffer(self.meta_ptrs) + self.free_shared_buffer(self.buffer_ptrs) def __del__(self): self.close() diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py index 983e772a3f79b..1f78e10cc1dcd 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py +++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py @@ -240,7 +240,7 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool: if is_distributed: get_world_group().barrier() logger.info("reading GPU P2P access cache from %s", path) - with open(path, "r") as f: + with open(path) as f: cache = json.load(f) _gpu_p2p_access_cache = cache return _gpu_p2p_access_cache[f"{src}->{tgt}"] diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py new file mode 100644 index 0000000000000..cc9b19ce022b5 --- /dev/null +++ b/vllm/distributed/device_communicators/hpu_communicator.py @@ -0,0 +1,48 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + +if current_platform.is_hpu(): + import habana_frameworks.torch as htorch # noqa: F401 + + +class HpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_hpu(): + self.disabled = True + return + self.disabled = False + self.group = group + self.world_size = dist.get_world_size(self.group) + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge + # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used + # (which is required for tensor parallel HPUGraph inference) + htorch.core.mark_step() + dist.all_reduce(x, group=self.group) + return x + + def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + if dim < 0: + # Convert negative dim to positive. + dim += x.dim() + input_size = x.size() + # Allocate output tensor. + output_tensor = torch.empty((world_size, ) + input_size, + dtype=x.dtype, + device=x.device) + # All-gather. 
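
The movedim-plus-reshape step used just below (and again in the XPU gather further down) turns the gathered (world_size, *input_size) tensor into the concatenation of the per-rank shards along `dim`. The index arithmetic can be checked on CPU; a standalone sketch with an illustrative world size of 4:

import torch


def concat_gathered(stacked: torch.Tensor, dim: int) -> torch.Tensor:
    # stacked has shape (world_size, *input_size); fold the leading world
    # dimension into `dim`, exactly as the communicators do.
    world_size = stacked.size(0)
    input_size = stacked.size()[1:]
    out = stacked.movedim(0, dim)
    return out.reshape(input_size[:dim] + (world_size * input_size[dim], ) +
                       input_size[dim + 1:])


shards = [torch.full((2, 3), float(r)) for r in range(4)]
result = concat_gathered(torch.stack(shards), dim=1)
assert result.shape == (2, 12)                 # (2, 3) from 4 ranks along dim=1
assert torch.equal(result[:, 3:6], shards[1])  # rank 1's shard sits in cols 3:6
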
+ htorch.core.mark_step() + dist.all_gather_into_tensor(output_tensor, x, group=self.group) + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) + return output_tensor diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7319566545678..7c6f48e88637b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -9,6 +9,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum, ncclRedOpTypeEnum, ncclUniqueId) +from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +19,7 @@ class PyNcclCommunicator: def __init__( self, - group: ProcessGroup, + group: Union[ProcessGroup, StatelessProcessGroup], device: Union[int, str, torch.device], library_path: Optional[str] = None, ): @@ -33,13 +34,18 @@ def __init__( It is the caller's responsibility to make sure each communicator is bind to a unique device. """ - assert dist.is_initialized() - assert dist.get_backend(group) != dist.Backend.NCCL, ( - "PyNcclCommunicator should be attached to a non-NCCL group.") + if not isinstance(group, StatelessProcessGroup): + assert dist.is_initialized() + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "PyNcclCommunicator should be attached to a non-NCCL group.") + # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + self.group = group - # note: this rank is the rank in the group - self.rank = dist.get_rank(group) - self.world_size = dist.get_world_size(group) # if world_size == 1, no need to create communicator if self.world_size == 1: @@ -68,13 +74,17 @@ def __init__( else: # construct an empty unique id self.unique_id = ncclUniqueId() - tensor = torch.ByteTensor(list(self.unique_id.internal)) - ranks = dist.get_process_group_ranks(group) - # arg `src` in `broadcast` is the global rank - dist.broadcast(tensor, src=ranks[0], group=group) - byte_list = tensor.tolist() - for i, byte in enumerate(byte_list): - self.unique_id.internal[i] = byte + + if not isinstance(group, StatelessProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) if isinstance(device, int): device = torch.device(f"cuda:{device}") elif isinstance(device, str): diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 7d526b25ed193..2ff1a1ead99c1 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -1,3 +1,4 @@ +import os import pickle import time from contextlib import contextmanager @@ -18,12 +19,6 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL -# time to wait if the queue is full or empty -# if we sleep for too short, it will consume too much CPU -# if we sleep for too long, it will 
slow down the writer/reader -# 0.1 us is a good balance -RINGBUFFER_SLEEP_INTERVAL = 1e-7 - logger = init_logger(__name__) @@ -333,8 +328,8 @@ def acquire_write(self): # if this block is not ready to write, # we need to wait until it is read by all readers - # wait for a while - time.sleep(RINGBUFFER_SLEEP_INTERVAL) + # Release the processor to other threads + os.sched_yield() # if we wait for a long time, we should warn the user if (time.monotonic() - start_time > @@ -387,8 +382,8 @@ def acquire_read(self): # if this block is not ready, # we need to wait until it is written - # wait for a while - time.sleep(RINGBUFFER_SLEEP_INTERVAL) + # Release the processor to other threads + os.sched_yield() # if we wait for a long time, we should warn the user if (time.monotonic() - start_time > diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py new file mode 100644 index 0000000000000..eafd3c2f67749 --- /dev/null +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -0,0 +1,47 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + + +class XpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_xpu(): + self.disabled = True + return + self.disabled = False + self.group = group + self.world_size = dist.get_world_size(self.group) + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + dist.all_reduce(x, group=self.group) + return x + + def gather(self, + input_: torch.Tensor, + rank_in_group: int, + dst: int = 0, + dim: int = -1): + # For xpu path, gather doesn't work properly together with ray + # cluster so we use all_gather instead for now. + input_size = input_.size() + # Allocate output tensor. + output_tensor = torch.empty((self.world_size, ) + input_size, + dtype=input_.dtype, + device=input_.device) + # All-gather. + torch.distributed.all_gather_into_tensor(output_tensor, + input_, + group=self.group) + if rank_in_group == dst: + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (self.world_size * + input_size[dim], ) + + input_size[dim + 1:]) + else: + output_tensor = None + return output_tensor diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b04bbc478534c..87ade377266a2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -37,7 +37,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import supports_custom_op +from vllm.utils import direct_register_custom_op, supports_custom_op @dataclass @@ -89,18 +89,15 @@ def _get_unique_name(name: str) -> str: return newname -_groups: Dict[str, Callable[[], "GroupCoordinator"]] = {} +_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} def _register_group(group: "GroupCoordinator") -> None: - # looks like Python 3.8 does not understand `ReferenceType` - _groups[group.unique_name] = weakref.ref(group) # type: ignore + _groups[group.unique_name] = weakref.ref(group) if supports_custom_op(): - @torch.library.custom_op("vllm::inplace_all_reduce", - mutates_args=["tensor"]) def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: assert group_name in _groups, f"Group {group_name} is not found." 
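
Custom ops in this file are now registered through `direct_register_custom_op` (imported from `vllm.utils` above) instead of the `torch.library.custom_op` decorator: each op supplies a real implementation plus a fake implementation that only has to get output shapes and dtypes right for tracing. A minimal sketch for a hypothetical out-of-place op, assuming the helper keeps the keyword signature used here and registers into the same `vllm` torch.ops namespace as the ops below:

import torch

from vllm.utils import direct_register_custom_op


def scaled_copy(tensor: torch.Tensor, scale: float) -> torch.Tensor:
    # Real implementation, run outside of tracing.
    return tensor * scale


def scaled_copy_fake(tensor: torch.Tensor, scale: float) -> torch.Tensor:
    # Fake/meta implementation: only the output shape and dtype matter.
    return torch.empty_like(tensor)


direct_register_custom_op(
    op_name="scaled_copy",
    op_func=scaled_copy,
    mutates_args=[],
    fake_impl=scaled_copy_fake,
)

# After registration the op would be callable as torch.ops.vllm.scaled_copy(x, 2.0),
# analogous to the outplace_all_reduce op registered below.
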
group = _groups[group_name]() @@ -108,11 +105,16 @@ def inplace_all_reduce(tensor: torch.Tensor, group_name: str) -> None: raise ValueError(f"Group {group_name} is destroyed.") group._all_reduce_in_place(tensor) - @inplace_all_reduce.register_fake - def _(tensor: torch.Tensor, group_name: str) -> None: + def inplace_all_reduce_fake(tensor: torch.Tensor, group_name: str) -> None: return - @torch.library.custom_op("vllm::outplace_all_reduce", mutates_args=[]) + direct_register_custom_op( + op_name="inplace_all_reduce", + op_func=inplace_all_reduce, + mutates_args=["tensor"], + fake_impl=inplace_all_reduce_fake, + ) + def outplace_all_reduce(tensor: torch.Tensor, group_name: str) -> torch.Tensor: assert group_name in _groups, f"Group {group_name} is not found." @@ -121,10 +123,17 @@ def outplace_all_reduce(tensor: torch.Tensor, raise ValueError(f"Group {group_name} is destroyed.") return group._all_reduce_out_place(tensor) - @outplace_all_reduce.register_fake - def _(tensor: torch.Tensor, group_name: str) -> torch.Tensor: + def outplace_all_reduce_fake(tensor: torch.Tensor, + group_name: str) -> torch.Tensor: return torch.empty_like(tensor) + direct_register_custom_op( + op_name="outplace_all_reduce", + op_func=outplace_all_reduce, + mutates_args=[], + fake_impl=outplace_all_reduce_fake, + ) + class GroupCoordinator: """ @@ -167,6 +176,8 @@ def __init__( use_pynccl: bool, use_custom_allreduce: bool, use_tpu_communicator: bool, + use_hpu_communicator: bool, + use_xpu_communicator: bool, use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ): @@ -203,6 +214,8 @@ def __init__( self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce self.use_tpu_communicator = use_tpu_communicator + self.use_hpu_communicator = use_hpu_communicator + self.use_xpu_communicator = use_xpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -231,6 +244,18 @@ def __init__( if use_tpu_communicator and self.world_size > 1: self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + from vllm.distributed.device_communicators.hpu_communicator import ( + HpuCommunicator) + self.hpu_communicator: Optional[HpuCommunicator] + if use_hpu_communicator and self.world_size > 1: + self.hpu_communicator = HpuCommunicator(group=self.device_group) + + from vllm.distributed.device_communicators.xpu_communicator import ( + XpuCommunicator) + self.xpu_communicator: Optional[XpuCommunicator] + if use_xpu_communicator and self.world_size > 1: + self.xpu_communicator = XpuCommunicator(group=self.device_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -338,6 +363,11 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: if self.world_size == 1: return input_ + if input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + return input_ + if not supports_custom_op(): self._all_reduce_in_place(input_) return input_ @@ -347,6 +377,14 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: # TPU handles Dynamo with its own logic. 
return self.tpu_communicator.all_reduce(input_) + if self.hpu_communicator is not None and \ + not self.hpu_communicator.disabled: + return self.hpu_communicator.all_reduce(input_) + + if self.xpu_communicator is not None and \ + not self.xpu_communicator.disabled: + return self.xpu_communicator.all_reduce(input_) + if self.ca_comm is not None and \ not self.ca_comm.disabled and \ self.ca_comm.should_custom_ar(input_): @@ -369,9 +407,6 @@ def _all_reduce_in_place(self, input_: torch.Tensor) -> None: pynccl_comm = self.pynccl_comm if (pynccl_comm is not None and not pynccl_comm.disabled): pynccl_comm.all_reduce(input_) - elif input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) else: torch.distributed.all_reduce(input_, group=self.device_group) @@ -388,6 +423,11 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: if tpu_comm is not None and not tpu_comm.disabled: return tpu_comm.all_gather(input_, dim) + # For HPUs, use HPU communicator. + hpu_comm = self.hpu_communicator + if hpu_comm is not None and not hpu_comm.disabled: + return hpu_comm.all_gather(input_, dim) + if dim < 0: # Convert negative dim to positive. dim += input_.dim() @@ -431,28 +471,10 @@ def gather(self, if dim < 0: # Convert negative dim to positive. dim += input_.dim() - # For xpu path, gather doesn't work properly together with ray - # cluster so we use all_gather instead for now. - if current_platform.is_xpu(): - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. - torch.distributed.all_gather_into_tensor(output_tensor, - input_, - group=self.device_group) - if self.rank_in_group == dst: - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * - input_size[dim], ) + - input_size[dim + 1:]) - else: - output_tensor = None - return output_tensor + if self.xpu_communicator is not None and \ + not self.xpu_communicator.disabled: + return self.xpu_communicator.gather(input_, self.rank_in_group, + dst, dim) # Allocate output tensor. if self.rank_in_group == dst: gather_list = [torch.empty_like(input_) for _ in range(world_size)] @@ -867,6 +889,8 @@ def init_world_group(ranks: List[int], local_rank: int, use_pynccl=False, use_custom_allreduce=False, use_tpu_communicator=False, + use_hpu_communicator=False, + use_xpu_communicator=False, group_name="world", ) @@ -888,6 +912,8 @@ def init_model_parallel_group( use_pynccl=True, use_custom_allreduce=use_custom_allreduce, use_tpu_communicator=True, + use_hpu_communicator=True, + use_xpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, group_name=group_name, ) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 8c94ef8cb10ce..dcfcb848cbe06 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -2,9 +2,14 @@ # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
-from typing import Sequence, Tuple +import dataclasses +import pickle +import time +from collections import deque +from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch +from torch.distributed import TCPStore import vllm.envs as envs from vllm.logger import init_logger @@ -84,3 +89,139 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, end_layer = num_hidden_layers return (start_layer, end_layer) + + +@dataclasses.dataclass +class StatelessProcessGroup: + """A dataclass to hold a metadata store, and the rank, world_size of the + group. Only use it to communicate metadata between processes. + For data-plane communication, create NCCL-related objects. + """ + rank: int + world_size: int + store: torch._C._distributed_c10d.Store + data_expiration_seconds: int = 3600 # 1 hour + + # dst rank -> counter + send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + # src rank -> counter + recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + broadcast_send_counter: int = 0 + broadcast_recv_src_counter: Dict[int, int] = dataclasses.field( + default_factory=dict) + + # A deque to store the data entries, with key and timestamp. + entries: Deque[Tuple[str, + float]] = dataclasses.field(default_factory=deque) + + def __post_init__(self): + assert self.rank < self.world_size + self.send_dst_counter = {i: 0 for i in range(self.world_size)} + self.recv_src_counter = {i: 0 for i in range(self.world_size)} + self.broadcast_recv_src_counter = { + i: 0 + for i in range(self.world_size) + } + + def send_obj(self, obj: Any, dst: int): + """Send an object to a destination rank.""" + self.expire_data() + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" + self.store.set(key, pickle.dumps(obj)) + self.send_dst_counter[dst] += 1 + self.entries.append((key, time.time())) + + def expire_data(self): + """Expire data that is older than `data_expiration_seconds` seconds.""" + while self.entries: + # check the oldest entry + key, timestamp = self.entries[0] + if time.time() - timestamp > self.data_expiration_seconds: + self.store.delete_key(key) + self.entries.popleft() + else: + break + + def recv_obj(self, src: int) -> Any: + """Receive an object from a source rank.""" + obj = pickle.loads( + self.store.get( + f"send_to/{self.rank}/{self.recv_src_counter[src]}")) + self.recv_src_counter[src] += 1 + return obj + + def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: + """Broadcast an object from a source rank to all other ranks. + It does not clean up after all ranks have received the object. + Use it for limited times, e.g., for initialization. 
+ """ + if self.rank == src: + self.expire_data() + key = (f"broadcast_from/{src}/" + f"{self.broadcast_send_counter}") + self.store.set(key, pickle.dumps(obj)) + self.broadcast_send_counter += 1 + self.entries.append((key, time.time())) + return obj + else: + key = (f"broadcast_from/{src}/" + f"{self.broadcast_recv_src_counter[src]}") + recv_obj = pickle.loads(self.store.get(key)) + self.broadcast_recv_src_counter[src] += 1 + return recv_obj + + def all_gather_obj(self, obj: Any) -> list[Any]: + """All gather an object from all ranks.""" + gathered_objs = [] + for i in range(self.world_size): + if i == self.rank: + gathered_objs.append(obj) + self.broadcast_obj(obj, src=self.rank) + else: + recv_obj = self.broadcast_obj(None, src=i) + gathered_objs.append(recv_obj) + return gathered_objs + + def barrier(self): + """A barrier to synchronize all ranks.""" + for i in range(self.world_size): + if i == self.rank: + self.broadcast_obj(None, src=self.rank) + else: + self.broadcast_obj(None, src=i) + + @staticmethod + def create( + host: str, + port: int, + rank: int, + world_size: int, + data_expiration_seconds: int = 3600, + ) -> "StatelessProcessGroup": + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. + + If we have process A and process B called `torch.distributed.init_process_group` + to form a group, and then we want to form another group with process A, B, C, + D, it is not possible in PyTorch, because process A and process B have already + formed a group, and process C and process D cannot join that group. This + function is a workaround for this issue. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It will return a `StatelessProcessGroup` object that can be + used for exchanging metadata. With this function, process A and process B + can call `StatelessProcessGroup.create` to form a group, and then process A, B, + C, and D can call `StatelessProcessGroup.create` to form another group. 
+ """ # noqa + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=(rank == 0), + ) + + return StatelessProcessGroup( + rank=rank, + world_size=world_size, + store=store, + data_expiration_seconds=data_expiration_seconds) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b1f0f8b9df925..31aa8c5908719 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,15 +9,16 @@ import vllm.envs as envs from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, EngineConfig, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig) + DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig, TaskOption, TokenizerPoolConfig, + VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger +from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) +from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -36,6 +37,7 @@ "openvino", "tpu", "xpu", + "hpu", ] @@ -91,6 +93,7 @@ class EngineArgs: tokenizer_mode: str = 'auto' chat_template_text_format: str = 'string' trust_remote_code: bool = False + allowed_local_media_path: str = "" download_dir: Optional[str] = None load_format: str = 'auto' config_format: ConfigFormat = ConfigFormat.AUTO @@ -108,7 +111,9 @@ class EngineArgs: pipeline_parallel_size: int = 1 tensor_parallel_size: int = 1 max_parallel_loading_workers: Optional[int] = None - block_size: int = 16 + # NOTE(kzawora): default block size for Gaudi should be 128 + # smaller sizes still work, but very inefficiently + block_size: int = 16 if not current_platform.is_hpu() else 128 enable_prefix_caching: bool = False disable_sliding_window: bool = False use_v2_block_manager: bool = True @@ -121,8 +126,9 @@ class EngineArgs: disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None - rope_scaling: Optional[dict] = None + rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None + hf_overrides: Optional[Dict[str, Any]] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None @@ -133,9 +139,11 @@ class EngineArgs: # is intended for expert use only. The API may change without # notice. tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray" - tokenizer_pool_extra_config: Optional[dict] = None + tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None limit_mm_per_prompt: Optional[Mapping[str, int]] = None + mm_processor_kwargs: Optional[Dict[str, Any]] = None enable_lora: bool = False + enable_lora_bias: bool = False max_loras: int = 1 max_lora_rank: int = 16 enable_prompt_adapter: bool = False @@ -180,7 +188,6 @@ class EngineArgs: collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False override_neuron_config: Optional[Dict[str, Any]] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None scheduling_policy: Literal["fcfs", "priority"] = "fcfs" # Pooling configuration. 
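
The `StatelessProcessGroup` introduced in vllm/distributed/utils.py above pairs with the updated `PyNcclCommunicator`, which now accepts it in place of a regular `ProcessGroup`. A sketch of the intended wiring, assuming every process can reach the master's TCP port and one CUDA device is assigned per rank (host and port values are illustrative):

import torch

from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup


def init_stateless_nccl(rank: int, world_size: int) -> PyNcclCommunicator:
    # Metadata (the NCCL unique id) travels through the TCPStore-backed group;
    # data-plane traffic then goes through NCCL, without ever touching
    # torch.distributed's global process-group state.
    pg = StatelessProcessGroup.create(host="127.0.0.1",
                                      port=29600,
                                      rank=rank,
                                      world_size=world_size)
    return PyNcclCommunicator(pg, device=torch.device(f"cuda:{rank}"))
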
@@ -268,6 +275,13 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') + parser.add_argument( + '--allowed-local-media-path', + type=str, + help="Allowing API requests to read local images or videos" + "from directories specified by the server file system." + "This is a security risk." + "Should only be enabled in trusted environments") parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, @@ -356,9 +370,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--distributed-executor-backend', choices=['ray', 'mp'], default=EngineArgs.distributed_executor_backend, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') + help='Backend to use for distributed model ' + 'workers, either "ray" or "mp" (multiprocessing). If the product ' + 'of pipeline_parallel_size and tensor_parallel_size is less than ' + 'or equal to the number of GPUs available, "mp" will be used to ' + 'keep processing on a single host. Otherwise, this will default ' + 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' + 'and hpu only support Ray for distributed inference.') + parser.add_argument( '--worker-use-ray', action='store_true', @@ -388,7 +407,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--block-size', type=int, default=EngineArgs.block_size, - choices=[8, 16, 32], + choices=[8, 16, 32, 64, 128], help='Token block size for contiguous chunks of ' 'tokens. This is ignored on neuron devices and ' 'set to max-model-len') @@ -498,6 +517,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='RoPE theta. Use with `rope_scaling`. In ' 'some cases, changing the RoPE theta improves the ' 'performance of the scaled model.') + parser.add_argument('--hf-overrides', + type=json.loads, + default=EngineArgs.hf_overrides, + help='Extra arguments for the HuggingFace config.' + 'This should be a JSON string that will be ' + 'parsed into a dictionary.') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. If False, ' @@ -560,6 +585,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--enable-lora', action='store_true', help='If True, enable handling of LoRA adapters.') + parser.add_argument('--enable-lora-bias', + action='store_true', + help='If True, enable bias for LoRA adapters.') parser.add_argument('--max-loras', type=int, default=EngineArgs.max_loras, @@ -598,8 +626,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) + 'Must be >= than max_loras. ' + 'Defaults to max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', @@ -850,7 +878,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--pooling-type', - choices=['LAST', 'ALL', 'CLS', 'STEP'], + choices=[pt.name for pt in PoolingType], default=None, help='Used to configure the pooling method in the embedding model.' 
) @@ -919,12 +947,14 @@ def create_model_config(self) -> ModelConfig: tokenizer_mode=self.tokenizer_mode, chat_template_text_format=self.chat_template_text_format, trust_remote_code=self.trust_remote_code, + allowed_local_media_path=self.allowed_local_media_path, dtype=self.dtype, seed=self.seed, revision=self.revision, code_revision=self.code_revision, rope_scaling=self.rope_scaling, rope_theta=self.rope_theta, + hf_overrides=self.hf_overrides, tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, @@ -955,7 +985,7 @@ def create_load_config(self) -> LoadConfig: ignore_patterns=self.ignore_patterns, ) - def create_engine_config(self) -> EngineConfig: + def create_engine_config(self) -> VllmConfig: # gguf file needs a specific model loader and doesn't use hf_repo if check_gguf_file(self.model): self.quantization = self.load_format = "gguf" @@ -990,8 +1020,6 @@ def create_engine_config(self) -> EngineConfig: "supported for multimodal models and has been disabled.") self.enable_prefix_caching = False - maybe_register_config_serialize_by_value(self.trust_remote_code) - cache_config = CacheConfig( # neuron needs block_size = max_model_len block_size=self.block_size if self.device != "neuron" else @@ -1122,9 +1150,9 @@ def create_engine_config(self) -> EngineConfig: multi_step_stream_outputs=self.multi_step_stream_outputs, send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER and parallel_config.use_ray), - policy=self.scheduling_policy, - ) + policy=self.scheduling_policy) lora_config = LoRAConfig( + bias_enabled=self.enable_lora_bias, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, fully_sharded_loras=self.fully_sharded_loras, @@ -1167,7 +1195,7 @@ def create_engine_config(self) -> EngineConfig: or "all" in detailed_trace_modules, ) - return EngineConfig( + return VllmConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 5198467a6ac40..5a5388708b1c6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -7,8 +7,8 @@ from weakref import ReferenceType import vllm.envs as envs -from vllm.config import (DecodingConfig, EngineConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, + ParallelConfig, SchedulerConfig, VllmConfig) from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_timeout import asyncio_timeout @@ -19,6 +19,7 @@ from vllm.executor.gpu_executor import GPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( @@ -604,7 +605,7 @@ def __del__(self): @classmethod def _get_executor_cls( - cls, engine_config: EngineConfig) -> Type[ExecutorAsyncBase]: + cls, engine_config: VllmConfig) -> Type[ExecutorAsyncBase]: distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) if isinstance(distributed_executor_backend, type): @@ -627,6 +628,14 @@ def _get_executor_cls( elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutorAsync executor_class = CPUExecutorAsync + elif engine_config.device_config.device_type == "hpu": + if 
distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync + executor_class = RayHPUExecutorAsync + else: + from vllm.executor.hpu_executor import HPUExecutorAsync + executor_class = HPUExecutorAsync elif engine_config.device_config.device_type == "openvino": assert distributed_executor_backend is None, ( "Distributed execution is not supported with " @@ -663,7 +672,7 @@ def _get_executor_cls( def from_engine_args( cls, engine_args: AsyncEngineArgs, - engine_config: Optional[EngineConfig] = None, + engine_config: Optional[VllmConfig] = None, start_engine_loop: bool = True, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, @@ -680,7 +689,7 @@ def from_engine_args( # Create the async LLM engine. engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, @@ -721,6 +730,9 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.engine.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, @@ -812,7 +824,7 @@ async def _engine_abort(self, request_ids: Iterable[str]): async def run_engine_loop(engine_ref: ReferenceType): """We use a weakref to the engine so that the running loop doesn't prevent the engine being garbage collected.""" - engine: Optional["AsyncLLMEngine"] = engine_ref() + engine: Optional[AsyncLLMEngine] = engine_ref() if not engine: return diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 3fd34fadee1ca..f5299746d845d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,14 +10,12 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeIs, TypeVar +from typing_extensions import TypeVar import vllm.envs as envs -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) from vllm.engine.arg_utils import EngineArgs @@ -31,9 +29,9 @@ from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputRegistry, PromptType, - TokensPrompt) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -41,6 +39,7 @@ from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import 
(EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams @@ -222,30 +221,36 @@ def validate_outputs( def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - decoding_config: Optional[DecodingConfig], - observability_config: Optional[ObservabilityConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], + vllm_config: VllmConfig, executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: + + # TODO: remove the local variables and use self.* throughout the class. + model_config = self.model_config = vllm_config.model_config + cache_config = self.cache_config = vllm_config.cache_config + lora_config = self.lora_config = vllm_config.lora_config + parallel_config = self.parallel_config = vllm_config.parallel_config + scheduler_config = self.scheduler_config = vllm_config.scheduler_config + device_config = self.device_config = vllm_config.device_config + speculative_config = self.speculative_config = vllm_config.speculative_config # noqa + load_config = self.load_config = vllm_config.load_config + decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + ) + prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa + observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa + ) + logger.info( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, " - "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " @@ -267,8 +272,6 @@ def __init__( model_config.tokenizer_mode, model_config.revision, model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, model_config.tokenizer_revision, model_config.trust_remote_code, model_config.dtype, @@ -334,24 +337,14 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( model_config) - self.model_executor = executor_class( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - speculative_config=speculative_config, - load_config=load_config, - prompt_adapter_config=prompt_adapter_config, - observability_config=self.observability_config, - ) + self.model_executor = executor_class(vllm_config=vllm_config, ) if self.model_config.task != "embedding": self._initialize_kv_caches() @@ -508,7 +501,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def 
_get_executor_cls(cls, - engine_config: EngineConfig) -> Type[ExecutorBase]: + engine_config: VllmConfig) -> Type[ExecutorBase]: distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) # Initialize the cluster and specify the executor class. @@ -535,6 +528,14 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "hpu": + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor + else: + from vllm.executor.hpu_executor import HPUExecutor + executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -582,7 +583,7 @@ def from_engine_args( executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, usage_context=usage_context, @@ -645,7 +646,7 @@ def _verify_args(self) -> None: def _add_processed_request( self, request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], + processed_inputs: ProcessorInputs, params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], @@ -676,18 +677,19 @@ def _add_processed_request( seq_id = next(self.seq_counter) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id, + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = processed_inputs["decoder"] + encoder_inputs = processed_inputs["encoder"] + else: + decoder_inputs = processed_inputs + encoder_inputs = None + + seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, lora_request, prompt_adapter_request) - encoder_seq = None - if 'encoder_prompt_token_ids' in processed_inputs: - encoder_seq = Sequence(seq_id, - processed_inputs, - block_size, - eos_token_id, - lora_request, - prompt_adapter_request, - from_decoder_prompt=False) + encoder_seq = (None if encoder_inputs is None else Sequence( + seq_id, encoder_inputs, block_size, eos_token_id, lora_request, + prompt_adapter_request)) # Create a SequenceGroup based on SamplingParams or PoolingParams if isinstance(params, SamplingParams): @@ -829,6 +831,13 @@ def add_request( raise ValueError(f"Got priority {priority} but " "Priority scheduling is not enabled.") + if isinstance(params, SamplingParams) \ + and (params.guided_decoding or params.logits_processors) \ + and self.scheduler_config.num_scheduler_steps > 1: + raise ValueError( + "Guided decoding and logits processors are not supported " + "in multi-step decoding") + if arrival_time is None: arrival_time = time.time() @@ -845,13 +854,6 @@ def add_request( ) processed_inputs = self.input_processor(preprocessed_inputs) - # This is a bit of a hack - copy the mm_processor_kwargs that were - # used in the input processor to the processed output, since these - # kwargs are presumed to be immutable and the values should be aligned - # between the input processor (here) and the input mapper. 
- processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( - "mm_processor_kwargs") - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -874,7 +876,7 @@ def _validate_token_prompt(self, prompt: PromptType, # This needs to happen before multimodal input pre-processing, which # may add dummy tokens that aren't part of the tokenizer's # vocabulary. - if self._is_token_prompt(prompt): + if is_token_prompt(prompt): prompt_ids = prompt["prompt_token_ids"] if len(prompt_ids) == 0: # Empty prompt check is handled later @@ -884,10 +886,6 @@ def _validate_token_prompt(self, prompt: PromptType, raise ValueError( "Token id {} is out of vocabulary".format(max_input_id)) - @staticmethod - def _is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: - return isinstance(prompt, dict) and "prompt_token_ids" in prompt - def _create_sequence_group_with_sampling( self, request_id: str, @@ -1544,8 +1542,8 @@ def _has_remaining_steps( seq_group.state.remaining_steps != ref_remaining_steps for seq_group in seq_group_metadata_list[1:] ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) + raise AssertionError("All running sequence groups should " + "have the same remaining steps.") return ref_remaining_steps > 0 @@ -1670,6 +1668,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 + num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else @@ -1678,6 +1677,10 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] time_in_queue_requests: List[float] = [] model_forward_time_requests: List[float] = [] model_execute_time_requests: List[float] = [] @@ -1685,6 +1688,8 @@ def _get_stats(self, num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] n_requests: List[int] = [] + max_num_generation_tokens_requests: List[int] = [] + max_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] # Lora requests @@ -1774,6 +1779,18 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + if (seq_group.metrics.first_scheduled_time is not None and + seq_group.metrics.first_token_time is not None): + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) + time_decode_requests.append( + now - seq_group.metrics.first_token_time) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) if seq_group.metrics.time_in_queue is not None: time_in_queue_requests.append( seq_group.metrics.time_in_queue) @@ -1790,8 +1807,13 @@ def _get_stats(self, seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) + max_num_generation_tokens_requests.append( + max(seq.get_output_len() + for seq in seq_group.get_seqs())) if seq_group.sampling_params is not None: n_requests.append(seq_group.sampling_params.n) + max_tokens_requests.append( + seq_group.sampling_params.max_tokens) finished_reason_requests.extend([ SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs() @@ -1806,7 
+1828,8 @@ def _get_stats(self, num_generation_tokens_iter = ( actual_num_batched_tokens - num_prompt_tokens_iter + num_generation_tokens_from_prefill_groups) - + num_tokens_iter = (num_generation_tokens_iter + + num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. if model_output and (model_output[0].spec_decode_worker_metrics @@ -1832,6 +1855,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter=num_prompt_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter, + num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, @@ -1840,13 +1864,20 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_queue_requests=time_queue_requests, + time_inference_requests=time_inference_requests, + time_prefill_requests=time_prefill_requests, + time_decode_requests=time_decode_requests, time_in_queue_requests=time_in_queue_requests, model_forward_time_requests=model_forward_time_requests, model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, n_requests=n_requests, + max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, max_lora=str(max_lora_stat), waiting_lora_adapters=list(waiting_lora_adapters.keys()), @@ -1974,17 +2005,17 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs], + def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): - if self.model_config.is_multimodal_model: + if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length - prompt_ids = inputs.get("prompt_token_ids") - elif self.is_encoder_decoder_model(): - prompt_ids = inputs.get("encoder_prompt_token_ids") + prompt_inputs = inputs["decoder" if self.model_config. 
+ is_multimodal_model else "encoder"] else: - prompt_ids = inputs.get("prompt_token_ids") + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 9ed30e1e99857..e896bcdded2d1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -111,6 +111,15 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) + self.counter_tokens = self._counter_cls( + name="vllm:tokens_total", + documentation="Number of prefill plus generation tokens processed.", + labelnames=labelnames) + self.histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + labelnames=labelnames, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]) self.histogram_time_to_first_token = self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -130,23 +139,45 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency + request_latency_buckets = [ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) + self.histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) self.histogram_time_in_queue_request = self._histogram_cls( name="vllm:time_in_queue_requests", documentation= "Histogram of time the request spent in the queue in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) self.histogram_model_forward_time_request = self._histogram_cls( name="vllm:model_forward_time_milliseconds", documentation= @@ -173,12 +204,24 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) + self.histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation 
tokens.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len)) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", labelnames=labelnames, buckets=[1, 2, 5, 10, 20], ) + self.histogram_max_tokens_request = self._histogram_cls( + name="vllm:request_params_max_tokens", + documentation="Histogram of the max_tokens request parameter.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) self.counter_request_success = self._counter_cls( name="vllm:request_success_total", documentation="Count of successfully processed requests.", @@ -520,6 +563,8 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_prompt_tokens_iter) self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter) + self._log_histogram(self.metrics.histogram_iteration_tokens, + [stats.num_tokens_iter]) self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, @@ -529,6 +574,14 @@ def _log_prometheus(self, stats: Stats) -> None: # Latency self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) + self._log_histogram(self.metrics.histogram_queue_time_request, + stats.time_queue_requests) + self._log_histogram(self.metrics.histogram_inference_time_request, + stats.time_inference_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, + stats.time_prefill_requests) + self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_decode_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request, stats.time_in_queue_requests) self._log_histogram(self.metrics.histogram_model_forward_time_request, @@ -547,6 +600,11 @@ def _log_prometheus(self, stats: Stats) -> None: self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) + self._log_histogram( + self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) + self._log_histogram(self.metrics.histogram_max_tokens_request, + stats.max_tokens_requests) def _log_prometheus_interval(self, prompt_throughput: float, generation_throughput: float) -> None: diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 510dd04bb3e55..5f7ec3bbcb269 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -39,6 +39,7 @@ class Stats: # Iteration stats (should have _iter suffix) num_prompt_tokens_iter: int num_generation_tokens_iter: int + num_tokens_iter: int time_to_first_tokens_iter: List[float] time_per_output_tokens_iter: List[float] num_preemption_iter: int @@ -46,6 +47,10 @@ class Stats: # Request stats (should have _requests suffix) # Latency time_e2e_requests: List[float] + time_queue_requests: List[float] + time_inference_requests: List[float] + time_prefill_requests: List[float] + time_decode_requests: List[float] time_in_queue_requests: List[float] model_forward_time_requests: List[float] model_execute_time_requests: List[float] @@ -53,6 +58,8 @@ class Stats: num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] n_requests: List[int] + max_num_generation_tokens_requests: List[int] + max_tokens_requests: List[int] finished_reason_requests: List[str] waiting_lora_adapters: List[str] running_lora_adapters: List[str] @@ -76,7 +83,7 @@ def 
__init__(self, local_interval: float) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None + self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 6e6630b3ff55f..fe21c58c775fe 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -13,7 +13,7 @@ from zmq.asyncio import Socket from vllm import PoolingParams -from vllm.config import DecodingConfig, EngineConfig, ModelConfig +from vllm.config import DecodingConfig, ModelConfig, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.engine.arg_utils import AsyncEngineArgs # yapf conflicts with isort for this block @@ -31,6 +31,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -78,7 +79,7 @@ class MQLLMEngineClient(EngineClient): every N seconds, confirming the engine is healthy """ - def __init__(self, ipc_path: str, engine_config: EngineConfig, + def __init__(self, ipc_path: str, engine_config: VllmConfig, engine_pid: int): self.context = zmq.asyncio.Context() self._errored_with: Optional[BaseException] = None @@ -94,6 +95,8 @@ def __init__(self, ipc_path: str, engine_config: EngineConfig, parallel_config=engine_config.parallel_config, enable_lora=bool(engine_config.lora_config), ) + self.input_preprocessor = InputPreprocessor(self.model_config, + self.tokenizer) # Send RPCGenerateRequest to the MQLLMEngine. self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) @@ -112,7 +115,11 @@ def __init__(self, ipc_path: str, engine_config: EngineConfig, # Stream for each individual request. self.output_queues: Dict[str, asyncio.Queue] = {} - self.output_loop = asyncio.create_task(self.run_output_handler_loop()) + + # Loop to handle output of the LLMEngine periodically. + # Started after the MQLLMEngine is ready so that we can + # build the Client in an executor to enable clean shutdown. + self.output_loop: Optional[asyncio.Task] = None # Loop to check health of the LLMEngine periodically. # Started after the MQLLMEngine is ready. @@ -247,6 +254,9 @@ async def run_output_handler_loop(self): async def setup(self): """Setup the client before it starts sending server requests.""" + # Start output_loop + self.output_loop = asyncio.create_task(self.run_output_handler_loop()) + with self.get_data_socket() as socket: # Wait until server is ready. response = await self._wait_for_server_rpc(socket) @@ -265,7 +275,8 @@ def close(self): # Cancel background tasks. 
if self.health_loop is not None: self.health_loop.cancel() - self.output_loop.cancel() + if self.output_loop is not None: + self.output_loop.cancel() def _set_errored(self, e: BaseException): logger.exception(repr(e)) @@ -337,6 +348,9 @@ async def _check_success(error_message: str, socket: Socket): or response != VLLM_RPC_SUCCESS_STR): raise ValueError(error_message) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.input_preprocessor + async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): return await self.tokenizer.get_lora_tokenizer_async(lora_request) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 0a7f430eca488..7de23643a2e1c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -7,8 +7,7 @@ import zmq from vllm import AsyncEngineArgs, SamplingParams -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block # yapf: disable from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, @@ -19,20 +18,11 @@ RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable -from vllm.envs import VLLM_USE_V1 from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext -if VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine -else: - from vllm.engine.llm_engine import LLMEngine - -CONFIG_TYPE = Union[ModelConfig, DecodingConfig, ParallelConfig, - SchedulerConfig, LoRAConfig] - logger = init_logger(__name__) POLLING_TIMEOUT_MS = 10000 @@ -122,15 +112,13 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, load_general_plugins() engine_config = engine_args.create_engine_config() - executor_class = LLMEngine._get_executor_cls(engine_config) - use_async_sockets = (engine_config.model_config.use_async_output_proc - and not VLLM_USE_V1) + use_async_sockets = engine_config.model_config.use_async_output_proc return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_requests=not engine_args.disable_log_requests, log_stats=not engine_args.disable_log_stats, @@ -310,6 +298,17 @@ def _health_check(self): def _send_outputs(self, outputs: REQUEST_OUTPUTS_T): """Send List of RequestOutput to RPCClient.""" if outputs: + try: + from ray.exceptions import RayTaskError + + # RayTaskError might not be picklable here. We need to unpack the + # underlying exception as the real exception in the output.
+ if (isinstance(outputs, RPCError) + and isinstance(outputs.exception, RayTaskError)): + outputs.exception = outputs.exception.cause + except ImportError: + pass + output_bytes = pickle.dumps(outputs) self.output_socket.send_multipart((output_bytes, ), copy=False) @@ -348,16 +347,22 @@ def stop_profile(self) -> None: self.engine.model_executor._run_workers("stop_profile") +def signal_handler(*_) -> None: + raise KeyboardInterrupt("MQLLMEngine terminated") + + def run_mp_engine(engine_args: AsyncEngineArgs, usage_context: UsageContext, - ipc_path: str): + ipc_path: str, engine_alive): + try: + engine = MQLLMEngine.from_engine_args(engine_args=engine_args, + usage_context=usage_context, + ipc_path=ipc_path) - def signal_handler(*_) -> None: - # Interrupt server on sigterm - raise KeyboardInterrupt("MQLLMEngine terminated") + signal.signal(signal.SIGTERM, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) + engine.start() - engine = MQLLMEngine.from_engine_args(engine_args=engine_args, - usage_context=usage_context, - ipc_path=ipc_path) - engine.start() + except BaseException as e: + logger.exception(e) + engine_alive.value = False + raise e diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 3ed37a269c4b4..7a6ebb430541f 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -63,7 +63,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, single_step_process_prompt_logprob(self, seq_group, output) @staticmethod - @functools.lru_cache() + @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid @@ -134,10 +134,12 @@ def process_outputs(self, sample for sample in samples if sample.output_token != VLLM_INVALID_TOKEN_ID ] - assert valid_samples - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) + # When both spec-decode and pre-fill chunking are enabled, we + # don't have guaranteed samples here (e.g. all -1s). + if valid_samples: + self._process_seq_outputs(seq, valid_samples, + sequence_group.sampling_params) def _process_decode_and_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None: diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index a71ad493d9920..4b701f81504bb 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional +from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -67,9 +67,13 @@ def maybe_stop_sequence( return # Check if any stop strings are matched. 
- stop_str = self._check_stop_strings(seq, new_char_count, - sampling_params) - if stop_str is not None: + stop = self.check_stop_strings( + seq.output_text, new_char_count, sampling_params.stop, + sampling_params.include_stop_str_in_output) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + seq.output_text = seq.output_text[:truncate_to] seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = stop_str return @@ -85,33 +89,40 @@ def maybe_stop_sequence( return @staticmethod - def _check_stop_strings(seq: Sequence, new_char_count: int, - sampling_params: SamplingParams) -> Optional[str]: + def check_stop_strings( + output_text: str, + new_char_count: int, + stop: List[str], + include_in_output: bool, + ) -> Optional[Tuple[str, int]]: """Check if any stop strings are matched and truncate sequence output text accordingly. - Returns the stop string if matched or else None. + Returns tuple (stop_string, offset) if matched or else None. + + Where stop_string is the matched stop string and offset is the + length to which output_text should be truncated, or -1 for no + truncation. """ - if not new_char_count or not sampling_params.stop: + if not new_char_count or not stop: return None - for stop_str in sampling_params.stop: + for stop_str in stop: stop_string_len = len(stop_str) # Avoid searching already-searched text. - stop_index = seq.output_text.find( - stop_str, -new_char_count - stop_string_len) + stop_index = output_text.find(stop_str, + -new_char_count - stop_string_len) if stop_index == -1: continue - if sampling_params.include_stop_str_in_output: + if include_in_output: # Truncate to end of stop string. stop_index += stop_string_len - if stop_index >= len(seq.output_text): + if stop_index >= len(output_text): # No truncation required. - return stop_str + return stop_str, -1 # Truncate the output text to either the beginning # or end of the stop string. 
- seq.output_text = seq.output_text[:stop_index] - return stop_str + return stop_str, stop_index return None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 6a09361c56865..e15395d75c91f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,11 +1,12 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, List, Mapping, Optional, Union +from typing import AsyncGenerator, List, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt +from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -60,8 +61,7 @@ def generate( async def beam_search( self, - prompt: Union[PromptType, List[int]], - model_config: ModelConfig, + prompt: PromptType, request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -73,14 +73,23 @@ async def beam_search( length_penalty = params.length_penalty include_stop_str_in_output = params.include_stop_str_in_output - tokenizer = await self.get_tokenizer() - input_preprocessor = InputPreprocessor(model_config, tokenizer) + preprocessor = await self.get_input_preprocessor() + tokenizer_group = preprocessor.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async() + + if is_explicit_encoder_decoder_prompt(prompt): + raise NotImplementedError + else: + processed_inputs = preprocessor._prompt_to_llm_inputs( + prompt, + request_id=request_id, + ) + + prompt_token_ids = processed_inputs["prompt_token_ids"] + prompt_text = processed_inputs.get("prompt") + multi_modal_data = processed_inputs.get("multi_modal_data") + mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs") - (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = input_preprocessor._extract_prompt_components( - prompt, - request_id=request_id, - ) tokenized_length = len(prompt_token_ids) sort_beams_key = create_sort_beams_key_function( @@ -211,6 +220,7 @@ async def abort(self, request_id: str) -> None: Args: request_id: The unique id of the request. """ + ... @abstractmethod async def get_model_config(self) -> ModelConfig: @@ -219,8 +229,13 @@ async def get_model_config(self) -> ModelConfig: @abstractmethod async def get_decoding_config(self) -> DecodingConfig: - ... """Get the decoding configuration of the vLLM engine.""" + ... + + @abstractmethod + async def get_input_preprocessor(self) -> InputPreprocessor: + """Get the input processor of the vLLM engine.""" + ... 
@abstractmethod async def get_tokenizer( diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index f3e80cab62a34..ea3c93f733038 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -66,7 +66,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt + output.text for output in request_output.outputs ] ret = {"text": text_outputs} - yield (json.dumps(ret) + "\0").encode("utf-8") + yield (json.dumps(ret) + "\n").encode("utf-8") if stream: return StreamingResponse(stream_results()) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ce36f20760f4c..3ca460c47c3bd 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -22,7 +22,6 @@ ChatCompletionToolMessageParam) # yapf: enable # pydantic needs the TypedDict from typing_extensions -from pydantic import ConfigDict from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast from typing_extensions import Required, TypeAlias, TypedDict @@ -31,7 +30,9 @@ from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import (async_get_and_parse_audio, async_get_and_parse_image, - get_and_parse_audio, get_and_parse_image) + async_get_and_parse_video, + get_and_parse_audio, get_and_parse_image, + get_and_parse_video) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import print_warning_once @@ -52,17 +53,24 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): """The type of the content part.""" -class CustomChatCompletionContentPartParam(TypedDict, total=False): - __pydantic_config__ = ConfigDict(extra="allow") # type: ignore +class VideoURL(TypedDict, total=False): + url: Required[str] + """ + Either a URL of the video or a data URL with base64 encoded video data. + """ + - type: Required[str] +class ChatCompletionContentPartVideoParam(TypedDict, total=False): + video_url: Required[VideoURL] + + type: Required[Literal["video_url"]] """The type of the content part.""" class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. - + Example: { "image_url": "https://example.com/image.jpg" } """ @@ -73,7 +81,7 @@ class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain audio_url. - + Example: { "audio_url": "https://example.com/audio.mp3" } """ audio_url: Required[str] @@ -82,12 +90,23 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): audio_url: Required[str] +class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain video_url.
+ + Example: + { + "video_url": "https://example.com/video.mp4" + } + """ + video_url: Required[str] + + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, - ChatCompletionContentPartRefusalParam, - CustomChatCompletionContentPartParam, + ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, CustomChatCompletionContentSimpleImageParam, - CustomChatCompletionContentSimpleAudioParam, str] + CustomChatCompletionContentSimpleAudioParam, + CustomChatCompletionContentSimpleVideoParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -156,6 +175,10 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._items: List[_T] = [] + @property + def model_config(self) -> ModelConfig: + return self._model_config + @staticmethod @lru_cache(maxsize=None) def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: @@ -183,7 +206,8 @@ def _placeholder_str(self, modality: ModalityStr, if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.image_token_index) - if model_type in ("chameleon", "internvl_chat", "NVLM_D"): + if model_type in ("chameleon", "internvl_chat", "NVLM_D", + "h2ovl_chat"): return "" if model_type == "mllama": return "<|image|>" @@ -191,6 +215,8 @@ def _placeholder_str(self, modality: ModalityStr, return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" + if model_type == "idefics3": + return "" raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": @@ -203,6 +229,9 @@ def _placeholder_str(self, modality: ModalityStr, elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" + if model_type.startswith("llava"): + return self._cached_token_str(self._tokenizer, + hf_config.video_token_index) raise TypeError(f"Unknown {modality} model type: {model_type}") else: raise TypeError(f"Unknown modality: {modality}") @@ -293,6 +322,10 @@ def parse_image(self, image_url: str) -> None: def parse_audio(self, audio_url: str) -> None: raise NotImplementedError + @abstractmethod + def parse_video(self, video_url: str) -> None: + raise NotImplementedError + class MultiModalContentParser(BaseMultiModalContentParser): @@ -302,7 +335,9 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: self._tracker = tracker def parse_image(self, image_url: str) -> None: - image = get_and_parse_image(image_url) + image = get_and_parse_image(image_url, + allowed_local_media_path=self._tracker. + _model_config.allowed_local_media_path) placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) @@ -313,6 +348,12 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio) self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: + video = get_and_parse_video(video_url) + + placeholder = self._tracker.add("video", video) + self._add_placeholder(placeholder) + class AsyncMultiModalContentParser(BaseMultiModalContentParser): @@ -322,7 +363,10 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: self._tracker = tracker def parse_image(self, image_url: str) -> None: - image_coro = async_get_and_parse_image(image_url) + image_coro = async_get_and_parse_image( + image_url, + allowed_local_media_path=self._tracker._model_config. 
+ allowed_local_media_path) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) @@ -333,6 +377,12 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: + video = async_get_and_parse_video(video_url) + + placeholder = self._tracker.add("video", video) + self._add_placeholder(placeholder) + def validate_chat_template(chat_template: Optional[Union[Path, str]]): """Raises if the provided chat template appears invalid.""" @@ -361,7 +411,7 @@ def load_chat_template( if chat_template is None: return None try: - with open(chat_template, "r") as f: + with open(chat_template) as f: resolved_chat_template = f.read() except OSError as e: if isinstance(chat_template, Path): @@ -413,6 +463,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _ImageParser = partial(cast, ChatCompletionContentPartImageParam) _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) +_VideoParser = partial(cast, ChatCompletionContentPartVideoParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} # Define a mapping from part types to their corresponding parsing functions. @@ -425,6 +476,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), "refusal": lambda part: _RefusalParser(part).get("refusal", ""), + "video_url": + lambda part: _VideoParser(part).get("video_url", {}).get("url", ""), } @@ -469,7 +522,10 @@ def _parse_chat_message_content_mm_part( audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) return "audio_url", audio_params.get("audio_url", "") - + if part.get("video_url") is not None: + video_params = cast(CustomChatCompletionContentSimpleVideoParam, + part) + return "video_url", video_params.get("video_url", "") # Raise an error if no 'type' or direct URL is found. 
raise ValueError("Missing 'type' field in multimodal part.") @@ -479,7 +535,7 @@ def _parse_chat_message_content_mm_part( VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", - "audio_url") + "audio_url", "video_url") def _parse_chat_message_content_parts( @@ -491,10 +547,13 @@ def _parse_chat_message_content_parts( content: List[Union[str, Dict[str, str]]] = [] mm_parser = mm_tracker.create_parser() - wrap_dicts = \ - mm_tracker._model_config.hf_config.model_type in \ - MODEL_KEEP_MULTI_MODAL_CONTENT or \ - (chat_template_text_format == "openai") + model_config = mm_tracker.model_config + + wrap_dicts = (chat_template_text_format == "openai" + or (model_config.task == "embedding" + and model_config.is_multimodal_model) + or (model_config.hf_config.model_type + in MODEL_KEEP_MULTI_MODAL_CONTENT)) for part in parts: parse_res = _parse_chat_message_content_part( @@ -536,7 +595,7 @@ def _parse_chat_message_content_part( # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) - # if part_type is text/refusal/image_url/audio_url but + # if part_type is text/refusal/image_url/audio_url/video_url but # content is empty, log a warning and skip if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: logger.warning( @@ -555,6 +614,10 @@ def _parse_chat_message_content_part( mm_parser.parse_audio(content) return {'type': 'audio'} if wrap_dicts else None + if part_type == "video_url": + mm_parser.parse_video(content) + return {'type': 'video'} if wrap_dicts else None + raise NotImplementedError(f"Unknown part type: {part_type}") diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3d62cb3598477..a15dbd1c45119 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,7 +1,7 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, +from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, Union, cast, overload) from tqdm import tqdm @@ -10,6 +10,7 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, apply_mistral_chat_template, @@ -31,11 +32,6 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of -if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine # type: ignore -else: - from vllm.engine.llm_engine import LLMEngine # type: ignore - logger = init_logger(__name__) @@ -58,6 +54,10 @@ class LLM: from the input. trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer. + allowed_local_media_path: Allowing API requests to read local images + or videos from directories specified by the server file system. + This is a security risk. Should only be enabled in trusted + environments. tensor_parallel_size: The number of GPUs to use for distributed execution with tensor parallelism. dtype: The data type for the model weights and activations. Currently, @@ -98,7 +98,10 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. 
- disable_custom_all_reduce: See ParallelConfig + disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` + disable_async_output_proc: Disable async output processing. + This may result in lower performance. + hf_overrides: Arguments to be forwarded to the HuggingFace config. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) @@ -139,6 +142,7 @@ def __init__( tokenizer_mode: str = "auto", skip_tokenizer_init: bool = False, trust_remote_code: bool = False, + allowed_local_media_path: str = "", tensor_parallel_size: int = 1, dtype: str = "auto", quantization: Optional[str] = None, @@ -152,6 +156,7 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, + hf_overrides: Optional[dict] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", @@ -179,6 +184,7 @@ def __init__( tokenizer_mode=tokenizer_mode, skip_tokenizer_init=skip_tokenizer_init, trust_remote_code=trust_remote_code, + allowed_local_media_path=allowed_local_media_path, tensor_parallel_size=tensor_parallel_size, dtype=dtype, quantization=quantization, @@ -192,6 +198,7 @@ def __init__( max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, disable_async_output_proc=disable_async_output_proc, + hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, pooling_type=pooling_type, pooling_norm=pooling_norm, @@ -200,10 +207,24 @@ def __init__( pooling_returned_token_ids=pooling_returned_token_ids, **kwargs, ) - self.llm_engine = LLMEngine.from_engine_args( + # Logic to switch between engines is done at runtime instead of import + # to avoid import order issues + self.engine_class = self.get_engine_class() + + # TODO(rob): enable mp by default (issue with fork vs spawn) + self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) + self.request_counter = Counter() + @staticmethod + def get_engine_class() -> Type[LLMEngine]: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + return V1LLMEngine # type: ignore + return LLMEngine + def get_tokenizer(self) -> AnyTokenizer: return self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer @@ -388,7 +409,7 @@ def generate( priority=priority) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, RequestOutput) + return self.engine_class.validate_outputs(outputs, RequestOutput) def beam_search( self, @@ -763,7 +784,8 @@ def encode( ) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput) + return self.engine_class.validate_outputs(outputs, + EmbeddingRequestOutput) def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 46c92e10b360c..6a24cdbc6a18f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -7,11 +7,12 @@ import signal import socket import tempfile +import uuid from argparse import Namespace from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Set +from typing import AsyncIterator, Optional, Set import uvloop from fastapi import APIRouter, FastAPI, Request @@ 
-25,7 +26,6 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient @@ -51,7 +51,7 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -60,6 +60,11 @@ from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION +if envs.VLLM_USE_V1: + from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore +else: + from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore + TIMEOUT_KEEP_ALIVE = 5 # seconds prometheus_multiproc_dir: tempfile.TemporaryDirectory @@ -125,7 +130,8 @@ async def build_async_engine_client_from_engine_args( # Fall back # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) - or disable_frontend_multiprocessing): + or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + engine_config = engine_args.create_engine_config() uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) @@ -142,6 +148,8 @@ async def build_async_engine_client_from_engine_args( None, build_engine) yield engine_client + if hasattr(engine_client, "shutdown"): + engine_client.shutdown() return # Otherwise, use the multiprocessing AsyncLLMEngine. @@ -171,39 +179,44 @@ async def build_async_engine_client_from_engine_args( # so we need to spawn a new process context = multiprocessing.get_context("spawn") + # The Process can raise an exception during startup, which may + # not actually result in an exitcode being reported. As a result + # we use a shared variable to communicate the information. + engine_alive = multiprocessing.Value('b', True, lock=False) engine_process = context.Process(target=run_mp_engine, args=(engine_args, UsageContext.OPENAI_API_SERVER, - ipc_path)) + ipc_path, engine_alive)) engine_process.start() engine_pid = engine_process.pid - assert engine_pid is not None, "Engine process failed to start" + assert engine_pid is not None, "Engine process failed to start." logger.info("Started engine process with PID %d", engine_pid) # Build RPCClient, which conforms to EngineClient Protocol. - # NOTE: Actually, this is not true yet. 
We still need to support - # embedding models via RPC (see TODO above) engine_config = engine_args.create_engine_config() - mp_engine_client = MQLLMEngineClient(ipc_path, engine_config, - engine_pid) - + build_client = partial(MQLLMEngineClient, ipc_path, engine_config, + engine_pid) + mq_engine_client = await asyncio.get_running_loop().run_in_executor( + None, build_client) try: while True: try: - await mp_engine_client.setup() + await mq_engine_client.setup() break except TimeoutError: - if not engine_process.is_alive(): + if (not engine_process.is_alive() + or not engine_alive.value): raise RuntimeError( - "Engine process failed to start") from None + "Engine process failed to start. See stack " + "trace for the root cause.") from None - yield mp_engine_client # type: ignore[misc] + yield mq_engine_client # type: ignore[misc] finally: # Ensure rpc server process was terminated engine_process.terminate() # Close all open connections to the backend - mp_engine_client.close() + mq_engine_client.close() # Wait for engine process to join engine_process.join(4) @@ -248,20 +261,25 @@ def mount_metrics(app: FastAPI): app.routes.append(metrics_route) -def chat(request: Request) -> OpenAIServingChat: +def base(request: Request) -> OpenAIServing: + # Reuse the existing instance + return tokenization(request) + + +def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat -def completion(request: Request) -> OpenAIServingCompletion: +def completion(request: Request) -> Optional[OpenAIServingCompletion]: return request.app.state.openai_serving_completion -def tokenization(request: Request) -> OpenAIServingTokenization: - return request.app.state.openai_serving_tokenization +def embedding(request: Request) -> Optional[OpenAIServingEmbedding]: + return request.app.state.openai_serving_embedding -def embedding(request: Request) -> OpenAIServingEmbedding: - return request.app.state.openai_serving_embedding +def tokenization(request: Request) -> OpenAIServingTokenization: + return request.app.state.openai_serving_tokenization def engine_client(request: Request) -> EngineClient: @@ -277,7 +295,9 @@ async def health(raw_request: Request) -> Response: @router.post("/tokenize") async def tokenize(request: TokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_tokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_tokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -289,7 +309,9 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): @router.post("/detokenize") async def detokenize(request: DetokenizeRequest, raw_request: Request): - generator = await tokenization(raw_request).create_detokenize(request) + handler = tokenization(raw_request) + + generator = await handler.create_detokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -301,7 +323,9 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - models = await completion(raw_request).show_available_models() + handler = base(raw_request) + + models = await handler.show_available_models() return JSONResponse(content=models.model_dump()) @@ -314,9 +338,12 @@ async def show_version(): @router.post("/v1/chat/completions") async def 
create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + handler = chat(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Chat Completions API") - generator = await chat(raw_request).create_chat_completion( - request, raw_request) + generator = await handler.create_chat_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), @@ -330,8 +357,12 @@ async def create_chat_completion(request: ChatCompletionRequest, @router.post("/v1/completions") async def create_completion(request: CompletionRequest, raw_request: Request): - generator = await completion(raw_request).create_completion( - request, raw_request) + handler = completion(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Completions API") + + generator = await handler.create_completion(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -343,8 +374,12 @@ async def create_completion(request: CompletionRequest, raw_request: Request): @router.post("/v1/embeddings") async def create_embedding(request: EmbeddingRequest, raw_request: Request): - generator = await embedding(raw_request).create_embedding( - request, raw_request) + handler = embedding(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Embeddings API") + + generator = await handler.create_embedding(request, raw_request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -382,30 +417,26 @@ async def stop_profile(raw_request: Request): @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - response = await chat(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) - - response = await completion(raw_request).unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + for route in [chat, completion, embedding]: + handler = route(raw_request) + if handler is not None: + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -452,6 +483,13 @@ 
async def authentication(request: Request, call_next): status_code=401) return await call_next(request) + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response + for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) @@ -501,7 +539,9 @@ def init_app_state( chat_template=args.chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, - tool_parser=args.tool_call_parser) + tool_parser=args.tool_call_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, + ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, @@ -510,13 +550,14 @@ def init_app_state( prompt_adapters=args.prompt_adapters, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + ) if model_config.task == "generate" else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, base_model_paths, request_logger=request_logger, - ) + chat_template=args.chat_template, + ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, @@ -544,7 +585,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # This avoids race conditions with ray. # see https://github.com/vllm-project/vllm/issues/8204 sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("", args.port)) + sock.bind((args.host or "", args.port)) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing @@ -568,13 +610,14 @@ def signal_handler(*_) -> None: ssl_certfile=args.ssl_certfile, ssl_ca_certs=args.ssl_ca_certs, ssl_cert_reqs=args.ssl_cert_reqs, - fd=sock.fileno(), **uvicorn_kwargs, ) # NB: Await server shutdown only after the backend context is exited await shutdown_task + sock.close() + if __name__ == "__main__": # NOTE(simon): diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a089985ac9758..eb08a89293370 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -190,7 +190,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help= "Enable auto tool choice for supported models. 
Use --tool-call-parser" - "to specify which parser to use") + " to specify which parser to use") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -228,6 +228,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 60fc5ac8d11d2..820aefd8800d9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -99,10 +99,15 @@ class ModelList(OpenAIBaseModel): data: List[ModelCard] = Field(default_factory=list) +class PromptTokenUsageInfo(OpenAIBaseModel): + cached_tokens: Optional[int] = None + + class UsageInfo(OpenAIBaseModel): prompt_tokens: int = 0 total_tokens: int = 0 completion_tokens: Optional[int] = 0 + prompt_tokens_details: Optional[PromptTokenUsageInfo] = None class RequestResponseMetadata(BaseModel): @@ -454,6 +459,12 @@ def check_tool_usage(cls, data): if "tool_choice" not in data and data.get("tools"): data["tool_choice"] = "auto" + # if "tool_choice" is "none" -- ignore tools if present + if "tool_choice" in data and data["tool_choice"] == "none": + # ensure that no tools are present + data.pop("tools", None) + return data + # if "tool_choice" is specified -- validation if "tool_choice" in data: @@ -467,8 +478,8 @@ def check_tool_usage(cls, data): if data["tool_choice"] != "auto" and not isinstance( data["tool_choice"], dict): raise ValueError( - "`tool_choice` must either be a named tool or \"auto\". " - "`tool_choice=\"none\" is not supported.") + "`tool_choice` must either be a named tool, \"auto\", " + "or \"none\".") # ensure that if "tool_choice" is specified as an object, # it matches a valid tool @@ -708,7 +719,7 @@ def validate_stream_options(cls, data): return data -class EmbeddingRequest(OpenAIBaseModel): +class EmbeddingCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/embeddings model: str @@ -720,10 +731,15 @@ class EmbeddingRequest(OpenAIBaseModel): # doc: begin-embedding-pooling-params additional_data: Optional[Any] = None - # doc: end-embedding-pooling-params # doc: begin-embedding-extra-params + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt."), + ) priority: int = Field( default=0, description=( @@ -737,6 +753,82 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class EmbeddingChatRequest(OpenAIBaseModel): + model: str + messages: List[ChatCompletionMessageParam] + + encoding_format: Literal["float", "base64"] = "float" + dimensions: Optional[int] = None + user: Optional[str] = None + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-chat-embedding-pooling-params + additional_data: Optional[Any] = None + # doc: end-chat-embedding-pooling-params + + # doc: begin-chat-embedding-extra-params + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. 
" + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. " + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. " + "Will be accessible by the chat template."), + ) + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + # doc: end-chat-embedding-extra-params + + @model_validator(mode="before") + @classmethod + def check_generation_prompt(cls, data): + if data.get("continue_final_message") and data.get( + "add_generation_prompt"): + raise ValueError("Cannot set both `continue_final_message` and " + "`add_generation_prompt` to True.") + return data + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +EmbeddingRequest = Union[EmbeddingCompletionRequest, EmbeddingChatRequest] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -799,7 +891,7 @@ class EmbeddingResponseData(OpenAIBaseModel): class EmbeddingResponse(OpenAIBaseModel): - id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") + id: str = Field(default_factory=lambda: f"embd-{random_uuid()}") object: str = "list" created: int = Field(default_factory=lambda: int(time.time())) model: str diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f5249a0c447b3..1b422a93263b2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -78,6 +78,11 @@ def parse_args(): help="Port number for the Prometheus metrics server " "(only needed if enable-metrics is set).", ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser.parse_args() @@ -120,7 +125,7 @@ async def read_file(path_or_url: str) -> str: session.get(path_or_url) as resp: return await resp.text() else: - with open(path_or_url, "r", encoding="utf-8") as f: + with open(path_or_url, encoding="utf-8") as f: return f.read() @@ -217,13 +222,15 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, - ) + 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, + ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, base_model_paths, request_logger=request_logger, - ) + chat_template=None, + ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -240,14 +247,31 @@ async def main(args): # Determine the type of request and run it. if request.url == "/v1/chat/completions": - response_futures.append( - run_request(openai_serving_chat.create_chat_completion, - request, tracker)) + handler_fn = (None if openai_serving_chat is None else + openai_serving_chat.create_chat_completion) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg= + "The model does not support Chat Completions API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() elif request.url == "/v1/embeddings": - response_futures.append( - run_request(openai_serving_embedding.create_embedding, request, - tracker)) + handler_fn = (None if openai_serving_embedding is None else + openai_serving_embedding.create_embedding) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Embeddings API", + )) + continue + + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 1f951d15a7a32..09edaf98f7d17 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,11 +10,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (ConversationMessage, - apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -22,21 +18,17 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, - DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata, - ToolCall, UsageInfo) + DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, + RequestResponseMetadata, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, - PromptAdapterPath, - TextTokensPrompt) + PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager -from vllm.inputs import TokensPrompt from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import iterate_with_cancellation @@ -57,7 +49,8 @@ def __init__(self, chat_template: Optional[str], return_tokens_as_token_ids: bool = False, enable_auto_tools: bool = False, - 
tool_parser: Optional[str] = None): + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False): super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -88,18 +81,20 @@ def __init__(self, f"tool_parser:'{tool_parser}' which has not " "been registered") from e + self.enable_prompt_tokens_details = enable_prompt_tokens_details + async def create_chat_completion( self, request: ChatCompletionRequest, raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], ChatCompletionResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Chat Completion API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/chat/create for the API specification. This API mimics the OpenAI - ChatCompletion API. - + Chat Completion API. """ error_check_ret = await self._check_model(request) if error_check_ret is not None: @@ -118,143 +113,105 @@ async def create_chat_completion( prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_config = self.model_config tokenizer = await self.engine_client.get_tokenizer(lora_request) - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) + tool_parser = self.tool_parser + + # validation for OpenAI tools + # tool_choice = "required" is not supported + if request.tool_choice == "required": + return self.create_error_response( + "tool_choice = \"required\" is not supported!") + + if (request.tool_choice == "auto" and + not (self.enable_auto_tools and tool_parser is not None) + and not isinstance(tokenizer, MistralTokenizer)): + # for hf tokenizers, "auto" tools requires + # --enable-auto-tool-choice and --tool-call-parser + return self.create_error_response( + "\"auto\" tool choice requires " + "--enable-auto-tool-choice and --tool-call-parser to be set" + ) tool_dicts = None if request.tools is None else [ tool.model_dump() for tool in request.tools ] - prompt: Union[str, List[int]] - is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) - if is_mistral_tokenizer: - prompt = apply_mistral_chat_template( - tokenizer, - messages=request.messages, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - else: - prompt = apply_hf_chat_template( - tokenizer, - conversation=conversation, - chat_template=request.chat_template or self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, - tools=tool_dicts, - documents=request.documents, - **(request.chat_template_kwargs or {}), - ) - except Exception as e: - logger.exception("Error in applying chat template from request") - return self.create_error_response(str(e)) - - try: - mm_data = await mm_data_future - except Exception as e: - logger.exception("Error in loading multi-modal data") + ( + conversation, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + tool_dicts=tool_dicts, + documents=request.documents, + chat_template_kwargs=request.chat_template_kwargs, + tool_parser=tool_parser, + 
truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) - # validation for OpenAI tools - # tool_choice = "required" is not supported - if request.tool_choice == "required": - return self.create_error_response( - "tool_choice = \"required\" is not supported!") - - if not is_mistral_tokenizer and request.tool_choice == "auto" and not ( - self.enable_auto_tools and self.tool_parser is not None): - # for hf tokenizers, "auto" tools requires - # --enable-auto-tool-choice and --tool-call-parser - return self.create_error_response( - "\"auto\" tool choice requires " - "--enable-auto-tool-choice and --tool-call-parser to be set") - - request_id = f"chat-{request.request_id}" + request_id = f"chatcmpl-{request.request_id}" request_metadata = RequestResponseMetadata(request_id=request_id) if raw_request: raw_request.state.request_metadata = request_metadata + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[RequestOutput, None]] = [] try: - if self.enable_auto_tools and self.tool_parser: - request = self.tool_parser(tokenizer).adjust_request( - request=request) - - if isinstance(prompt, str): - prompt_inputs = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - ) - else: - assert isinstance(prompt, list) and isinstance( - prompt[0], int - ), "Prompt has to be either a string or a list of token ids" - prompt_inputs = TextTokensPrompt( - prompt=tokenizer.decode(prompt), prompt_token_ids=prompt) - - assert prompt_inputs is not None - - sampling_params: Union[SamplingParams, BeamSearchParams] - default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) - if request.use_beam_search: - sampling_params = request.to_beam_search_params( - default_max_tokens) - else: - sampling_params = request.to_sampling_params( - default_max_tokens) - - self._log_inputs(request_id, - prompt_inputs, - params=sampling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) - - engine_inputs = TokensPrompt( - prompt_token_ids=prompt_inputs["prompt_token_ids"]) - if mm_data is not None: - engine_inputs["multi_modal_data"] = mm_data - - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled and raw_request: - trace_headers = extract_trace_headers(raw_request.headers) - if (not is_tracing_enabled and raw_request - and contains_trace_headers(raw_request.headers)): - log_tracing_disabled_warning() - - if isinstance(sampling_params, BeamSearchParams): - result_generator = self.engine_client.beam_search( - prompt=engine_inputs, - model_config=self.model_config, - request_id=request_id, - params=sampling_params, - ) - else: - result_generator = self.engine_client.generate( - engine_inputs, - sampling_params, - request_id, - lora_request=lora_request, - trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request, - priority=request.priority, - ) + for i, engine_prompt in enumerate(engine_prompts): + sampling_params: Union[SamplingParams, BeamSearchParams] + default_max_tokens = self.max_model_len - len( + engine_prompt["prompt_token_ids"]) + if request.use_beam_search: + sampling_params = request.to_beam_search_params( + default_max_tokens) + else: + sampling_params = 
request.to_sampling_params( + default_max_tokens) + + self._log_inputs(request_id, + request_prompts[i], + params=sampling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + if isinstance(sampling_params, BeamSearchParams): + generator = self.engine_client.beam_search( + prompt=engine_prompt, + request_id=request_id, + params=sampling_params, + ) + else: + generator = self.engine_client.generate( + engine_prompt, + sampling_params, + request_id, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=request.priority, + ) + + generators.append(generator) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + assert len(generators) == 1 + result_generator, = generators + if raw_request: result_generator = iterate_with_cancellation( result_generator, raw_request.is_disconnected) @@ -297,6 +254,7 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 + num_cached_tokens = None if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -350,6 +308,7 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). if first_iteration: + num_cached_tokens = res.num_cached_tokens # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) @@ -575,11 +534,13 @@ async def chat_completion_stream_generator( # is sent, send the usage if include_usage: completion_tokens = sum(previous_num_tokens) - final_usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) + final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + + completion_tokens) + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens) final_usage_chunk = ChatCompletionStreamResponse( id=request_id, @@ -626,6 +587,9 @@ async def chat_completion_full_generator( final_res = res except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) assert final_res is not None @@ -744,11 +708,13 @@ async def chat_completion_full_generator( num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) request_metadata.final_usage_info = usage diff --git a/vllm/entrypoints/openai/serving_completion.py 
b/vllm/entrypoints/openai/serving_completion.py index da521a6012530..936aae8f1c267 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -1,7 +1,6 @@ import asyncio import time -from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, List, - Optional) +from typing import AsyncGenerator, AsyncIterator, Dict, List, Optional from typing import Sequence as GenericSequence from typing import Tuple, Union, cast @@ -30,18 +29,11 @@ from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.tracing import (contains_trace_headers, extract_trace_headers, - log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import merge_async_iterators, random_uuid logger = init_logger(__name__) -TypeTokenIDs = List[int] -TypeTopLogProbs = List[Optional[Dict[int, float]]] -TypeCreateLogProbsFn = Callable[ - [TypeTokenIDs, TypeTopLogProbs, Optional[int], int], CompletionLogProbs] - class OpenAIServingCompletion(OpenAIServing): @@ -101,8 +93,6 @@ async def create_completion( if raw_request: raw_request.state.request_metadata = request_metadata - # Schedule the request and get the result generator. - generators: List[AsyncGenerator[RequestOutput, None]] = [] try: ( lora_request, @@ -111,19 +101,24 @@ async def create_completion( tokenizer = await self.engine_client.get_tokenizer(lora_request) - prompts = list( - self._tokenize_prompt_input_or_inputs( - request, - tokenizer, - request.prompt, - truncate_prompt_tokens=request.truncate_prompt_tokens, - add_special_tokens=request.add_special_tokens, - )) + request_prompts, engine_prompts = self._preprocess_completion( + request, + tokenizer, + request.prompt, + truncate_prompt_tokens=request.truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - for i, prompt_inputs in enumerate(prompts): + # Schedule the request and get the result generator. 
+ generators: List[AsyncGenerator[RequestOutput, None]] = [] + try: + for i, engine_prompt in enumerate(engine_prompts): sampling_params: Union[SamplingParams, BeamSearchParams] default_max_tokens = self.max_model_len - len( - prompt_inputs["prompt_token_ids"]) + engine_prompt["prompt_token_ids"]) if request.use_beam_search: sampling_params = request.to_beam_search_params( default_max_tokens) @@ -134,36 +129,23 @@ async def create_completion( request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=sampling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - is_tracing_enabled = (await - self.engine_client.is_tracing_enabled()) - trace_headers = None - if is_tracing_enabled: - trace_headers = extract_trace_headers(raw_request.headers) - if not is_tracing_enabled and contains_trace_headers( - raw_request.headers): - log_tracing_disabled_warning() + trace_headers = (await + self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( - prompt={ - "prompt_token_ids": - prompt_inputs["prompt_token_ids"] - }, - model_config=self.model_config, + prompt=engine_prompt, request_id=request_id, params=sampling_params, ) else: generator = self.engine_client.generate( - { - "prompt_token_ids": - prompt_inputs["prompt_token_ids"] - }, + engine_prompt, sampling_params, request_id_item, lora_request=lora_request, @@ -180,6 +162,8 @@ async def create_completion( result_generator = merge_async_iterators( *generators, is_cancelled=raw_request.is_disconnected) + num_prompts = len(engine_prompts) + # Similar to the OpenAI API, when n != best_of, we do not stream the # results. In addition, we do not stream the results when use # beam search. 
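# Illustration (not part of the diff): a toy sketch of the data shapes that
# _preprocess_completion (added later in this diff, in serving_engine.py)
# hands back to create_completion. The "tokenizer" here is a fake; only the
# shapes and the parallel indexing matter.
from typing import Dict, List, Tuple


def toy_preprocess_completion(
        prompts: List[str]) -> Tuple[List[Dict], List[Dict]]:
    # request_prompts keep the text plus its token ids (used for input
    # logging); engine_prompts carry only the token ids, which is what
    # engine_client.generate() is handed.
    request_prompts = [{
        "prompt": p,
        "prompt_token_ids": [ord(c) for c in p],
    } for p in prompts]
    engine_prompts = [{
        "prompt_token_ids": rp["prompt_token_ids"],
    } for rp in request_prompts]
    return request_prompts, engine_prompts


# Both lists share the index i that the server uses to build
# request_id_item = f"{request_id}-{i}".
request_prompts, engine_prompts = toy_preprocess_completion(["hi", "ok"])
assert len(request_prompts) == len(engine_prompts) == 2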
@@ -195,12 +179,12 @@ async def create_completion( request_id, created_time, model_name, - num_prompts=len(prompts), + num_prompts=num_prompts, tokenizer=tokenizer, request_metadata=request_metadata) # Non-streaming response - final_res_batch: List[Optional[RequestOutput]] = [None] * len(prompts) + final_res_batch: List[Optional[RequestOutput]] = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res @@ -212,7 +196,7 @@ async def create_completion( # We did not pass it into vLLM engine to avoid being redundant # with the inputs token IDs if final_res.prompt is None: - final_res.prompt = prompts[i]["prompt"] + final_res.prompt = request_prompts[i]["prompt"] final_res_batch_checked = cast(List[RequestOutput], final_res_batch) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 6c46aae2838f6..bbe7db8f13231 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -9,8 +9,10 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger -from vllm.entrypoints.openai.protocol import (EmbeddingRequest, +from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, + EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) @@ -21,8 +23,6 @@ logger = init_logger(__name__) -TypeTokenIDs = List[int] - def _get_embedding( output: EmbeddingOutput, @@ -76,6 +76,7 @@ def __init__( base_model_paths: List[BaseModelPath], *, request_logger: Optional[RequestLogger], + chat_template: Optional[str], ): super().__init__(engine_client=engine_client, model_config=model_config, @@ -83,21 +84,20 @@ def __init__( lora_modules=None, prompt_adapters=None, request_logger=request_logger) - self._enabled = self._check_embedding_mode( - model_config.task == "embedding") + + self.chat_template = load_chat_template(chat_template) async def create_embedding( self, request: EmbeddingRequest, raw_request: Optional[Request] = None, ) -> Union[EmbeddingResponse, ErrorResponse]: - """Completion API similar to OpenAI's API. + """ + Embedding API similar to OpenAI's API. See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. """ - if not self._enabled: - return self.create_error_response("Embedding API disabled") error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -122,8 +122,6 @@ async def create_embedding( "greater than max_model_len." " Please, select a smaller truncation size.") - # Schedule the request and get the result generator. 
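# Illustration (not part of the diff): with EmbeddingRequest now a Union of
# EmbeddingCompletionRequest and EmbeddingChatRequest (see protocol.py above),
# create_embedding branches on the request type a few lines below. Two sketch
# payloads for /v1/embeddings, with a made-up model name and text:
completion_style_embedding = {
    "model": "my-embedding-model",
    "input": "The quick brown fox",
    "encoding_format": "float",
}
chat_style_embedding = {
    "model": "my-embedding-model",
    "messages": [{"role": "user", "content": "The quick brown fox"}],
    # extra knobs defined on EmbeddingChatRequest:
    "add_generation_prompt": True,
    "add_special_tokens": False,
}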
- generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] try: ( lora_request, @@ -132,32 +130,60 @@ async def create_embedding( tokenizer = await self.engine_client.get_tokenizer(lora_request) - pooling_params = request.to_pooling_params() + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for embedding models") + + if isinstance(request, EmbeddingChatRequest): + ( + _, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, + tokenizer, + request.messages, + chat_template=request.chat_template or self.chat_template, + add_generation_prompt=request.add_generation_prompt, + continue_final_message=request.continue_final_message, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + else: + request_prompts, engine_prompts = self._preprocess_completion( + request, + tokenizer, + request.input, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=request.add_special_tokens, + ) + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - prompts = list( - self._tokenize_prompt_input_or_inputs(request, tokenizer, - request.input, - truncate_prompt_tokens)) + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[EmbeddingRequestOutput, None]] = [] + try: + pooling_params = request.to_pooling_params() - for i, prompt_inputs in enumerate(prompts): + for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" self._log_inputs(request_id_item, - prompt_inputs, + request_prompts[i], params=pooling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError( - "Prompt adapter is not supported " - "for embedding models") + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) generator = self.engine_client.encode( - {"prompt_token_ids": prompt_inputs["prompt_token_ids"]}, + engine_prompt, pooling_params, request_id_item, lora_request=lora_request, + trace_headers=trace_headers, priority=request.priority, ) @@ -171,15 +197,16 @@ async def create_embedding( is_cancelled=raw_request.is_disconnected if raw_request else None, ) + num_prompts = len(engine_prompts) + # Non-streaming response final_res_batch: List[Optional[EmbeddingRequestOutput]] - final_res_batch = [None] * len(prompts) + final_res_batch = [None] * num_prompts try: async for i, res in result_generator: final_res_batch[i] = res - for final_res in final_res_batch: - assert final_res is not None + assert all(final_res is not None for final_res in final_res_batch) final_res_batch_checked = cast(List[EmbeddingRequestOutput], final_res_batch) @@ -194,11 +221,3 @@ async def create_embedding( return self.create_error_response(str(e)) return response - - def _check_embedding_mode(self, embedding_mode: bool) -> bool: - if not embedding_mode: - logger.warning( - "embedding_mode is False. 
Embedding API will not work.") - else: - logger.info("Activating the server engine with embedding enabled.") - return embedding_mode diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 22a01b3dc4cc0..fa315fa516632 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -2,28 +2,38 @@ import pathlib from dataclasses import dataclass from http import HTTPStatus -from typing import Iterable, Iterator, List, Optional, Tuple, TypedDict, Union +from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, + Optional, Sequence, Tuple, TypedDict, Union) from pydantic import Field +from starlette.datastructures import Headers from typing_extensions import Annotated from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ConversationMessage, + apply_hf_chat_template, + apply_mistral_chat_template, + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, ErrorResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, + ErrorResponse, LoadLoraAdapterRequest, ModelCard, ModelList, ModelPermission, TokenizeChatRequest, TokenizeCompletionRequest, - TokenizeRequest, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable +from vllm.inputs import TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -31,8 +41,10 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob -from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import AtomicCounter +from vllm.tracing import (contains_trace_headers, extract_trace_headers, + log_tracing_disabled_warning) +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import AtomicCounter, is_list_of logger = init_logger(__name__) @@ -56,8 +68,14 @@ class LoRAModulePath: base_model_name: Optional[str] = None -AnyRequest = Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, - EmbeddingRequest, TokenizeRequest] +CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, + EmbeddingCompletionRequest, + TokenizeCompletionRequest] + +ChatLikeRequest = Union[ChatCompletionRequest, EmbeddingChatRequest, + TokenizeChatRequest] + +AnyRequest = Union[CompletionLikeRequest, ChatLikeRequest] class TextTokensPrompt(TypedDict): @@ -65,6 +83,9 @@ class TextTokensPrompt(TypedDict): prompt_token_ids: List[int] +RequestPrompt = Union[List[int], str, TextTokensPrompt] + + class OpenAIServing: def __init__( @@ -246,7 +267,8 @@ def _validate_input( token_num = len(input_ids) # Note: EmbeddingRequest doesn't have max_tokens - if isinstance(request, EmbeddingRequest): + if isinstance(request, + (EmbeddingChatRequest, EmbeddingCompletionRequest)): if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " @@ -373,10 +395,121 @@ def _tokenize_prompt_input_or_inputs( truncate_prompt_tokens=truncate_prompt_tokens, ) + def _preprocess_completion( + self, + request: CompletionLikeRequest, + tokenizer: AnyTokenizer, + 
input_or_inputs: Union[str, List[str], List[int], List[List[int]]], + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = True, + ) -> Tuple[Sequence[TextTokensPrompt], List[TokensPrompt]]: + request_prompts = [ + request_prompt + for request_prompt in self._tokenize_prompt_input_or_inputs( + request, + tokenizer, + input_or_inputs, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + ] + + engine_prompts = [ + TokensPrompt(prompt_token_ids=request_prompt["prompt_token_ids"]) + for request_prompt in request_prompts + ] + + return request_prompts, engine_prompts + + async def _preprocess_chat( + self, + request: ChatLikeRequest, + tokenizer: AnyTokenizer, + messages: List[ChatCompletionMessageParam], + chat_template: Optional[str] = None, + add_generation_prompt: bool = True, + continue_final_message: bool = False, + tool_dicts: Optional[List[Dict[str, Any]]] = None, + documents: Optional[List[Dict[str, str]]] = None, + chat_template_kwargs: Optional[Dict[str, Any]] = None, + tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None, + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, + add_special_tokens: bool = False, + ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], + List[TokensPrompt]]: + conversation, mm_data_future = parse_chat_messages_futures( + messages, + self.model_config, + tokenizer, + ) + + _chat_template_kwargs: Dict[str, Any] = dict( + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + ) + _chat_template_kwargs.update(chat_template_kwargs or {}) + + request_prompt: Union[str, List[int]] + is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) + if is_mistral_tokenizer: + request_prompt = apply_mistral_chat_template( + tokenizer, + messages=messages, + **_chat_template_kwargs, + ) + else: + request_prompt = apply_hf_chat_template( + tokenizer, + conversation=conversation, + **_chat_template_kwargs, + ) + + mm_data = await mm_data_future + + # tool parsing is done only if a tool_parser has been set and if + # tool_choice is not "none" (if tool_choice is "none" but a tool_parser + # is set, we want to prevent parsing a tool_call hallucinated by the LLM + should_parse_tools = tool_parser is not None and (hasattr( + request, "tool_choice") and request.tool_choice != "none") + + if should_parse_tools: + if not isinstance(request, ChatCompletionRequest): + msg = "Tool usage is only supported for Chat Completions API" + raise NotImplementedError(msg) + + request = tool_parser(tokenizer).adjust_request( # type: ignore + request=request) + + if isinstance(request_prompt, str): + prompt_inputs = self._tokenize_prompt_input( + request, + tokenizer, + request_prompt, + truncate_prompt_tokens=truncate_prompt_tokens, + add_special_tokens=add_special_tokens, + ) + else: + # For MistralTokenizer + assert is_list_of(request_prompt, int), ( + "Prompt has to be either a string or a list of token ids") + prompt_inputs = TextTokensPrompt( + prompt=tokenizer.decode(request_prompt), + prompt_token_ids=request_prompt) + + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["prompt_token_ids"]) + if mm_data is not None: + engine_prompt["multi_modal_data"] = mm_data + + return conversation, [request_prompt], [engine_prompt] + def _log_inputs( self, request_id: str, - inputs: Union[str, List[int], TextTokensPrompt], + inputs: 
RequestPrompt, params: Optional[Union[SamplingParams, PoolingParams, BeamSearchParams]], lora_request: Optional[LoRARequest], @@ -404,6 +537,20 @@ def _log_inputs( prompt_adapter_request=prompt_adapter_request, ) + async def _get_trace_headers( + self, + headers: Headers, + ) -> Optional[Mapping[str, str]]: + is_tracing_enabled = await self.engine_client.is_tracing_enabled() + + if is_tracing_enabled: + return extract_trace_headers(headers) + + if contains_trace_headers(headers): + log_tracing_disabled_warning() + + return None + @staticmethod def _get_decoded_token(logprob: Logprob, token_id: int, diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index a269c94c7ec0d..1fd82304f7a4d 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -2,10 +2,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import (apply_hf_chat_template, - apply_mistral_chat_template, - load_chat_template, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -20,7 +17,6 @@ LoRAModulePath, OpenAIServing) from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import MistralTokenizer from vllm.utils import random_uuid logger = init_logger(__name__) @@ -62,59 +58,51 @@ async def create_tokenize( request_id = f"tokn-{random_uuid()}" - ( - lora_request, - prompt_adapter_request, - ) = self._maybe_get_adapters(request) - - tokenizer = await self.engine_client.get_tokenizer(lora_request) - - prompt: Union[str, List[int]] - if isinstance(request, TokenizeChatRequest): - model_config = self.model_config - - conversation, mm_data_future = parse_chat_messages_futures( - request.messages, model_config, tokenizer) - - mm_data = await mm_data_future - if mm_data: - logger.warning( - "Multi-modal inputs are ignored during tokenization") - - if isinstance(tokenizer, MistralTokenizer): - prompt = apply_mistral_chat_template( + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if isinstance(request, TokenizeChatRequest): + ( + _, + request_prompts, + engine_prompts, + ) = await self._preprocess_chat( + request, tokenizer, - messages=request.messages, + request.messages, chat_template=self.chat_template, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + add_special_tokens=request.add_special_tokens, ) else: - prompt = apply_hf_chat_template( + request_prompts, engine_prompts = self._preprocess_completion( + request, tokenizer, - conversation=conversation, - chat_template=self.chat_template, - add_generation_prompt=request.add_generation_prompt, - continue_final_message=request.continue_final_message, + request.prompt, + add_special_tokens=request.add_special_tokens, ) - else: - prompt = request.prompt + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) - self._log_inputs(request_id, - prompt, - params=None, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request) + input_ids: List[int] = [] + for i, engine_prompt in enumerate(engine_prompts): + self._log_inputs(request_id, + request_prompts[i], + params=None, + 
lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) - # Silently ignore prompt adapter since it does not affect tokenization + # Silently ignore prompt adapter since it does not affect + # tokenization (Unlike in Embeddings API where an error is raised) - prompt_input = self._tokenize_prompt_input( - request, - tokenizer, - prompt, - add_special_tokens=request.add_special_tokens, - ) - input_ids = prompt_input["prompt_token_ids"] + input_ids.extend(engine_prompt["prompt_token_ids"]) return TokenizeResponse(tokens=input_ids, count=len(input_ids), @@ -143,9 +131,8 @@ async def create_detokenize( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - if prompt_adapter_request is not None: - raise NotImplementedError("Prompt adapter is not supported " - "for tokenization") + # Silently ignore prompt adapter since it does not affect tokenization + # (Unlike in Embeddings API where an error is raised) prompt_input = self._tokenize_prompt_input( request, diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 1b299ce655570..2187862e8380b 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,5 +1,6 @@ from .abstract_tool_parser import ToolParser, ToolParserManager from .granite_20b_fc_tool_parser import Granite20bFCToolParser +from .granite_tool_parser import GraniteToolParser from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser from .jamba_tool_parser import JambaToolParser @@ -8,6 +9,6 @@ __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", - "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", - "Llama3JsonToolParser", "JambaToolParser" + "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", + "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py new file mode 100644 index 0000000000000..b5854ca39ab47 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -0,0 +1,215 @@ +import json +from typing import Dict, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.utils import (consume_space, + find_common_prefix, + is_complete_json, + partial_json_loads) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("granite") +class GraniteToolParser(ToolParser): + """ + Tool call parser for the granite 3.0 models. Intended + for use with the examples/tool_chat_template_granite.jinja + template. 
+ + Used when --enable-auto-tool-choice --tool-call-parser granite + are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + stripped = model_output.strip() + if not stripped or stripped[0] != '[': + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + try: + raw_function_calls = json.loads(stripped) + if not isinstance(raw_function_calls, list): + raise Exception( + f"Expected dict or list, got {type(raw_function_calls)}") + + logger.debug("Extracted %d tool calls", len(raw_function_calls)) + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]), + ), + ) for function_call in raw_function_calls + ] + + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=None, + ) + + except Exception as e: + logger.error("Error in extracting tool call from response %s", e) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + start_idx = consume_space(0, current_text) + if not current_text or current_text[start_idx] != '[': + return DeltaMessage(content=delta_text) + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + tool_call_arr = None + is_complete = None + try: + tool_calls, end_idx = partial_json_loads( + current_text[start_idx:], flags) + if type(tool_calls) is list: + tool_call_arr = tool_calls + else: + return DeltaMessage(content=delta_text) + + is_complete = [True] * len(tool_calls) + if not is_complete_json( + current_text[start_idx:start_idx + end_idx]): + is_complete[-1] = False + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if not tool_call_arr: + return None + + # select as the current tool call the one we're on the state at + current_tool_call: Dict = tool_call_arr[self.current_tool_id] + + delta = None + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + if len(tool_call_arr) > self.current_tool_id + 1: + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
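# Illustration (not part of the diff): this parser assumes the model writes
# its calls as a JSON array, e.g. (made-up function name)
#   [{"name": "get_weather", "arguments": {"city": "Boston"}}]
# While that array is still being generated, only the not-yet-sent suffix of
# the serialized arguments is emitted: if the full arguments serialize to
# '{"city": "Boston"}' and 10 characters were already streamed, the next
# delta carries only the remaining slice, mirroring the cur_args_json[sent:]
# logic used below.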
+ if self.current_tool_id >= 0: + cur_arguments = current_tool_call.get("arguments") + if cur_arguments: + cur_args_json = json.dumps(cur_arguments) + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:] + + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + + # now we know we're on the same tool call and we're streaming + # arguments + else: + cur_arguments = current_tool_call.get("arguments") + + if cur_arguments: + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + cur_args_json = json.dumps(cur_arguments) + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + + argument_diff = None + if is_complete[self.current_tool_id]: + argument_diff = cur_args_json[sent:] + elif prev_arguments: + prev_args_json = json.dumps(prev_arguments) + if cur_args_json != prev_args_json: + prefix = find_common_prefix( + prev_args_json, cur_args_json) + argument_diff = prefix[sent:] + + if argument_diff is not None: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/envs.py b/vllm/envs.py index b4a263d1e086e..f320e35971f94 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -49,7 +49,8 @@ VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 - VLLM_AUDIO_FETCH_TIMEOUT: int = 5 + VLLM_VIDEO_FETCH_TIMEOUT: int = 15 + VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -67,9 +68,11 @@ VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_TORCH_COMPILE_LEVEL: int = 0 + VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False + VLLM_ENABLE_V1_MULTIPROCESSING: bool = False def get_default_cache_root(): @@ -225,6 +228,7 @@ def get_default_config_root(): # and disabled when running with Inductor (compile_level >= Inductor). 
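# Illustration (not part of the diff): each entry in this dict
# (environment_variables in vllm/envs.py) maps a variable name to a
# zero-argument lambda, so values are read lazily on every access. For the
# video timeout added further below that means, roughly:
#   os.environ["VLLM_VIDEO_FETCH_TIMEOUT"] = "30"
#   environment_variables["VLLM_VIDEO_FETCH_TIMEOUT"]()  # -> 30 (as an int)
# with 15 used as the default when the variable is unset.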
"VLLM_CUSTOM_OPS": lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","), + # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": @@ -376,10 +380,15 @@ def get_default_config_root(): "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), + # Timeout for fetching videos when serving multimodal models + # Default is 15 seconds + "VLLM_VIDEO_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")), + # Timeout for fetching audio when serving multimodal models - # Default is 5 seconds + # Default is 10 seconds "VLLM_AUDIO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "5")), + lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. @@ -465,6 +474,10 @@ def get_default_config_root(): # If set, use the V1 code path. "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + + # If set, enable multiprocessing in LLM for the V1 code path. + "VLLM_ENABLE_V1_MULTIPROCESSING": + lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), } # end-env-vars-definition diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index e32993e0e452e..4ceb5a837dd7f 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,8 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import torch - import vllm.envs as envs from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) @@ -138,18 +136,11 @@ def _create_worker( assert self.distributed_init_method is not None kwargs = dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=self.distributed_init_method, - lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=rank == 0, ) wrapper.init_worker(**kwargs) @@ -323,9 +314,6 @@ async def check_health_async(self) -> None: def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype == torch.float16: - logger.warning("float16 is not supported on CPU, casting to bfloat16.") - config.dtype = torch.bfloat16 # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid if not config.enforce_eager: diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index c96cb0f2c2981..9cba189dd57f9 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -1,10 +1,7 @@ from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import VllmConfig from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest @@ -23,27 +20,19 @@ class ExecutorBase(ABC): def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: 
SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - observability_config: Optional[ObservabilityConfig], + vllm_config: VllmConfig, ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config self._init_executor() @abstractmethod diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index ed30d3186a453..c65d0836e5ff7 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -49,21 +49,12 @@ def _get_worker_kwargs( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - speculative_config=self.speculative_config, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=(not self.parallel_config) or (rank % self.parallel_config.tensor_parallel_size == 0), - observability_config=self.observability_config, ) def _get_worker_module_and_class( diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py new file mode 100644 index 0000000000000..220e9eee87bb3 --- /dev/null +++ b/vllm/executor/hpu_executor.py @@ -0,0 +1,205 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import contextlib +import os +from typing import Any, Dict, List, Optional, Set, Tuple + +from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import ExecuteModelRequest +from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, + make_async) +from vllm.worker.worker_base import WorkerWrapperBase + +logger = init_logger(__name__) + + +class HPUExecutor(ExecutorBase): + + uses_ray: bool = False + + def _init_executor(self) -> None: + """Initialize the worker and load the model.""" + self._init_worker() + + def _get_worker_kwargs( + self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None) -> Dict[str, Any]: + """Return worker init args for a given rank.""" + if distributed_init_method is None: + distributed_init_method = get_distributed_init_method( + get_ip(), get_open_port()) + return dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=rank == 0, + ) + + def _create_worker(self, + local_rank: int = 0, + rank: int = 0, + distributed_init_method: Optional[str] = None): + wrapper = WorkerWrapperBase( + worker_module_name="vllm.worker.hpu_worker", + worker_class_name="HPUWorker", + ) + wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank, + distributed_init_method)) + return wrapper.worker + + def _init_worker(self): + assert self.parallel_config.world_size == 1, ( + "GPUExecutor only supports single GPU.") + + self.driver_worker = self._create_worker() + self.driver_worker.init_device() + self.driver_worker.load_model() + + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available KV blocks by invoking the + underlying worker. + """ + return self.driver_worker.determine_num_available_blocks() + + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: + """Initialize the KV cache by invoking the underlying worker. + """ + # NOTE: This is logged in the executor because there can be >1 worker + # with other executors. We could log in the engine level, but work + # remains to abstract away the device for non-GPU configurations. + logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, + num_cpu_blocks) + from vllm_hpu_extension.profiler import HabanaMemoryProfiler + with HabanaMemoryProfiler() as cache_init_m: + self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + msg = f"init_cache_engine took {cache_init_m.get_summary_string()}" + logger.info(msg) + + def finish_measurements(self): + self.driver_worker.finish_measurements() + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! 
# noqa:E501 + # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS - will log cpu fallbacks per engine step, only when there was any # noqa:E501 + # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501 + log_graph_compilation_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0' + log_graph_compilation = os.environ.get( + 'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION', + '0') != '0' or log_graph_compilation_all + log_cpu_fallbacks_all = os.environ.get( + 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' + log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', + '0') != '0' or log_cpu_fallbacks_all + if log_graph_compilation or log_cpu_fallbacks: + from habana_frameworks.torch.hpu.metrics import metric_localcontext + seq_group_metadata_list = execute_model_req.seq_group_metadata_list + is_prompt = any([ + seq_group_metadata.is_prompt + for seq_group_metadata in seq_group_metadata_list + ]) + max_context_len = max([ + max([ + len(v.prompt_token_ids) + len(v.output_token_ids) + for v in seq_group_metadata.seq_data.values() + ]) for seq_group_metadata in seq_group_metadata_list + ]) # whoa, that's some spicy stuff right here + max_num_blocks = ( + (max_context_len - 1) // self.cache_config.block_size) + 1 + input_stats = (f'is_prompt: {is_prompt}, ' + f'num_seqs: {len(seq_group_metadata_list)}, ' + f'max_context_len: {max_context_len}, ' + f'max_num_blocks {max_num_blocks}') + gc_ctx = metric_localcontext( + "graph_compilation" + ) if log_graph_compilation else contextlib.nullcontext() + cpu_fallback_ctx = metric_localcontext( + "cpu_fallback" + ) if log_cpu_fallbacks else contextlib.nullcontext() + with gc_ctx as gc_local_metric, \ + cpu_fallback_ctx as cpu_fallback_local_metric: + output = self.driver_worker.execute_model(execute_model_req) + if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 + ) or log_graph_compilation_all: + msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " + f"{gc_local_metric.stats()}, {input_stats}") + logger.warning(msg) + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > + 0) or log_cpu_fallbacks_all: + msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " + f"{cpu_fallback_local_metric.stats()}, {input_stats}") + logger.warning(msg) + + return output + + output = self.driver_worker.execute_model(execute_model_req) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + assert lora_request.lora_int_id > 0, "lora_id must be greater than 0." + return self.driver_worker.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." + return self.driver_worker.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + assert lora_id > 0, "lora_id must be greater than 0." 
+ return self.driver_worker.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.driver_worker.list_loras() + + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def check_health(self) -> None: + # GPUExecutor will always be healthy as long as + # it's running. + return + + def start_profile(self) -> None: + self.driver_worker.start_profile() + + def stop_profile(self) -> None: + self.driver_worker.stop_profile() + + def shutdown(self) -> None: + self.driver_worker.shutdown_inc() + + +class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase): + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest, + ) -> List[SamplerOutput]: + output = await make_async(self.driver_worker.execute_model + )(execute_model_req=execute_model_req, ) + return output diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 2dbde778e49b1..3eb14fb931925 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -13,12 +13,15 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.triton_utils import maybe_set_triton_cache_manager +from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, cuda_is_initialized, get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async, update_environment_variables) +if HAS_TRITON: + from vllm.triton_utils import maybe_set_triton_cache_manager + logger = init_logger(__name__) @@ -59,7 +62,7 @@ def _init_executor(self) -> None: torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 - if world_size > 1: + if HAS_TRITON and world_size > 1: maybe_set_triton_cache_manager() # Multiprocessing-based executor does not support multi-node setting. 
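For context on the `HAS_TRITON` guard introduced in the multiproc executor hunk above: a standalone sketch of the optional-dependency pattern, using only the standard library. `maybe_tune_cache` is a hypothetical stand-in for `maybe_set_triton_cache_manager`; the sketch demonstrates the guard itself, not vLLM's actual detection logic.

```python
# Minimal sketch (not part of the patch): probe for the package once and
# only touch its helpers when it is importable.
import importlib.util

HAS_TRITON = importlib.util.find_spec("triton") is not None

if HAS_TRITON:
    def maybe_tune_cache() -> None:
        # A per-worker Triton cache manager would be configured here.
        print("triton available: tuning cache manager")


def init_workers(world_size: int) -> None:
    # Mirrors the guarded call site: skip Triton work when the package is
    # missing or there is only a single worker.
    if HAS_TRITON and world_size > 1:
        maybe_tune_cache()


init_workers(world_size=2)
```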
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py index f2fcfa58b26e1..02d37cd7fbf23 100644 --- a/vllm/executor/neuron_executor.py +++ b/vllm/executor/neuron_executor.py @@ -29,11 +29,7 @@ def _init_worker(self): distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) self.driver_worker = NeuronWorker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, + vllm_config=self.vllm_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method) diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index d0c0333854dae..d06b0ccb7906e 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -48,16 +48,10 @@ def _init_worker(self): get_ip(), get_open_port()) self.driver_worker = OpenVINOWorker( ov_core=self.ov_core, - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=0, rank=0, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 9433dce842b09..66bab2c686c67 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -32,7 +32,7 @@ class RayGPUExecutor(DistributedGPUExecutor): uses_ray: bool = True def _init_executor(self) -> None: - self.forward_dag: Optional["ray.dag.CompiledDAG"] = None + self.forward_dag: Optional[ray.dag.CompiledDAG] = None # If the env var is set, it uses the Ray's compiled DAG API # which optimizes the control plane overhead. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py new file mode 100644 index 0000000000000..a24bab6df370e --- /dev/null +++ b/vllm/executor/ray_hpu_executor.py @@ -0,0 +1,554 @@ +import asyncio +import os +from collections import defaultdict +from itertools import islice, repeat +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, + Type) + +import msgspec + +import vllm.envs as envs +from vllm.executor.distributed_gpu_executor import ( # yapf: disable + DistributedGPUExecutor, DistributedGPUExecutorAsync) +from vllm.executor.msgspec_utils import encode_hook +from vllm.executor.ray_utils import RayWorkerWrapper, ray +from vllm.logger import init_logger +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.sequence import ExecuteModelRequest +from vllm.utils import (_run_task_with_lock, get_distributed_init_method, + get_ip, get_open_port, get_vllm_instance_id, + make_async) +from vllm.worker.worker_base import WorkerBase + +if ray is not None: + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + +logger = init_logger(__name__) + + +class RayHPUExecutor(DistributedGPUExecutor): + + uses_ray: bool = True + + def _init_executor(self) -> None: + self.forward_dag: Optional[ray.dag.CompiledDAG] = None + # If the env var is set, it uses the Ray's compiled DAG API + # which optimizes the control plane overhead. 
+ # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. + # Currently, this requires USE_RAY_SPMD_WORKER=True. + self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG + # If the env var is set, then we do not distinguish between the + # "driver worker" vs other workers. Also, the rank 0 worker will + # be executed in a remote Ray worker. Currently this requires + # USE_RAY_COMPILED_DAG=True. + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if self.use_ray_compiled_dag: + assert self.use_ray_spmd_worker, ( + "VLLM_USE_RAY_COMPILED_DAG=1 requires " + "VLLM_USE_RAY_SPMD_WORKER=1") + if self.use_ray_spmd_worker: + # TODO: Support SPMD worker for non-DAG Ray executor. + assert self.use_ray_compiled_dag, ( + "VLLM_USE_RAY_SPMD_WORKER=1 requires " + "VLLM_USE_RAY_COMPILED_DAG=1") + + assert self.uses_ray + placement_group = self.parallel_config.placement_group + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) + self.output_decoder = msgspec.msgpack.Decoder( + Optional[List[SamplerOutput]]) + + def shutdown(self) -> None: + if hasattr(self, "forward_dag") and self.forward_dag is not None: + self.forward_dag.teardown() + import ray + for worker in self.workers: + ray.kill(worker) + self.forward_dag = None + + def finish_measurements(self): + self._run_workers("finish_measurements") + + def _get_worker_module_and_class( + self + ) -> Tuple[str, str, Optional[Callable[[], + Type[WorkerBase]]]]: # noqa: F821 + worker_class_fn = None + if self.scheduler_config.is_multi_step: + raise NotImplementedError( + "Multi-step execution is not implemented for HPU") + elif self.speculative_config: + raise NotImplementedError( + "Speculative decoding is not implemented for HPU") + else: + worker_module_name = "vllm.worker.hpu_worker" + worker_class_name = "HPUWorker" + return (worker_module_name, worker_class_name, worker_class_fn) + + def _get_worker_wrapper_args(self) -> Dict[str, Any]: + (worker_module_name, worker_class_name, + worker_class_fn) = self._get_worker_module_and_class() + + return dict( + worker_module_name=worker_module_name, + worker_class_name=worker_class_name, + worker_class_fn=worker_class_fn, + trust_remote_code=self.model_config.trust_remote_code, + ) + + def _init_workers_ray(self, placement_group: "PlacementGroup", + **ray_remote_kwargs): + # Otherwise, the ray workers are allocated with a full GPU. + num_gpus = 1 + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Used in ray compiled DAG: indexed first by PP rank, + # and then TP rank. In other words, the inner list is + # the TP group of workers for a PP rank. + self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] + + logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) + + # Create the workers. 
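The executor above serializes requests with msgspec before handing them to the compiled Ray DAG. A self-contained round-trip sketch of that encode/decode pairing (requires the `msgspec` package; `FakeRequest` is an illustrative stand-in for `ExecuteModelRequest`, and the custom `encode_hook` used in the patch is omitted):

```python
from typing import List, Optional

import msgspec


class FakeRequest(msgspec.Struct):
    request_id: str
    token_ids: List[int]


# Mirrors input_encoder / output_decoder above, minus the enc_hook.
encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Optional[FakeRequest])

payload = encoder.encode(FakeRequest(request_id="r0", token_ids=[1, 2, 3]))
assert decoder.decode(payload) == FakeRequest(request_id="r0",
                                              token_ids=[1, 2, 3])
```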
+ driver_ip = get_ip() + worker_wrapper_kwargs = self._get_worker_wrapper_args() + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if not bundle.get("HPU", 0): + continue + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={'HPU': num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(**worker_wrapper_kwargs) + + if self.use_ray_spmd_worker: + self.workers.append(worker) + else: + worker_ip = ray.get(worker.get_node_ip.remote()) + if worker_ip == driver_ip and self.driver_dummy_worker is None: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. + self.driver_dummy_worker = worker + self.driver_worker = RayWorkerWrapper( + **worker_wrapper_kwargs) + else: + # Else, added to the list of workers. + self.workers.append(worker) + + logger.debug("workers: %s", self.workers) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + worker_ips = [ + ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] + for worker in self.workers + ] + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + def sort_by_driver_then_worker_ip(worker): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. + """ + ip = ray.get(worker.get_node_ip.remote()) + return (ip != driver_ip, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", + use_dummy_driver=True) + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + all_ips = set(worker_ips + [driver_ip]) + n_ips = len(all_ips) + n_nodes = len(node_workers) + + if n_nodes != n_ips: + raise RuntimeError( + f"Every node should have a unique IP address. Got {n_nodes}" + f" nodes with node ids {list(node_workers.keys())} and " + f"{n_ips} unique IP addresses {all_ips}. Please check your" + " network configuration. 
If you set `VLLM_HOST_IP` or " + "`HOST_IP` environment variable, make sure it is unique for" + " each node.") + + VLLM_INSTANCE_ID = get_vllm_instance_id() + + # Set environment variables for the driver and workers. + all_args_to_update_environment_variables = [({ + "VLLM_INSTANCE_ID": + VLLM_INSTANCE_ID, + "VLLM_TRACE_FUNCTION": + str(envs.VLLM_TRACE_FUNCTION), + }, ) for (node_id, _) in worker_node_and_gpu_ids] + self._run_workers("update_environment_variables", + all_args=all_args_to_update_environment_variables) + + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + init_worker_all_kwargs = [ + self._get_worker_kwargs( + local_rank=node_workers[node_id].index(rank), + rank=rank, + distributed_init_method=distributed_init_method, + ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) + ] + self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + if self.use_ray_spmd_worker: + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + self.pp_tp_workers.append([]) + for tp_rank in range( + self.parallel_config.tensor_parallel_size): + # PP=2, TP=4 + # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] + rank = (pp_rank * self.parallel_config.tensor_parallel_size + ) + tp_rank + assert len(self.pp_tp_workers[pp_rank]) == tp_rank + assert pp_rank < len(self.pp_tp_workers) + self.pp_tp_workers[pp_rank].append(self.workers[rank]) + + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + # Enforce rank order for correct rank to return final output. + for index, worker in enumerate(self.workers): + # The driver worker is rank 0 and not in self.workers. + rank = index + 1 + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) + + def _driver_execute_model( + self, execute_model_req: Optional[ExecuteModelRequest] + ) -> Optional[List[SamplerOutput]]: + """Run execute_model in the driver worker. + + Passing None will cause the driver to stop the model execution + loop running in each of the remote workers. 
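A worked sketch of the `pp_tp_workers` layout built above, assuming `pipeline_parallel_size=2` and `tensor_parallel_size=4`: flat ranks are grouped so each inner list is the TP group of one PP stage.

```python
pipeline_parallel_size = 2
tensor_parallel_size = 4

pp_tp_workers = []
for pp_rank in range(pipeline_parallel_size):
    pp_tp_workers.append([])
    for tp_rank in range(tensor_parallel_size):
        # Same rank arithmetic as the executor above.
        rank = pp_rank * tensor_parallel_size + tp_rank
        pp_tp_workers[pp_rank].append(rank)

assert pp_tp_workers == [[0, 1, 2, 3], [4, 5, 6, 7]]
```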
+ """ + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + return self.driver_worker.execute_method("execute_model", + execute_model_req) + + def execute_model( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return super().execute_model(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=False) + + serialized_data = self.input_encoder.encode(execute_model_req) + outputs = ray.get(self.forward_dag.execute(serialized_data)) + output = self.output_decoder.decode(outputs[0]) + return output + + def _run_workers( + self, + method: str, + *args, + async_run_tensor_parallel_workers_only: bool = False, + all_args: Optional[List[Tuple[Any, ...]]] = None, + all_kwargs: Optional[List[Dict[str, Any]]] = None, + use_dummy_driver: bool = False, + max_concurrent_workers: Optional[int] = None, + **kwargs, + ) -> Any: + """Runs the given method on all workers. Can be used in the following + ways: + + Args: + - async_run_tensor_parallel_workers_only: If True the method will be + run only in the remote TP workers, not the driver worker. + It will also be run asynchronously and return a list of futures + rather than blocking on the results. + - args/kwargs: All workers share the same args/kwargs + - all_args/all_kwargs: args/kwargs for each worker are specified + individually + """ + if self.use_ray_spmd_worker: + assert not async_run_tensor_parallel_workers_only, ( + "async_run_tensor_parallel_workers_only is not supported for " + "spmd mode.") + + if max_concurrent_workers: + raise NotImplementedError( + "max_concurrent_workers is not supported yet.") + + count = len(self.workers) if not \ + async_run_tensor_parallel_workers_only \ + else len(self.non_driver_workers) + # If using SPMD worker, all workers are the same, so we should execute + # the args on all workers. Otherwise, we skip the first worker's args + # because those args will go to the driver worker. + first_worker_args_index: int = 0 if self.use_ray_spmd_worker else 1 + all_worker_args = repeat(args, count) if all_args is None \ + else islice(all_args, first_worker_args_index, None) + all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ + else islice(all_kwargs, first_worker_args_index, None) + + # Start the ray workers first. + ray_workers = self.workers + if async_run_tensor_parallel_workers_only: + ray_workers = self.non_driver_workers + ray_worker_outputs = [ + worker.execute_method.remote(method, *worker_args, **worker_kwargs) + for (worker, worker_args, worker_kwargs + ) in zip(ray_workers, all_worker_args, all_worker_kwargs) + ] + + if async_run_tensor_parallel_workers_only: + # Just return futures + return ray_worker_outputs + + driver_worker_output = [] + # In SPMD mode, the driver worker is the same as any other worker, + # so we only explicitly execute on the driver worker if using a + # non-SPMD worker class. + if not self.use_ray_spmd_worker: + driver_args = args if all_args is None else all_args[0] + driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0] + + # Start the driver worker after all the ray workers. 
+ if not use_dummy_driver: + driver_worker_output = [ + self.driver_worker.execute_method(method, *driver_args, + **driver_kwargs) + ] + else: + assert self.driver_dummy_worker is not None + driver_worker_output = [ + ray.get( + self.driver_dummy_worker.execute_method.remote( + method, *driver_args, **driver_kwargs)) + ] + + # Get the results of the ray workers. + if self.workers: + ray_worker_outputs = ray.get(ray_worker_outputs) + + return driver_worker_output + ray_worker_outputs + + def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None: + """Wait for futures returned from _run_workers() with + async_run_remote_workers_only to complete.""" + ray.get(parallel_worker_tasks) + + def _check_ray_adag_installation(self): + import pkg_resources + from packaging import version + + required_version = version.parse("2.35") + current_version = version.parse( + pkg_resources.get_distribution("ray").version) + # TODO: update the constraint once we adapt to the backward + # incompatible API change from ray 2.36 + if current_version != required_version: + raise ValueError(f"Ray version {required_version} is " + f"required, but found {current_version}") + + import importlib.util + adag_spec = importlib.util.find_spec( + "ray.experimental.compiled_dag_ref") + if adag_spec is None: + raise ValueError("Ray accelerated DAG is not installed. " + "Run `pip install ray[adag]` to install it.") + + def _compiled_ray_dag(self, enable_asyncio: bool): + assert self.parallel_config.use_ray + self._check_ray_adag_installation() + from ray.dag import InputNode, MultiOutputNode + from ray.experimental.channel.torch_tensor_type import TorchTensorType + + with InputNode() as input_data: + # Example DAG: PP=2, TP=4 + # (ExecuteModelReq, None) -> 0 -> (ExecuteModelReq, IntermediateOutput) -> 4 -> SamplerOutput # noqa: E501 + # -> 1 -> (ExecuteModelReq, IntermediateOutput) -> 5 -> SamplerOutput # noqa: E501 + # -> 2 -> (ExecuteModelReq, IntermediateOutput) -> 6 -> SamplerOutput # noqa: E501 + # -> 3 -> (ExecuteModelReq, IntermediateOutput) -> 7 -> SamplerOutput # noqa: E501 + + # All workers in the first TP group will take in the + # ExecuteModelRequest as input. + outputs = [input_data for _ in self.pp_tp_workers[0]] + for pp_rank, tp_group in enumerate(self.pp_tp_workers): + # Each PP worker takes in the output of the previous PP worker, + # and the TP group executes in SPMD fashion. + outputs = [ + worker.execute_model_spmd. + bind( # type: ignore[attr-defined] + outputs[i]) for i, worker in enumerate(tp_group) + ] + + last_pp_rank = len(self.pp_tp_workers) - 1 + if pp_rank < last_pp_rank: + # Specify how intermediate tensors should be passed + # between pp stages, no need to specify for the last + # pp stage. 
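A standalone sketch of the dependency check performed by `_check_ray_adag_installation` above, written against stdlib `importlib.metadata` plus the `packaging` distribution rather than `pkg_resources` (the patch itself uses `pkg_resources`); the version pin and error wording are illustrative only.

```python
import importlib.util
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as installed_version

from packaging import version


def check_ray_adag(required: str = "2.35") -> None:
    try:
        current = version.parse(installed_version("ray"))
    except PackageNotFoundError:
        raise ValueError("ray is not installed") from None
    if current != version.parse(required):
        # Exact pin, matching the patch's behaviour for the aDAG API.
        raise ValueError(
            f"Ray version {required} is required, found {current}")
    if importlib.util.find_spec("ray.experimental.compiled_dag_ref") is None:
        raise ValueError("Ray accelerated DAG is not installed; "
                         "run `pip install ray[adag]`")
```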
+ transport = "auto" + outputs = [ + output.with_type_hint( + TorchTensorType(transport=transport)) + for output in outputs + ] + + forward_dag = MultiOutputNode(outputs) + + return forward_dag.experimental_compile(enable_asyncio=enable_asyncio) + + def __del__(self): + self.shutdown() + + +class RayHPUExecutorAsync(RayHPUExecutor, DistributedGPUExecutorAsync): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.pp_locks: Optional[List[asyncio.Lock]] = None + self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER + if not self.use_ray_compiled_dag: + self.driver_exec_method = make_async( + self.driver_worker.execute_method) + + async def execute_model_async( + self, + execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: + if not self.use_ray_spmd_worker: + return await super().execute_model_async(execute_model_req) + + if self.forward_dag is None: + self.forward_dag = self._compiled_ray_dag(enable_asyncio=True) + + serialized_data = self.input_encoder.encode(execute_model_req) + dag_future = await self.forward_dag.execute_async(serialized_data) + outputs = await dag_future + return self.output_decoder.decode(outputs[0]) + + async def _driver_execute_model_async( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> List[SamplerOutput]: + assert not self.use_ray_spmd_worker, ( + "driver_worker does not exist for VLLM_USE_RAY_SPMD_WORKER=1") + if not self.tp_driver_workers: + return await self.driver_exec_method("execute_model", + execute_model_req) + if self.pp_locks is None: + # This locks each pipeline parallel stage so multiple virtual + # engines can't execute on the same stage at the same time + # We create the locks here to avoid creating them in the constructor + # which uses a different asyncio loop. + self.pp_locks = [ + asyncio.Lock() + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + + tasks = [ + asyncio.create_task( + _run_task_with_lock(self.driver_exec_method, self.pp_locks[0], + "execute_model", execute_model_req)) + ] + for pp_rank, driver_worker in enumerate(self.tp_driver_workers, + start=1): + tasks.append( + asyncio.create_task( + _run_task_with_lock(driver_worker.execute_method.remote, + self.pp_locks[pp_rank], + "execute_model", execute_model_req))) + + results = await asyncio.gather(*tasks) + + # Only the last PP stage has the final results. + return results[-1] + + async def _start_worker_execution_loop(self): + assert not self.use_ray_spmd_worker, ( + "worker loop is disabled for VLLM_USE_RAY_SPMD_WORKER=1") + coros = [ + worker.execute_method.remote("start_worker_execution_loop") + for worker in self.non_driver_workers + ] + return await asyncio.gather(*coros) + + def __del__(self): + self.shutdown() diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 993d279890820..41dd59bc65ec5 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -249,7 +249,11 @@ def initialize_ray_cluster( # Placement group is already set. 
return - device_str = "GPU" if not current_platform.is_tpu() else "TPU" + device_str = "GPU" + if current_platform.is_tpu(): + device_str = "TPU" + elif current_platform.is_hpu(): + device_str = 'HPU' # Create placement group for worker processes current_placement_group = ray.util.get_current_placement_group() if current_placement_group: diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 972649dedf33e..e37e8973790db 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -44,12 +44,7 @@ def _get_worker_kwargs( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return dict( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 5f78993ddc4b4..36b7e2265efab 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -2,10 +2,7 @@ import torch -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import ModelConfig, ParallelConfig from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger @@ -21,38 +18,13 @@ class XPUExecutor(GPUExecutor): uses_ray: bool = False - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - speculative_config: Optional[SpeculativeConfig], - observability_config: Optional[ObservabilityConfig], - ) -> None: - assert device_config.device_type == "xpu" - assert (not speculative_config - ), "Speculative decoding not yet supported for XPU backend" - - model_config = _verify_and_get_model_config(model_config) - - self.model_config = model_config - self.cache_config = cache_config - self.load_config = load_config - self.lora_config = lora_config - self.parallel_config = _verify_and_get_parallel_config(parallel_config) - self.scheduler_config = scheduler_config - self.device_config = device_config - self.prompt_adapter_config = prompt_adapter_config - self.speculative_config = None - self.observability_config = observability_config - - # Instantiate the worker and load the model to GPU. 
- self._init_executor() + def _init_executor(self) -> None: + assert self.device_config.device_type == "xpu" + assert self.speculative_config is None, ( + "Speculative decoding not yet supported for XPU backend") + + self.model_config = _verify_and_get_model_config(self.model_config) + GPUExecutor._init_executor(self) def _get_worker_module_and_class( self) -> Tuple[str, str, Optional[Callable[[], Type[WorkerBase]]]]: diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 7b73922ddd2c5..54fbd7a321a6f 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,9 +1,11 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonInputs, - SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, + ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, + SingletonInputs, SingletonInputsAdapter, SingletonPrompt, + TextPrompt, TokenInputs, TokensPrompt, build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import InputContext, InputRegistry +from .registry import (DummyData, InputContext, InputProcessingContext, + InputRegistry) INPUT_REGISTRY = InputRegistry() """ @@ -22,14 +24,18 @@ "ExplicitEncoderDecoderPrompt", "TokenInputs", "token_inputs", - "SingletonInputs", "DecoderOnlyInputs", "EncoderDecoderInputs", + "ProcessorInputs", + "SingletonInputs", + "SingletonInputsAdapter", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", "INPUT_REGISTRY", + "DummyData", "InputContext", + "InputProcessingContext", "InputRegistry", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 9a094191eda38..07ff9faa50f13 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,10 +1,14 @@ -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, +from dataclasses import dataclass +from functools import cached_property +from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) -from typing_extensions import NotRequired, TypedDict, TypeVar +import torch +from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict + from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal.inputs import MultiModalInputsV2 class TextPrompt(TypedDict): @@ -36,13 +40,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - Optional multi-modal data to pass to the model, + DEPRECATED: Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - Optional multi-modal processor kwargs to be forwarded to the + DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. @@ -122,21 +126,30 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): class TokenInputs(TypedDict): """Represents token-based inputs.""" + + type: Literal["token"] + """The type of inputs.""" + prompt_token_ids: List[int] """The token IDs of the prompt.""" - prompt: NotRequired[Optional[str]] + prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. 
""" - multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. """ - mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]] + multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] + """ + Placeholder ranges for the multi-modal data. + """ + + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -149,28 +162,25 @@ def token_inputs( prompt_token_ids: List[int], prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" - inputs = TokenInputs(prompt_token_ids=prompt_token_ids) + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: inputs["prompt"] = prompt if multi_modal_data is not None: inputs["multi_modal_data"] = multi_modal_data + if multi_modal_placeholders is not None: + inputs["multi_modal_placeholders"] = multi_modal_placeholders if mm_processor_kwargs is not None: inputs["mm_processor_kwargs"] = mm_processor_kwargs return inputs -SingletonInputs = TokenInputs -""" -A processed :class:`SingletonPrompt` which can be passed to -:class:`vllm.sequence.Sequence`. -""" - -DecoderOnlyInputs = TokenInputs +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -178,28 +188,102 @@ def token_inputs( """ -class EncoderDecoderInputs(TokenInputs): +class EncoderDecoderInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. This specifies the required data for encoder-decoder models. """ - encoder_prompt_token_ids: List[int] - """The token IDs of the encoder prompt.""" + encoder: Union[TokenInputs, "MultiModalInputsV2"] + """The inputs for the encoder portion.""" + + decoder: Union[TokenInputs, "MultiModalInputsV2"] + """The inputs for the decoder portion.""" + + +SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +""" +A processed :class:`SingletonPrompt` which can be passed to +:class:`vllm.sequence.Sequence`. +""" - encoder_prompt: NotRequired[Optional[str]] - """ - The original encoder prompt text corresponding to the token IDs, if - available. - """ - encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] +@dataclass +class SingletonInputsAdapter: """ - Optional multi-modal data to pass to the encoder model, - if the model supports it. + Unified interface to access the components of :class:`SingletonInputs`. 
""" + inputs: SingletonInputs + @cached_property + def prompt(self) -> Optional[str]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt") + + assert_never(inputs) + + @cached_property + def prompt_token_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt_token_ids", []) + + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return None + + assert_never(inputs) + + @cached_property + def multi_modal_data(self) -> "MultiModalDataDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_placeholders", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_placeholders", {}) + + assert_never(inputs) + + @cached_property + def mm_processor_kwargs(self) -> Dict[str, Any]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + if inputs["type"] == "multimodal": + return {} + + assert_never(inputs) + + +ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] +""" +The inputs to :data:`vllm.inputs.InputProcessor`. +""" _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) @@ -226,10 +310,11 @@ def zip_enc_dec_prompts( ) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs - may also be provided; if a dict is passed, the same dictionary will be - used for every encoder/decoder prompt. If an iterable is provided, it will - be zipped with the encoder/decoder prompts. + :class:`ExplicitEncoderDecoderPrompt` instances. + + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. 
""" if mm_processor_kwargs is None: mm_processor_kwargs = cast(Dict[str, Any], {}) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index e79d2c813bb4f..09f1ff2cb42e9 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -4,9 +4,9 @@ from vllm.utils import is_list_of -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonPrompt, - TextPrompt, TokensPrompt) +from .data import (EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, + ProcessorInputs, PromptType, SingletonPrompt, TextPrompt, + TokensPrompt) class ParsedText(TypedDict): @@ -98,12 +98,15 @@ def parse_singleton_prompt( raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt") +def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: + return isinstance(prompt, dict) and "prompt_token_ids" in prompt + + def is_explicit_encoder_decoder_prompt( prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: return isinstance(prompt, dict) and "encoder_prompt" in prompt def is_encoder_decoder_inputs( - inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], -) -> TypeIs[EncoderDecoderInputs]: - return "encoder_prompt_token_ids" in inputs + inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: + return "encoder" in inputs and "decoder" in inputs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 82ce7d392b719..fdf28615fda10 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,31 +1,23 @@ import asyncio -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import List, Mapping, Optional, Union from typing_extensions import assert_never from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, PromptType, - SingletonPrompt) +from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, + PromptType, SingletonInputs, SingletonPrompt, token_inputs) from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt -if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict - logger = init_logger(__name__) -PromptComponents = Tuple[Optional[str], List[int], - Optional["MultiModalDataDict"], Optional[Dict[str, - Any]]] -DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]], - Optional["MultiModalDataDict"], - Optional[Dict[str, Any]]] - class InputPreprocessor: @@ -33,11 +25,13 @@ def __init__( self, model_config: ModelConfig, tokenizer: Optional[BaseTokenizerGroup], + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer + self.mm_registry = mm_registry def get_tokenizer_group(self) -> BaseTokenizerGroup: if self.tokenizer is None: @@ -115,7 +109,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: "default" decoder prompt be . However, it is possible that in the future - other models may have different or more + other models may have different or more complex logic for the default decoder prompt. This motivates having a special helper method for default decoder prompts. 
@@ -132,7 +126,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: def _prepare_decoder_input_ids_for_generation( self, decoder_input_ids: Optional[List[int]], - force_bos: bool = True, ) -> List[int]: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. @@ -162,8 +155,8 @@ def _prepare_decoder_input_ids_for_generation( # use decoder_start_token_id as decoder_input_ids decoder_input_ids = self._get_default_enc_dec_decoder_prompt() - if force_bos and (len(decoder_input_ids) == 0 - or decoder_input_ids[0] != decoder_start_token_id): + if (len(decoder_input_ids) == 0 + or decoder_input_ids[0] != decoder_start_token_id): decoder_input_ids = [decoder_start_token_id] + decoder_input_ids return decoder_input_ids @@ -209,14 +202,79 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) - def _extract_prompt_components( + def _can_process_multimodal(self) -> bool: + model_config = self.model_config + + if not model_config.is_multimodal_model: + raise ValueError("Your model does not support multi-modal inputs") + + # Interim measure so we can handle models that have yet to be + # updated to use the new multi-modal processor + can_process_multimodal = self.mm_registry.has_processor(model_config) + if not can_process_multimodal: + logger.info( + "Your model uses the legacy input pipeline instead of the new " + "multi-modal processor. Please note that the legacy pipeline " + "will be removed in a future release. For more details, see: " + "https://github.com/vllm-project/vllm/issues/10114") + + return can_process_multimodal + + def _process_multimodal( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """ + Apply the model's multi-modal processor to a multi-modal prompt, + returning the corresponding token IDs and metadata. + """ + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + async def _process_multimodal_async( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """Async version of :meth:`_process_multimodal`.""" + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request + ) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + if isinstance(prompt, list): + logger.warning("Passing `multi_modal_data` in TokensPrompt is" + "deprecated and will be removed in a future update") + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: - ''' - Extract the components of any single encoder or decoder input prompt. + ) -> SingletonInputs: + """ + Extract the singleton inputs from a prompt. 
Arguments: @@ -226,12 +284,8 @@ def _extract_prompt_components( Returns: - * prompt - * prompt_token_ids - * multi_modal_data - * mm_processor_kwargs (request-level input processor/mapper overrides) - ''' - + * :class:`SingletonInputs` instance + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": @@ -241,34 +295,69 @@ def _extract_prompt_components( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + assert_never(parsed) - async def _extract_prompt_components_async( + async def _prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -279,59 +368,93 @@ async def _extract_prompt_components_async( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = 
tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + assert_never(parsed) def _build_enc_dec_llm_inputs( self, - encoder_comps: PromptComponents, - decoder_comps: DecoderPromptComponents, - mm_processor_kwargs: Dict[str, Any], + encoder_inputs: SingletonInputs, + decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps - - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if decoder_mm_data is not None: - raise ValueError( - "Multi-modality decoder inputs of encoder-decoder models are " - "not supported yet") - - # For Multi-Modal models (e.g., mllama), the text input can be - # <|image|><|begin_of_text|>hello world. And we should not add - # another <|begin_of_text|> to the beginning. 
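For reference, a trimmed sketch of the nested `EncoderDecoderInputs` shape that `_build_enc_dec_llm_inputs` now returns, replacing the old flat `encoder_prompt_token_ids` fields. The field set is reduced for illustration and the BOS token id below is made up.

```python
from typing import List

from typing_extensions import TypedDict


class TokenInputs(TypedDict, total=False):
    type: str
    prompt: str
    prompt_token_ids: List[int]


class EncoderDecoderInputs(TypedDict):
    encoder: TokenInputs  # inputs for the encoder portion
    decoder: TokenInputs  # inputs for the decoder portion


inputs = EncoderDecoderInputs(
    encoder={"type": "token", "prompt": "hello", "prompt_token_ids": [7, 8]},
    decoder={"type": "token", "prompt_token_ids": [0]},  # hypothetical BOS id
)
print(inputs["encoder"]["prompt_token_ids"],
      inputs["decoder"]["prompt_token_ids"])
```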
- decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation( - decoder_prompt_ids, - force_bos=(encoder_mm_data is None and decoder_mm_data is None))) + if (encoder_inputs["type"] == "token" + or encoder_inputs["type"] == "multimodal"): + pass + else: + assert_never(encoder_inputs) + + if decoder_inputs is None: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) + decoder_inputs = token_inputs(dec_token_ids) + elif (decoder_inputs["type"] == "token" + or decoder_inputs["type"] == "multimodal"): + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + decoder_inputs["prompt_token_ids"]) + decoder_inputs["prompt_token_ids"] = dec_token_ids + + if "multi_modal_data" in decoder_inputs: + raise ValueError("Multi-modal decoder inputs of encoder-" + "decoder models are not supported yet") + else: + assert_never(encoder_inputs) return EncoderDecoderInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - multi_modal_data=decoder_mm_data, - mm_processor_kwargs=mm_processor_kwargs, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - encoder_multi_modal_data=encoder_mm_data, + encoder=encoder_inputs, + decoder=decoder_inputs, ) def _process_encoder_decoder_prompt( @@ -339,10 +462,9 @@ def _process_encoder_decoder_prompt( prompt: PromptType, request_id: str, ) -> EncoderDecoderInputs: - ''' + """ For encoder/decoder models only: - Process an input prompt into an - :class:`EncoderDecoderInputs` instance. + Process an input prompt into an :class:`EncoderDecoderInputs` instance. There are two types of input prompts: singleton prompts which carry only the @@ -361,7 +483,7 @@ def _process_encoder_decoder_prompt( have any possible singleton type; thus this method relies on helper functions to obtain token ids for the sub-prompts. 
- + Arguments: * prompt: an input prompt @@ -370,42 +492,32 @@ def _process_encoder_decoder_prompt( Returns: * :class:`EncoderDecoderInputs` instance - ''' - - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + """ + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - decoder_comps = None, None, None, None + decoder_inputs = None else: - decoder_comps = self._extract_prompt_components( + decoder_inputs = self._prompt_to_llm_inputs( decoder_input, request_id=request_id, ) - # Handle this carefully in case it was directly initialized by user - mm_processor_kwargs = prompt.get("mm_processor_kwargs", {}) else: - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) async def _process_encoder_decoder_prompt_async( self, @@ -413,59 +525,51 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_task = self._extract_prompt_components_async( + encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - encoder_comps = await encoder_task - decoder_comps = None, None, None, None + encoder_inputs = await encoder_task + decoder_inputs = None else: - decoder_task = self._extract_prompt_components_async( + decoder_task = self._prompt_to_llm_inputs_async( decoder_input, request_id=request_id, ) - encoder_comps, decoder_comps = await asyncio.gather( + encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) - mm_processor_kwargs = prompt["mm_processor_kwargs"] else: - encoder_comps = await self._extract_prompt_components_async( + encoder_inputs = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) def _build_decoder_only_llm_inputs( self, - prompt_comps: PromptComponents, + prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - (prompt, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = prompt_comps - - prompt_token_ids = self._apply_prompt_adapter( - 
prompt_token_ids, prompt_adapter_request=prompt_adapter_request) + if (prompt_inputs["type"] == "token" + or prompt_inputs["type"] == "multimodal"): + prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( + prompt_inputs["prompt_token_ids"], + prompt_adapter_request=prompt_adapter_request, + ) + else: + assert_never(prompt_inputs) - return DecoderOnlyInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs) + return prompt_inputs def _process_decoder_only_prompt( self, @@ -474,7 +578,7 @@ def _process_decoder_only_prompt( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: - ''' + """ For decoder-only models: Process an input prompt into an :class:`DecoderOnlyInputs` instance. @@ -488,9 +592,9 @@ def _process_decoder_only_prompt( Returns: * :class:`DecoderOnlyInputs` instance - ''' + """ - prompt_comps = self._extract_prompt_components( + prompt_comps = self._prompt_to_llm_inputs( prompt, request_id=request_id, lora_request=lora_request, @@ -509,7 +613,7 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" - prompt_comps = await self._extract_prompt_components_async( + prompt_comps = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, lora_request=lora_request, @@ -526,7 +630,7 @@ def preprocess( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of @@ -554,7 +658,7 @@ async def preprocess_async( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of @@ -577,4 +681,4 @@ async def preprocess_async( ) def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model + return self.model_config.is_encoder_decoder diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4cebc91ce715c..68b4756331e6d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -1,22 +1,26 @@ import functools from collections import UserDict from dataclasses import dataclass -from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, - Protocol, Tuple, Type) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, NamedTuple, + Optional, Protocol, Type, cast) from torch import nn -from transformers import PretrainedConfig -from typing_extensions import TypeVar +from transformers import PretrainedConfig, ProcessorMixin +from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger +from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import DecoderOnlyInputs +from .data import ProcessorInputs, SingletonInputs +from .parse import is_encoder_decoder_inputs if 
TYPE_CHECKING: from vllm.config import ModelConfig - from vllm.multimodal import MultiModalDataDict, MultiModalRegistry + from vllm.multimodal import (MultiModalDataDict, MultiModalPlaceholderDict, + MultiModalRegistry) from vllm.sequence import SequenceData logger = init_logger(__name__) @@ -60,9 +64,30 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: return self.model_config.hf_image_processor_config +@dataclass(frozen=True) +class InputProcessingContext(InputContext): + tokenizer: AnyTokenizer + """The tokenizer used to tokenize the inputs.""" + + def get_hf_processor(self) -> ProcessorMixin: + return cached_get_processor( + self.model_config.tokenizer, + tokenizer=self.tokenizer, # Override the tokenizer with ours + trust_remote_code=self.model_config.trust_remote_code, + ) + + N = TypeVar("N", bound=Type[nn.Module]) +class DummyData(NamedTuple): + """Dummy data used for profiling.""" + + seq_data: "SequenceData" + multi_modal_data: Optional["MultiModalDataDict"] = None + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None + + class DummyDataFactory(Protocol): def __call__( @@ -71,7 +96,7 @@ def __call__( seq_len: int, mm_counts: Mapping[str, int], **mm_processor_kwargs: Any, - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ Create dummy data to be inputted into the model. @@ -85,7 +110,7 @@ def __call__( ... -class _MultiModalCounts(UserDict): +class _MultiModalCounts(UserDict[str, int]): """ Wraps `mm_counts` for a more informative error message when attempting to access a plugin that does not exist. @@ -100,7 +125,7 @@ def __getitem__(self, key: str) -> int: raise KeyError(msg) from exc -InputProcessor = Callable[[InputContext, DecoderOnlyInputs], DecoderOnlyInputs] +InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs] """Preprocess the inputs to the model.""" @@ -123,7 +148,7 @@ def _default_dummy_data_factory( ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ The default dummy data factory represents the longest possible text that can be inputted to the model. @@ -134,10 +159,7 @@ def _default_dummy_data_factory( # Avoid circular import from vllm.sequence import SequenceData - dummy_seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - dummy_multi_modal_data = None - - return dummy_seq_data, dummy_multi_modal_data + return DummyData(SequenceData.from_prompt_token_counts((0, seq_len))) def register_dummy_data(self, factory: DummyDataFactory): """ @@ -195,7 +217,7 @@ def dummy_data_for_profiling( seq_len: int, mm_registry: "MultiModalRegistry", is_encoder_data: bool = False, - ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: + ) -> DummyData: """ Create dummy data for profiling the memory usage of a model. 
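# Illustrative sketch (standalone): dummy-data factories now return the
# DummyData NamedTuple added above instead of a bare
# (seq_data, multi_modal_data) tuple, so a new field such as
# multi_modal_placeholders can be added without breaking positional unpacking
# in callers. SequenceDataStub is a hypothetical stand-in for vLLM's
# SequenceData; only the shape of the pattern is shown here.
from typing import Any, Dict, NamedTuple, Optional, Tuple


class SequenceDataStub(NamedTuple):
    prompt_token_ids: Tuple[int, ...]


class DummyDataSketch(NamedTuple):
    """Mirrors the DummyData NamedTuple introduced above."""
    seq_data: SequenceDataStub
    multi_modal_data: Optional[Dict[str, Any]] = None
    multi_modal_placeholders: Optional[Dict[str, Any]] = None


def default_dummy_data_factory(seq_len: int) -> DummyDataSketch:
    # Longest possible all-padding prompt, analogous to
    # SequenceData.from_prompt_token_counts((0, seq_len)).
    return DummyDataSketch(SequenceDataStub(prompt_token_ids=(0,) * seq_len))


dummy = default_dummy_data_factory(8)
# Named access replaces positional unpacking of the old 2-tuple.
print(len(dummy.seq_data.prompt_token_ids), dummy.multi_modal_data)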
@@ -220,12 +242,12 @@ def dummy_data_for_profiling( mm_processor_kwargs = get_allowed_kwarg_only_overrides( dummy_factory, overrides=model_config.mm_processor_kwargs) - seq_data, mm_data = dummy_factory(InputContext(model_config), seq_len, - _MultiModalCounts(mm_counts), - **mm_processor_kwargs) + dummy_data = dummy_factory(InputContext(model_config), seq_len, + _MultiModalCounts(mm_counts), + **mm_processor_kwargs) # Having more tokens is over-conservative but otherwise fine - num_tokens = seq_data.prompt_token_ids + num_tokens = dummy_data.seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: print_warning_once( @@ -235,21 +257,21 @@ def dummy_data_for_profiling( raise AssertionError( f"Expected at least {seq_len} dummy tokens for profiling, " f"but found {len(num_tokens)} tokens instead.") - if mm_data is not None: - for k, v in mm_data.items(): + if dummy_data.multi_modal_data is not None: + for k, v in dummy_data.multi_modal_data.items(): num_items = len(v) if isinstance(v, list) else 1 num_expected = mm_counts[k] assert num_items >= num_expected, ( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") - return seq_data, mm_data + return dummy_data def _default_input_processor( self, ctx: InputContext, - inputs: DecoderOnlyInputs, - ) -> DecoderOnlyInputs: + inputs: ProcessorInputs, + ) -> ProcessorInputs: """The default input processor is a no-op.""" return inputs @@ -281,8 +303,23 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): return self._input_processors_by_model_type \ .get(model_cls, self._default_input_processor) + def _ensure_mm_kwargs( + self, + inputs: SingletonInputs, + mm_processor_kwargs: Dict[str, Any], + ): + if inputs["type"] == "token": + # In case the input processor for that model fails to set it + if "mm_processor_kwargs" not in inputs: + inputs["mm_processor_kwargs"] = mm_processor_kwargs + elif inputs["type"] == "multimodal": + # Be more strict in V2 + assert "mm_kwargs" in inputs + else: + assert_never(inputs["type"]) + def process_input(self, model_config: "ModelConfig", - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: + inputs: ProcessorInputs) -> ProcessorInputs: """ Apply an input processor to an instance of model inputs. 
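# Illustrative sketch (standalone, simplified): process_input below merges
# model-level mm_processor_kwargs with per-request overrides and keeps only
# keyword-only arguments the input processor actually accepts. This mimics the
# spirit of resolve_mm_processor_kwargs / get_allowed_kwarg_only_overrides;
# the real helpers' filtering rules differ in detail.
import inspect
from typing import Any, Callable, Dict, Optional


def resolve_kwargs_sketch(
    model_defaults: Optional[Dict[str, Any]],
    request_overrides: Optional[Dict[str, Any]],
    processor: Callable[..., Any],
) -> Dict[str, Any]:
    merged = {**(model_defaults or {}), **(request_overrides or {})}
    allowed = {
        name
        for name, param in inspect.signature(processor).parameters.items()
        if param.kind == inspect.Parameter.KEYWORD_ONLY
    }
    # Drop anything the processor cannot accept as a keyword-only override.
    return {k: v for k, v in merged.items() if k in allowed}


def my_processor(ctx, inputs, *, num_crops: int = 4):  # hypothetical processor
    return inputs


print(resolve_kwargs_sketch({"num_crops": 8}, {"unknown": 1}, my_processor))
# -> {'num_crops': 8}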
@@ -302,12 +339,25 @@ def process_input(self, model_config: "ModelConfig", # If it's empty, it'll fall back to the default kwarg values mm_processor_kwargs = resolve_mm_processor_kwargs( model_config.mm_processor_kwargs, - inputs.get("mm_processor_kwargs"), + cast(Dict[str, Any], inputs.get("mm_processor_kwargs")), processor, ) - return processor(InputContext(model_config), inputs, - **mm_processor_kwargs) + processed_inputs = processor( + InputContext(model_config), + inputs, + **mm_processor_kwargs, + ) + + if is_encoder_decoder_inputs(processed_inputs): + self._ensure_mm_kwargs(processed_inputs["encoder"], + mm_processor_kwargs) + self._ensure_mm_kwargs(processed_inputs["decoder"], + mm_processor_kwargs) + else: + self._ensure_mm_kwargs(processed_inputs, mm_processor_kwargs) + + return processed_inputs def create_input_processor(self, model_config: "ModelConfig"): """ diff --git a/vllm/logger.py b/vllm/logger.py index ccf09691a052a..9e16e591315ba 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -24,7 +24,7 @@ DEFAULT_LOGGING_CONFIG = { "formatters": { "vllm": { - "class": "vllm.logging.NewLineFormatter", + "class": "vllm.logging_utils.NewLineFormatter", "datefmt": _DATE_FORMAT, "format": _FORMAT, }, @@ -67,8 +67,7 @@ def _configure_vllm_root_logger() -> None: raise RuntimeError( "Could not load logging config. File does not exist: %s", VLLM_LOGGING_CONFIG_PATH) - with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8", - mode="r") as file: + with open(VLLM_LOGGING_CONFIG_PATH, encoding="utf-8") as file: custom_config = json.loads(file.read()) if not isinstance(custom_config, dict): @@ -118,13 +117,14 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None): last_lineno = 0 last_func_name = "" with open(log_path, 'a') as f: + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") if event == 'call': - f.write(f"{datetime.datetime.now()} Call to" + f.write(f"{ts} Call to" f" {func_name} in {filename}:{lineno}" f" from {last_func_name} in {last_filename}:" f"{last_lineno}\n") else: - f.write(f"{datetime.datetime.now()} Return from" + f.write(f"{ts} Return from" f" {func_name} in {filename}:{lineno}" f" to {last_func_name} in {last_filename}:" f"{last_lineno}\n") diff --git a/vllm/logging/__init__.py b/vllm/logging/__init__.py deleted file mode 100644 index b9aec380776f3..0000000000000 --- a/vllm/logging/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from vllm.logging.formatter import NewLineFormatter - -__all__ = [ - "NewLineFormatter", -] diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py new file mode 100644 index 0000000000000..576ccf78a8117 --- /dev/null +++ b/vllm/logging_utils/__init__.py @@ -0,0 +1,5 @@ +from vllm.logging_utils.formatter import NewLineFormatter + +__all__ = [ + "NewLineFormatter", +] diff --git a/vllm/logging/formatter.py b/vllm/logging_utils/formatter.py similarity index 100% rename from vllm/logging/formatter.py rename to vllm/logging_utils/formatter.py diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a7887a048746a..04fc635828d4d 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -70,6 +70,14 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, add_input=True) # now have column partitioned output + + if self.bias_stacked is not None: + self.bias_stacked = self.bias_stacked.view( + -1, self.bias_stacked.shape[-1]) + self.bias_stacked = self.bias_stacked[ + self.punica_wrapper.token_lora_indices] + output += self.bias_stacked + output = 
output.view(*out_orig_shape) return output @@ -121,6 +129,15 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] + + if layer.bias_stacked is not None: + bias = layer.bias_stacked[idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[layer.punica_wrapper.token_lora_indices] + bias[layer.punica_wrapper.token_lora_indices == -1] = 0 + output[:, left_offset:left_offset + shard_size] += bias + layer.punica_wrapper.add_expand_slice( output, buffers[idx], @@ -295,6 +312,15 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + shard_size = self.bias_stacked.shape[2] + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) @@ -318,6 +344,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size + + if self.bias_stacked is not None: + bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) + bias = bias[self.punica_wrapper.token_lora_indices] + bias[self.punica_wrapper.token_lora_indices == -1] = 0 + output += bias + self.punica_wrapper.add_expand_slice(output, buffer, self.lora_b_stacked, start_idx, shard_size) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6254c67596e65..7429c60e0222d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,6 +67,63 @@ def dec(*args, **kwargs): return dec +def apply_bias( + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, +): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + +def apply_bias_packed_nslice( + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], +): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + + @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -105,6 +162,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): """Overwrites lora tensors at index.""" ... 
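# Illustrative sketch (standalone): the apply_bias() helper added above injects
# a per-token LoRA output bias by gathering one row per token from
# bias_stacked (num_loras, output_dim) using the token-to-LoRA index mapping,
# where index -1 means "no LoRA" and must contribute zero. Toy sizes only.
import torch

num_loras, output_dim, num_tokens = 3, 4, 5
bias_stacked = torch.arange(num_loras * output_dim,
                            dtype=torch.float32).view(num_loras, output_dim)
# Tokens 0 and 1 use LoRA 0; token 2 uses LoRA 2; tokens 3 and 4 have no LoRA.
indices = torch.tensor([0, 0, 2, -1, -1])
output = torch.zeros(num_tokens, output_dim)

gathered = bias_stacked[indices]   # index -1 wraps to the last LoRA's row ...
gathered[indices == -1] = 0        # ... so those rows are zeroed explicitly
output += gathered

print(output)
# Rows 0-1 hold the bias of LoRA 0, row 2 the bias of LoRA 2, rows 3-4 stay 0.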
@@ -203,6 +261,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( @@ -299,10 +358,22 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def set_lora( self, @@ -310,6 +381,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) @@ -319,10 +391,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -401,11 +484,25 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + self.output_dim = self.lora_b_stacked.shape[2] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -418,18 +515,30 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -437,10 +546,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + 
self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -534,6 +654,17 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size // 2, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(n_slices)) + else: + self.bias_stacked = None self.output_dim = self.lora_b_stacked[0].shape[2] @@ -542,6 +673,9 @@ def reset_lora(self, index: int): self.lora_a_stacked[1][index] = 0 self.lora_b_stacked[0][index] = 0 self.lora_b_stacked[1][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -562,18 +696,32 @@ def slice_lora_b( ] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + if bias[0] is None or bias[1] is None: + return bias + shard_size = self.output_dim + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -582,6 +730,10 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) + if bias is not None and bias[0] is not None: + self.bias_stacked[0][index, + 0, :bias[0].shape[0]].copy_(bias[0].T, + non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -589,10 +741,22 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) + if bias is not None and bias[1] is not None: + self.bias_stacked[1][index, + 0, :bias[1].shape[0]].copy_(bias[1].T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + output, + (self.output_dim, self.output_dim), + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -654,17 +818,35 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + bias_q = bias[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + bias_k = bias[k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + bias_v = bias[v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + bias 
= torch.cat([bias_q, bias_k, bias_v], dim=1) + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -672,6 +854,10 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -768,6 +954,32 @@ def create_lora_weights( device=self.device, ), ) + if lora_config.bias_enabled: + self.bias_stacked = ( + torch.zeros( + max_loras, + 1, + self.q_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + ) + else: + self.bias_stacked = None self.output_slices = ( self.q_proj_shard_size, @@ -787,6 +999,10 @@ def reset_lora(self, index: int): self.lora_b_stacked[1][index] = 0 self.lora_a_stacked[2][index] = 0 self.lora_b_stacked[2][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 + self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -812,18 +1028,40 @@ def slice_lora_b( lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + bias_q, bias_k, bias_v = bias + if bias_q is not None: + bias_q = bias_q[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + if bias_k is not None: + bias_k = bias_k[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + if bias_v is not None: + bias_v = bias_v[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + bias = [bias_q, bias_k, bias_v] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -854,9 +1092,28 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) + if bias is not None: + if bias[0] is not None: + self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( + bias[0].T, non_blocking=True) + if bias[1] is not None: + self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( + bias[1].T, non_blocking=True) + if bias[2] is not None: + self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( + bias[2].T, non_blocking=True) + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + 
output, + self.output_slices, + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, @@ -919,9 +1176,27 @@ def create_lora_weights( device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + ( + max_loras, + 1, + self.output_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + # Lazily initialized + self.indices: torch.Tensor + self.indices_len: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -934,18 +1209,24 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.base_layer.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -953,9 +1234,20 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -1132,6 +1424,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, @@ -1199,7 +1492,7 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1], ] = lora_logits + lora_logits.shape[1]] = lora_logits # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, @@ -1276,6 +1569,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): ... 
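# Illustrative sketch (standalone): the slice_bias() methods added above give
# each tensor-parallel rank only its shard of the full LoRA bias, mirroring how
# the column-parallel output itself is sharded. Toy sizes; the real shard sizes
# come from the layer's output_dim or q/kv projection sizes.
import torch

tp_size = 4
full_bias = torch.arange(16, dtype=torch.float32)  # full output dim == 16
shard_size = full_bias.numel() // tp_size


def slice_bias_for_rank(bias: torch.Tensor, tp_rank: int) -> torch.Tensor:
    start = tp_rank * shard_size
    end = (tp_rank + 1) * shard_size
    return bias[start:end]


for rank in range(tp_size):
    print(rank, slice_bias_for_rank(full_bias, rank).tolist())
# Concatenating the shards in rank order reconstructs the full bias vector.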
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 14081b5ba441c..b648312ba76ec 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -17,6 +17,7 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, + bias: Optional[torch.Tensor] = None, embeddings_tensor: Optional[torch.Tensor] = None, scaling: Optional[float] = None, ) -> None: @@ -25,6 +26,7 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b + self.bias = bias self.embeddings_tensor = embeddings_tensor if scaling is None: @@ -66,7 +68,8 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights": + embeddings_tensor_dim: Optional[int] = None, + bias_enabled: Optional[bool] = False) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros([input_dim, rank], dtype=dtype, @@ -76,6 +79,14 @@ def create_dummy_lora_weights( dtype=dtype, device=device, pin_memory=pin_memory) + if bias_enabled: + bias = torch.zeros([output_dim], + dtype=dtype, + device=device, + pin_memory=pin_memory) + else: + bias = None + embeddings_tensor = torch.rand( 10, embeddings_tensor_dim, @@ -88,6 +99,7 @@ def create_dummy_lora_weights( lora_alpha=1, lora_a=lora_a, lora_b=lora_b, + bias=bias, embeddings_tensor=embeddings_tensor, ) @@ -102,6 +114,7 @@ def __init__( lora_alphas: List[Optional[int]], lora_a: List[Optional[torch.Tensor]], lora_b: List[Optional[torch.Tensor]], + bias: Optional[List[Optional[torch.Tensor]]] = None, scaling: Optional[List[float]] = None, ) -> None: super().__init__( @@ -110,6 +123,7 @@ def __init__( lora_alpha=0, lora_a=lora_a, lora_b=lora_b, + bias=bias, scaling=scaling, # type: ignore embeddings_tensor=None, ) @@ -141,6 +155,7 @@ def pack( [lora.lora_alpha if lora is not None else None for lora in loras], [lora.lora_a if lora is not None else None for lora in loras], [lora.lora_b if lora is not None else None for lora in loras], + [lora.bias if lora is not None else None for lora in loras], scaling=[ 1 if lora is not None else None # type: ignore for lora in loras diff --git a/vllm/lora/models.py b/vllm/lora/models.py index d0279f273db7a..2ffefe61427e3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type import safetensors.torch import torch @@ -119,7 +119,8 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): - module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) + module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( + tensor_name) if module_name not in loras: lora_embeddings_tensor = None if embeddings: @@ -136,8 +137,16 @@ def from_lora_tensors( lora_embeddings_tensor.pin_memory()) loras[module_name] = LoRALayerWeights(module_name, rank, lora_alpha, None, None, + None, lora_embeddings_tensor) - if is_lora_a: + if is_bias: + loras[module_name].bias = tensor.to(device=device, + dtype=dtype).t() + bias = tensor.to(device=device, dtype=dtype).t() + if pin_memory: + bias = bias.pin_memory() + loras[module_name].bias = bias + elif is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() if pin_memory: @@ -215,7 +224,7 @@ def 
from_local_checkpoint( with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name(lora_module) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) @@ -301,6 +310,7 @@ def __init__( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. @@ -314,6 +324,7 @@ def __init__( lora_config: the LoRA configuration. """ self.lora_config = lora_config + self.device = device self.max_num_seqs = max_num_seqs assert self.capacity >= self.lora_slots self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 @@ -322,7 +333,7 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, - device="cuda") + device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -343,7 +354,7 @@ def __init__( # text modules (e.g. ChatGLM) and hasattr(self.model, "get_mm_mapping")) self.packed_modules: Dict[str, List[str]] = {} - self.modules: Dict[str, "BaseLayerWithLoRA"] = {} + self.modules: Dict[str, BaseLayerWithLoRA] = {} # Dict instead of a Set for compatibility with LRUCache. self._last_mapping: Optional[LoRAMapping] = None self._create_lora_modules() @@ -384,8 +395,19 @@ def activate_adapter( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() + # Bias is not explicitly enabled with the flag enable_lora_bias. + bias = module_lora.bias + if ((torch.is_tensor(bias) or + (isinstance(bias, Sequence) and any(b is not None + for b in bias))) + and not self.lora_config.bias_enabled): + module_lora.bias = None + raise ValueError( + f"Adapter bias cannot be used for {module_name}" + " without --enable-lora-bias.") module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module_lora.embeddings_tensor, + module_lora.bias) else: module.reset_lora(index) return True @@ -507,6 +529,7 @@ def create_dummy_lora( """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): + bias_enabled = self.lora_config.bias_enabled if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) @@ -534,7 +557,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -543,12 +567,13 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() else: parts = module_name.split(".") replacements = self.packed_modules_mapping[parts[-1]] - subloras: List[Optional["LoRALayerWeights"]] = [] + subloras: List[Optional[LoRALayerWeights]] = [] for i, r in enumerate(replacements): lora = LoRALayerWeights.create_dummy_lora_weights( module_name + "." 
+ r, @@ -557,6 +582,7 @@ def create_dummy_lora( rank, module.lora_a_stacked[i].dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() subloras.append(lora) @@ -653,16 +679,11 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], class LRUCacheLoRAModelManager(LoRAModelManager): """A model manager that manages multiple LoRAs with LRU cache.""" - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - ): + def __init__(self, model: nn.Module, max_num_seqs: int, + max_num_batched_tokens: int, vocab_size: int, + lora_config: LoRAConfig, device: torch.device): super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config) + vocab_size, lora_config, device) self._registered_adapters: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter) self._active_adapters: LoRALRUCache = LoRALRUCache( @@ -732,6 +753,7 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" @@ -743,5 +765,6 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, + device=device, **kwargs) return lora_manager diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index adb3ab5b46b87..4910cb4061298 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index efa234520ab87..844f5cec39e93 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c003f3dc0ce9e..b4d893047b06b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5033ce4126929..082041f390750 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -62,6 +62,7 @@ def convert_mapping( max_loras: int, vocab_size: int, extra_vocab_size: int, + device: torch.device, long_lora_context: Optional["LongContextLoRAContext"] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], List[int]]: @@ -104,7 +105,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=device, dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -131,10 +132,10 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, - 
device="cuda", - dtype=torch.long) + dtype=torch.long, + device=device) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, indices[2] * (vocab_size + extra_vocab_size), @@ -145,7 +146,7 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None @@ -183,7 +184,7 @@ class PunicaWrapper: """ def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: str): + device: Union[torch.device, str]): self._token_lora_indices = torch.empty(max_num_batched_tokens, dtype=torch.long, device=device) @@ -215,6 +216,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self._lora_indices_per_batch = torch.empty(max_batches, dtype=torch.long, device=device) + self.device: torch.device = device self.max_length: int = 0 self.token_nums: int = 0 self.batch_size: int = -1 @@ -263,6 +265,7 @@ def _update_base_metadata( max_loras, vocab_size, extra_vocab_size, + self.device, long_lora_context, ) self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index a780429f413d3..5876494ce2824 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -91,7 +91,7 @@ def replace_submodule(model: nn.Module, module_name: str, return new_module -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: +def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -101,15 +101,18 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: Tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. + is_bias whether the tensor is lora bias. 
""" parts = name.split(".") + if parts[-1] == "weight" and (parts[-2] == "lora_A" + or parts[-2] == "lora_B"): + return ".".join(parts[2:-2]), parts[-2] == "lora_A", False - if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model": - if parts[-1] == "weight": - if parts[-2] == "lora_A" or parts[-2] == "lora_B": - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False + + if parts[-1] == "bias": + return ".".join(parts[2:-2]), False, True raise ValueError(f"{name} is unsupported LoRA weight") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 724c308a07a27..93a5e27621912 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -73,6 +73,7 @@ def create_lora_manager( max_num_batched_tokens=self.max_num_batched_tokens, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, lora_manager_cls=self._manager_cls, ) self._adapter_manager = lora_manager @@ -176,6 +177,7 @@ def create_lora_manager( max_num_seqs=self.max_num_seqs, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, max_num_batched_tokens=self.max_num_batched_tokens, ) self._adapter_manager = lora_manager diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 764f4e9c99df8..24d75f4df4e02 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -55,10 +55,9 @@ def forward_tpu(self, *args, **kwargs): # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) - def forward_gaudi(self, *args, **kwargs): + def forward_hpu(self, *args, **kwargs): # By default, we assume that Gaudi ops are compatible with the # PyTorch-native implementation. - # NOTE(woosuk): This is a placeholder for future extensions. return self.forward_native(*args, **kwargs) def dispatch_forward(self): @@ -76,6 +75,8 @@ def dispatch_forward(self): return self.forward_hip elif current_platform.is_cpu(): return self.forward_cpu + elif current_platform.is_hpu(): + return self.forward_hpu elif current_platform.is_tpu(): return self.forward_tpu elif current_platform.is_xpu(): @@ -103,7 +104,7 @@ def enabled(cls) -> bool: # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. 
@staticmethod - @lru_cache() + @lru_cache def default_on() -> bool: count_none = envs.VLLM_CUSTOM_OPS.count("none") count_all = envs.VLLM_CUSTOM_OPS.count("all") diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index 658a3700f33d6..34d65ed51ef3f 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -9,7 +9,6 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs from vllm.utils import LazyDict @@ -277,25 +276,27 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): }) -def get_act_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: +def get_act_fn(act_fn_name: str) -> nn.Module: """Get an activation function by name.""" act_fn_name = act_fn_name.lower() if act_fn_name not in _ACTIVATION_REGISTRY: raise ValueError( f"Activation function {act_fn_name!r} is not supported.") - act_fn = _ACTIVATION_REGISTRY[act_fn_name] - if (quant_config is not None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be specified for scaled " - "activation functions.") - return ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn + return _ACTIVATION_REGISTRY[act_fn_name] + + +_ACTIVATION_AND_MUL_REGISTRY = LazyDict({ + "gelu": lambda: GeluAndMul(), + "silu": lambda: SiluAndMul(), +}) + + +def get_act_and_mul_fn(act_fn_name: str) -> nn.Module: + """Get an activation-and-mul (i.e. 
SiluAndMul) function by name.""" + act_fn_name = act_fn_name.lower() + if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: + raise ValueError( + f"Activation function {act_fn_name!r} is not supported.") + + return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 93019d0d0abb6..4741d69de11ac 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -8,6 +8,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size, try_get_optimal_moe_config) from vllm.scalar_type import scalar_types +from vllm.utils import direct_register_custom_op def get_scalar_type(num_bits: int, has_zp: bool): @@ -18,7 +19,6 @@ def get_scalar_type(num_bits: int, has_zp: bool): return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128 -@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[]) def single_marlin_moe( hidden_states: torch.Tensor, w: torch.Tensor, @@ -119,8 +119,7 @@ def single_marlin_moe( return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) -@single_marlin_moe.register_fake -def _( +def single_marlin_moe_fake( hidden_states: torch.Tensor, w: torch.Tensor, scales: torch.Tensor, @@ -136,7 +135,14 @@ def _( return torch.empty_like(hidden_states) -@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[]) +direct_register_custom_op( + op_name="single_marlin_moe", + op_func=single_marlin_moe, + mutates_args=[], + fake_impl=single_marlin_moe_fake, +) + + def fused_marlin_moe( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -324,8 +330,7 @@ def fused_marlin_moe( dim=1) -@fused_marlin_moe.register_fake -def _( +def fused_marlin_moe_fake( hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -344,3 +349,11 @@ def _( is_k_full: bool = True, ) -> torch.Tensor: return torch.empty_like(hidden_states) + + +direct_register_custom_op( + op_name="fused_marlin_moe", + op_func=fused_marlin_moe, + mutates_args=[], + fake_impl=fused_marlin_moe_fake, +) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 1cf5c2253ca0b..340da32263c1c 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -12,6 +12,7 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op logger = init_logger(__name__) @@ -466,8 +467,6 @@ def get_config_dtype_str(dtype: torch.dtype, return None -@torch.library.custom_op("vllm::inplace_fused_experts", - mutates_args=["hidden_states"]) def inplace_fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -484,22 +483,29 @@ def inplace_fused_experts(hidden_states: torch.Tensor, a1_scale, a2_scale) -@inplace_fused_experts.register_fake -def _(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None) -> None: +def inplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, 
+ topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> None: pass -@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[]) +direct_register_custom_op( + op_name="inplace_fused_experts", + op_func=inplace_fused_experts, + mutates_args=["hidden_states"], + fake_impl=inplace_fused_experts_fake, +) + + def outplace_fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -517,21 +523,29 @@ def outplace_fused_experts( w2_scale, a1_scale, a2_scale) -@outplace_fused_experts.register_fake -def _(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - use_fp8_w8a8: bool = False, - use_int8_w8a16: bool = False, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: +def outplace_fused_experts_fake( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: return torch.empty_like(hidden_states) +direct_register_custom_op( + op_name="outplace_fused_experts", + op_func=outplace_fused_experts, + mutates_args=[], + fake_impl=outplace_fused_experts_fake, +) + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 30b43f375dd5c..345919c5d1636 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -92,6 +92,25 @@ def forward_cuda( ) return out + def forward_hpu( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + from vllm_hpu_extension.ops import HPUFusedRMSNorm + if HPUFusedRMSNorm is None: + return self.forward_native(x, residual) + if residual is not None: + orig_shape = x.shape + residual += x.view(residual.shape) + # Note: HPUFusedRMSNorm requires 3D tensors as inputs + x = HPUFusedRMSNorm.apply(residual, self.weight, + self.variance_epsilon) + return x.view(orig_shape), residual + + x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon) + return x + def forward_xpu( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py index 288f5a1134b6b..fb76b1b17925e 100644 --- a/vllm/model_executor/layers/logits_processor.py +++ b/vllm/model_executor/layers/logits_processor.py @@ -111,8 +111,14 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios + # (warmup, profile_run) we might not have selected_token_indices, + # so we skip pruning. 
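# Illustrative sketch: the fused-MoE ops above switch from raw
# @torch.library.custom_op decorators to vLLM's direct_register_custom_op
# helper, pairing each real kernel with a shape-only "fake" implementation so
# torch.compile can trace the op without executing it. The tiny op below is
# hypothetical and exists only to show the registration pattern; the keyword
# arguments mirror the calls in the diff above.
from typing import Optional

import torch

from vllm.utils import direct_register_custom_op


def scale_rows(x: torch.Tensor,
               scale: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Real implementation: does the actual math on device tensors.
    return x * scale if scale is not None else x.clone()


def scale_rows_fake(x: torch.Tensor,
                    scale: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Fake (meta) implementation: only the output shape/dtype matter here.
    return torch.empty_like(x)


direct_register_custom_op(
    op_name="scale_rows_example",  # hypothetical op name
    op_func=scale_rows,
    mutates_args=[],               # purely functional, like outplace_fused_experts
    fake_impl=scale_rows_fake,
)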
+ if sampling_metadata.selected_token_indices is not None: + return hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) + else: + return hidden_states def _apply_logits_processors( diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py new file mode 100644 index 0000000000000..8ef0a6cdf2c52 --- /dev/null +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -0,0 +1,217 @@ +import torch +from torch import nn +from torch.nn.parameter import Parameter + +from vllm.attention.backends.abstract import AttentionMetadata +from vllm.distributed.parallel_state import ( + get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) +from vllm.model_executor.custom_op import CustomOp +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( + causal_conv1d_fn, causal_conv1d_update) +from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( + selective_scan_fn, selective_state_update) +from vllm.model_executor.models.mamba_cache import MambaCacheParams +from vllm.model_executor.utils import set_weight_attrs + + +# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer +@CustomOp.register("mamba_mixer") +class MambaMixer(CustomOp): + """ + Compute ∆, A, B, C, and D the state space parameters and compute + the `contextualized_states`. A, D are input independent + (see Mamba paper [1] Section 3.5.2 "Interpretation of A" + for why A isn't selective) ∆, B, C are input-dependent + (this is a key difference between Mamba and the linear time + invariant S4, and is why Mamba is called + **selective** state spaces) + """ + + def __init__(self, + hidden_size: int, + ssm_state_size: int, + conv_kernel_size: int, + intermediate_size: int, + time_step_rank: int, + use_conv_bias: bool, + use_bias: bool, + use_rms_norm: bool, + rms_norm_eps: float = 1e-5, + activation="silu"): + super().__init__() + self.time_step_rank = time_step_rank + self.ssm_state_size = ssm_state_size + self.use_rms_norm = use_rms_norm + self.activation = activation + + self.conv1d = ColumnParallelLinear( + input_size=conv_kernel_size, + output_size=intermediate_size, + bias=use_conv_bias, + ) + # unsqueeze to fit conv1d weights shape into the linear weights shape. + # Can't do this in `weight_loader` since it already exists in + # `ColumnParallelLinear` and `set_weight_attrs` + # doesn't allow to override it + self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) + + self.in_proj = MergedColumnParallelLinear(hidden_size, + [intermediate_size] * 2, + bias=use_bias) + # selective projection used to make dt, B and C input dependent + self.x_proj = RowParallelLinear( + intermediate_size, + time_step_rank + ssm_state_size * 2, + bias=False, + ) + # time step projection (discretization) - + # In the forward we need to apply dt_proj without the bias, + # as the bias is added in the selective scan kernel. 
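# Illustrative sketch (standalone): the comment above explains why dt_proj is
# created with skip_bias_add=True - the projection is applied without its bias
# because the selective-scan kernel adds that bias (together with softplus,
# via time_proj_bias and delta_softplus=True further below) itself. Toy
# tensors; no vLLM kernels involved.
import torch
import torch.nn.functional as F

time_step_rank, intermediate_size, num_tokens = 4, 8, 3
weight = torch.randn(intermediate_size, time_step_rank)
bias = torch.randn(intermediate_size)
time_step = torch.randn(num_tokens, time_step_rank)

# What dt_proj computes under skip_bias_add=True: matmul only, no bias.
dt_no_bias = F.linear(time_step, weight)
# The kernel then applies the bias and softplus in one fused step.
dt_fused = F.softplus(dt_no_bias + bias)

# Mathematically identical to applying the bias inside the projection;
# splitting it out simply lets the kernel fuse the bias-add with the scan.
print(torch.allclose(dt_fused, F.softplus(F.linear(time_step, weight, bias))))  # True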
+ self.dt_proj = ColumnParallelLinear(time_step_rank, + intermediate_size, + bias=True, + skip_bias_add=True) + + def weight_loader(param: Parameter, loaded_weight: torch.Tensor): + tp_rank = get_tensor_model_parallel_rank() + tp_size = get_tensor_model_parallel_world_size() + param.data.copy_( + loaded_weight.data.split(loaded_weight.shape[0] // tp_size, + dim=0)[tp_rank]) + + def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): + weight_loader(param, -torch.exp(loaded_weight.float())) + + tp_size = get_tensor_model_parallel_world_size() + self.A = nn.Parameter( + torch.empty( + intermediate_size // tp_size, + ssm_state_size, + dtype=torch.float32, + )) + self.D = nn.Parameter(torch.ones(intermediate_size // tp_size)) + + set_weight_attrs(self.D, {"weight_loader": weight_loader}) + set_weight_attrs(self.A, {"weight_loader": A_weight_loader}) + + self.out_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=use_bias, + input_is_parallel=True, + ) + + self.dt_layernorm = RMSNorm(time_step_rank, + eps=rms_norm_eps) if use_rms_norm else None + + self.b_layernorm = RMSNorm(ssm_state_size, + eps=rms_norm_eps) if use_rms_norm else None + + self.c_layernorm = RMSNorm(ssm_state_size, + eps=rms_norm_eps) if use_rms_norm else None + + def forward_native(self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + conv_state: torch.Tensor, ssm_state: torch.Tensor): + pass + + def forward_cuda(self, hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + mamba_cache_params: MambaCacheParams): + + # 1. Gated MLP's linear projection + projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) + hidden_states, gate = projected_states.chunk(2, dim=-2) + + # 2. Convolution sequence transformation + conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), + self.conv1d.weight.size(2)) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + hidden_states = causal_conv1d_fn( + hidden_states, + conv_weights, + self.conv1d.bias, + activation=self.activation, + conv_states=mamba_cache_params.conv_state, + has_initial_state=attn_metadata.context_lens_tensor > 0, + cache_indices=mamba_cache_params.state_indices_tensor, + query_start_loc=attn_metadata.query_start_loc) + else: + hidden_states = causal_conv1d_update( + hidden_states.transpose(0, 1), + mamba_cache_params.conv_state, + conv_weights, + self.conv1d.bias, + self.activation, + conv_state_indices=mamba_cache_params.state_indices_tensor) + hidden_states = hidden_states.transpose(0, 1) + + # 3. State Space Model sequence transformation + # 3.a. 
input varying initialization of time_step, B and C + ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] + + time_step, B, C = torch.split( + ssm_parameters, + [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], + dim=-1, + ) + if self.use_rms_norm: + assert self.dt_layernorm is not None + assert self.b_layernorm is not None + assert self.c_layernorm is not None + time_step = self.dt_layernorm(time_step.contiguous()) + B = self.b_layernorm(B.contiguous()) + C = self.c_layernorm(C.contiguous()) + + discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) + # 3.c perform the recurrence y ← SSM(A, B, C)(x) + time_proj_bias = (self.dt_proj.bias.float() if hasattr( + self.dt_proj, "bias") else None) + + if attn_metadata.query_start_loc is not None \ + and attn_metadata.context_lens_tensor is not None: + scan_outputs = selective_scan_fn( + hidden_states, + mamba_cache_params.ssm_state, + discrete_time_step, + self.A, + B.transpose(-2, -1), + C.transpose(-2, -1), + self.D.float(), + gate, + time_proj_bias, + delta_softplus=True, + cache_indices=mamba_cache_params.state_indices_tensor, + has_initial_state=attn_metadata.context_lens_tensor > 0, + query_start_loc=attn_metadata.query_start_loc) + else: + scan_outputs = selective_state_update( + mamba_cache_params.ssm_state, + hidden_states.transpose(0, 1), + discrete_time_step.transpose(0, 1), + self.A, + B, + C, + self.D, + gate.transpose(0, 1), + time_proj_bias, + dt_softplus=True, + state_batch_indices=mamba_cache_params.state_indices_tensor) + scan_outputs = scan_outputs.transpose(0, 1) + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_outputs.transpose(-2, + -1))[0] + return contextualized_states diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 1c9772b41cbef..024badbc17b96 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -16,6 +16,7 @@ class PoolingType(IntEnum): ALL = 1 CLS = 2 STEP = 3 + MEAN = 4 class Pooler(nn.Module): @@ -27,7 +28,7 @@ class Pooler(nn.Module): 3. Returns structured results as `PoolerOutput`. Attributes: - pooling_type: The type of pooling to use (LAST, ALL, CLS). + pooling_type: The type of pooling to use. normalize: Whether to normalize the pooled data. """ @@ -97,6 +98,17 @@ def forward( for prompt_len in prompt_lens: pooled_data.append(hidden_states[offset:offset + prompt_len]) offset += prompt_len + elif self.pooling_type == PoolingType.MEAN: + # Calculate mean pooling + cumsum = torch.cumsum(hidden_states, dim=0) + start_indices = torch.cat([ + torch.tensor([0], device=hidden_states.device), + torch.cumsum(prompt_lens[:-1], dim=0) + ]) + end_indices = torch.cumsum(prompt_lens, dim=0) + pooled_data = ( + cumsum[end_indices - 1] - cumsum[start_indices] + + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) elif self.pooling_type == PoolingType.STEP: if self.returned_token_ids is not None and len( self.returned_token_ids) > 0: diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index c88ca340ebcc5..72c89fe2b0e48 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -213,9 +213,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AQLMLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class AQLMLinearMethod(LinearMethodBase): """Linear method for AQLM. 
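Note on the new PoolingType.MEAN branch added to pooler.py above: it computes each prompt's mean hidden state without a Python loop by differencing a cumulative sum over the flattened token dimension. A minimal self-contained sketch of that trick (toy tensors, not vLLM's API) is:

import torch

# Toy layout mirroring the pooler: prompts are concatenated along dim 0.
hidden_states = torch.randn(7, 4)           # 3 + 4 tokens, hidden size 4
prompt_lens = torch.tensor([3, 4])

cumsum = torch.cumsum(hidden_states, dim=0)
start_indices = torch.cat([
    torch.tensor([0]),
    torch.cumsum(prompt_lens[:-1], dim=0),
])
end_indices = torch.cumsum(prompt_lens, dim=0)

# Per-prompt sum = cumsum[end - 1] - cumsum[start] + hidden_states[start],
# then divide by the prompt length to get the mean.
mean_pooled = (cumsum[end_indices - 1] - cumsum[start_indices] +
               hidden_states[start_indices]) / prompt_lens.unsqueeze(1)

# Reference: explicit per-prompt means.
expected = torch.stack([
    hidden_states[0:3].mean(dim=0),
    hidden_states[3:7].mean(dim=0),
])
assert torch.allclose(mean_pooled, expected, atol=1e-5)

The cumsum difference covers tokens start+1..end-1, so adding back hidden_states[start] recovers the full per-prompt sum in a single vectorized expression.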
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 38dd1f2e10fcd..d83528e9ec79c 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -77,9 +77,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AWQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index 95ec12daeeeb5..4d1a837d11585 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -9,7 +9,9 @@ from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod, set_weight_attrs) +from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.utils import replace_parameter @@ -36,13 +38,18 @@ class AWQMarlinConfig(QuantizationConfig): 8: scalar_types.uint8, } - def __init__(self, weight_bits: int, group_size: int, has_zp: bool, - lm_head_quantized: bool) -> None: + def __init__(self, + weight_bits: int, + group_size: int, + zero_point: bool, + lm_head_quantized: bool, + modules_to_not_convert: Optional[List[str]] = None) -> None: self.pack_factor = 32 // weight_bits # packed into int32 self.group_size = group_size - self.has_zp = has_zp + self.zero_point = zero_point self.lm_head_quantized = lm_head_quantized self.weight_bits = weight_bits + self.modules_to_not_convert = modules_to_not_convert or [] if self.weight_bits not in self.TYPE_MAP: raise ValueError(f"Unsupported num_bits = {self.weight_bits}. 
" @@ -52,13 +59,14 @@ def __init__(self, weight_bits: int, group_size: int, has_zp: bool, verify_marlin_supported(self.quant_type, group_size=self.group_size, - has_zp=self.has_zp) + has_zp=self.zero_point) def __repr__(self) -> str: return (f"AWQMarlinConfig(quant_type={self.quant_type}, " f"group_size={self.group_size}, " - f"has_zp={self.has_zp}, " - f"lm_head_quantized={self.lm_head_quantized})") + f"zero_point={self.zero_point}, " + f"lm_head_quantized={self.lm_head_quantized}, " + f"modules_to_not_convert={self.modules_to_not_convert})") @classmethod def get_name(cls) -> str: @@ -80,10 +88,13 @@ def get_config_filenames(cls) -> List[str]: def from_config(cls, config: Dict[str, Any]) -> "AWQMarlinConfig": weight_bits = cls.get_from_keys(config, ["bits"]) group_size = cls.get_from_keys(config, ["group_size"]) - has_zp = cls.get_from_keys(config, ["zero_point"]) + zero_point = cls.get_from_keys(config, ["zero_point"]) lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"], default=False) - return cls(weight_bits, group_size, has_zp, lm_head_quantized) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) + return cls(weight_bits, group_size, zero_point, lm_head_quantized, + modules_to_not_convert) @classmethod def override_quantization_method(cls, hf_quant_cfg, @@ -109,21 +120,20 @@ def get_quant_method(self, layer: torch.nn.Module, prefix: str) -> Optional["QuantizeMethodBase"]: if (isinstance(layer, LinearBase) or (isinstance(layer, ParallelLMHead) and self.lm_head_quantized)): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() return AWQMarlinLinearMethod(self) elif isinstance(layer, FusedMoE): return AWQMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. quant_method = quant_config.get("quant_method", "").lower() num_bits = quant_config.get("bits") group_size = quant_config.get("group_size") - has_zp = quant_config.get("zero_point") + zero_point = quant_config.get("zero_point") if not current_platform.is_cuda(): return False @@ -132,7 +142,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): return False # If we cannot find the info needed in the config, cannot convert. - if (num_bits is None or group_size is None or has_zp is None): + if (num_bits is None or group_size is None or zero_point is None): return False if num_bits not in cls.TYPE_MAP: @@ -140,7 +150,7 @@ def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): return check_marlin_supported(quant_type=cls.TYPE_MAP[num_bits], group_size=group_size, - has_zp=has_zp) + has_zp=zero_point) class AWQMarlinLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 75fa8249cd3c2..6dfac8aad5358 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -133,11 +133,3 @@ def get_quant_method(self, layer: torch.nn.Module, method. """ raise NotImplementedError - - @abstractmethod - def get_scaled_act_names(self) -> List[str]: - """Returns the activation function names that should be post-scaled. - - For now, this is only used by AWQ. 
- """ - raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 7a039a78f09b8..39965ac9115c2 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -114,12 +114,14 @@ def get_quant_method(self, layer: torch.nn.Module, return BitsAndBytesLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): - return any(module_name in prefix for module_name in llm_int8_skip_modules) + # Split the prefix into its dot-separated components + components = prefix.split('.') + + # Check if any of the skip modules exactly matches any component + return any(module_name in components + for module_name in llm_int8_skip_modules) class BitsAndBytesLinearMethod(LinearMethodBase): @@ -198,8 +200,9 @@ def create_qweight_for_4bit(): qweight = create_qweight_for_8bit() else: qweight = create_qweight_for_4bit() - - layer.register_parameter("qweight", qweight) + # Enable parameters to have the same name as in the BNB + # checkpoint format. + layer.register_parameter("weight", qweight) set_weight_attrs(qweight, extra_weight_attrs) def apply(self, @@ -229,7 +232,7 @@ def _apply_8bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight offsets = qweight.bnb_shard_offsets quant_states = qweight.bnb_quant_state matmul_states = qweight.matmul_state @@ -308,7 +311,7 @@ def _apply_4bit_weight( reshape_after_matmul = True bf_x = x.to(torch.bfloat16) - qweight = layer.qweight + qweight = layer.weight quant_states = qweight.bnb_quant_state offsets = qweight.bnb_shard_offsets diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ecc345f116c37..4f5758a42dbbc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -45,9 +45,6 @@ def __init__(self, def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py new file mode 100644 index 0000000000000..3ff162170f255 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -0,0 +1,184 @@ +from typing import Optional, Type + +import torch +import triton +import triton.language as tl + + +def is_weak_contiguous(x: torch.Tensor): + strides = x.stride() + sizes = x.shape + is_not_transpose = strides[0] == 1 and (strides[1] >= max(1, sizes[0])) + is_transpose = strides[1] == 1 and (strides[0] >= max(1, sizes[1])) + return is_transpose or is_not_transpose + + +@triton.jit +def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr, + M, N, K, stride_am, stride_ak, stride_bk, stride_bn, + stride_cm, stride_cn, ACCUMULATOR_DTYPE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_SCALE_A: tl.constexpr, + 
BLOCK_SIZE_SCALE_B: tl.constexpr): + pid = tl.program_id(axis=0) + + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + accumulator_dtype = ACCUMULATOR_DTYPE + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), + dtype=accumulator_dtype) + + # NOTE: Some tensor inputs are so large, they will cause int32 overflow + # so it is necessary to use tl.int64 for all the offsets, else SEGV will + # eventually occur. + + # Offsets and masks. + offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + masks_am = offsets_am < M + + offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + masks_bn = offsets_bn < N + + offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64) + offsets_a = (stride_am * offsets_am[:, None] + + stride_ak * offsets_k[None, :]) + offsets_b = (stride_bk * offsets_k[:, None] + + stride_bn * offsets_bn[None, :]) + + # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create + # appropriate offsets and masks for each case. Same goes for + # BLOCK_SIZE_SCALE_B. + offsets_scale_am = (tl.arange(0, BLOCK_SIZE_SCALE_A) + + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M) + masks_scale_am = offsets_scale_am < M + + offsets_scale_bn = (tl.arange(0, BLOCK_SIZE_SCALE_B) + + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N) + masks_scale_bn = offsets_scale_bn < N + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + + scale_a_ptrs = scale_a_ptr + offsets_scale_am + scale_b_ptrs = scale_b_ptr + offsets_scale_bn + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + masks_k = offsets_k < K + masks_a = masks_am[:, None] & masks_k[None, :] + a = tl.load(a_ptrs, mask=masks_a) + + masks_b = masks_k[:, None] & masks_bn[None, :] + b = tl.load(b_ptrs, mask=masks_b) + + # Accumulate results. + accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype) + + offsets_k += BLOCK_SIZE_K + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + # Apply scale at end. + masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None] + scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a) + # Need to broadcast to the appropriate size, if scale_a is already + # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes + # for scale_b below. + scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1)) + accumulator = scale_a * accumulator.to(tl.float32) + + masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :] + scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b) + scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1)) + accumulator = scale_b.T * accumulator.to(tl.float32) + + # Convert to output format. + c = accumulator.to(c_ptr.type.element_ty) + + # Add bias, it's already in output format, so add it after conversion. 
+ if bias_ptr: + offsets_bias = offsets_bn + bias_ptrs = bias_ptr + offsets_bias + bias_mask = offsets_bias < N + bias = tl.load(bias_ptrs, bias_mask) + c += bias + + # Save output + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + offs_cm = offs_cm.to(tl.int64) + offs_cn = offs_cn.to(tl.int64) + c_ptrs = (c_ptr + stride_cm * offs_cm[:, None] + + stride_cn * offs_cn[None, :]) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + tl.store(c_ptrs, c, mask=c_mask) + + +# input - [M, K] +# weight - [K, N] +def triton_scaled_mm(input: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None, + block_size_m: int = 32, + block_size_n: int = 32, + block_size_k: int = 32) -> torch.Tensor: + M, K = input.shape + N = weight.shape[1] + + assert N > 0 and K > 0 and M > 0 + assert weight.shape[0] == K + assert input.dtype == weight.dtype + assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() + assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size( + [M, 1]) + assert scale_b.shape == torch.Size([1, 1]) or scale_b.shape == torch.Size( + [N, 1]) + assert out_dtype.is_floating_point + assert bias is None or bias.is_floating_point() + assert is_weak_contiguous(input) + assert is_weak_contiguous(weight) + + grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv( + N, META['BLOCK_SIZE_N']), ) + + result = torch.empty((M, N), dtype=out_dtype, device=input.device) + + has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1 + + block_size_sa = 1 if has_scalar(scale_a) else block_size_m + block_size_sb = 1 if has_scalar(scale_b) else block_size_n + + accumulator_dtype = tl.float32 if input.is_floating_point() else tl.int32 + + # A = input, B = weight, C = result + # A = M x K, B = K x N, C = M x N + scaled_mm_kernel[grid](input, + weight, + scale_a, + scale_b, + result, + bias, + M, + N, + K, + input.stride(0), + input.stride(1), + weight.stride(0), + weight.stride(1), + result.stride(0), + result.stride(1), + accumulator_dtype, + BLOCK_SIZE_M=block_size_m, + BLOCK_SIZE_N=block_size_n, + BLOCK_SIZE_K=block_size_k, + BLOCK_SIZE_SCALE_A=block_size_sa, + BLOCK_SIZE_SCALE_B=block_size_sb) + + return result.to(out_dtype) diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 29484801dc380..36598b3e2990f 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -50,9 +50,6 @@ def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": def get_linear_method(self) -> "DeepSpeedFPLinearMethod": return DeepSpeedFPLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 116a4ea0aed89..97297970d9317 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -45,9 +45,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ExpertsInt8MoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExpertsInt8MoEMethod(FusedMoEMethodBase): 
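The new triton_scaled_mm.py above adds a Triton fallback kernel for w8a8 scaled matmul with per-tensor or per-row/per-channel scales. A hedged usage sketch (assumes a CUDA device with Triton installed; shapes and values are made up, only the call signature comes from the file added in this diff):

import torch
from vllm.model_executor.layers.quantization.compressed_tensors.triton_scaled_mm import (
    triton_scaled_mm)

M, K, N = 16, 64, 32
a = torch.randint(-8, 8, (M, K), dtype=torch.int8, device="cuda")
# Column-major ("weak contiguous") weight, produced by transposing an [N, K] tensor.
b = torch.randint(-8, 8, (N, K), dtype=torch.int8, device="cuda").t()

scale_a = torch.rand(M, 1, dtype=torch.float32, device="cuda")  # per-token scale
scale_b = torch.rand(N, 1, dtype=torch.float32, device="cuda")  # per-channel scale

out = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.float16)
print(out.shape)  # torch.Size([16, 32])

With integer inputs the kernel accumulates in int32 and applies both scales in float32 before casting to out_dtype, matching the epilogue order in the kernel above.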
diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 825d01d1b3551..7b71e13b50ccc 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -64,9 +64,6 @@ def get_quant_method(self, layer: torch.nn.Module, return FBGEMMFp8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class FBGEMMFp8LinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d34579b7099bb..978e727bc7cb3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -92,9 +92,6 @@ def get_quant_method(self, layer: torch.nn.Module, return Fp8KVCacheMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index d73b9f6d92832..24138662eb25c 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -48,9 +48,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GGUFEmbeddingMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 1cfadb4f42ca8..0aa605e62454e 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -80,9 +80,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExllamaState(Enum): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index b97dd108d6785..1f72e3afbbce5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -125,9 +125,6 @@ def get_quant_method( return GPTQMarlinMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 0971aedba4c3c..07552c0f13348 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -127,9 +127,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQMarlin24LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class GPTQMarlin24LinearMethod(LinearMethodBase): """Linear method for Marlin24. 
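Related note on the bitsandbytes change earlier in this patch: is_layer_skipped_bnb now splits the prefix on dots and requires an exact component match, so a skip entry no longer matches arbitrary substrings of a module path. A small illustration of the behavior (module names are hypothetical):

from typing import List

def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]) -> bool:
    # Exact match against the dot-separated components of the prefix.
    components = prefix.split('.')
    return any(name in components for name in llm_int8_skip_modules)

skip = ["lm_head"]
print(is_layer_skipped_bnb("lm_head", skip))                      # True
print(is_layer_skipped_bnb("model.lm_head", skip))                # True
print(is_layer_skipped_bnb("model.layers.0.lm_head_gate", skip))  # False: substring only

Under the previous substring check the third case would have been (incorrectly) skipped.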
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e54052632e468..330c2ad195d78 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -54,7 +54,7 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16] + return [torch.bfloat16, torch.float16] @classmethod def get_min_capability(cls) -> int: @@ -93,12 +93,6 @@ def get_quant_method(self, layer: torch.nn.Module, return self.quant_method(self) return None - def get_scaled_act_names(self) -> List[str]: - if self.method == "awq": - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - else: - return [] - class IPEXAWQLinearMethod(AWQLinearMethod): """AWQ linear method using IPEX for the CPU backend. diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 8f1b5370b4538..20212e672eab0 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -110,9 +110,6 @@ def get_quant_method(self, layer: torch.nn.Module, return MarlinLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class MarlinLinearMethod(LinearMethodBase): """Linear method for Marlin. diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index dc5f47eb9b0fb..a1b3eeb43cbee 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -68,9 +68,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ModelOptFp8KVCacheMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): """ @@ -141,8 +138,11 @@ def create_weights( layer.register_parameter("input_scale", scale) def process_weights_after_loading(self, layer: Module) -> None: - max_w_scale, weight = requantize_with_max_scale( - layer.weight, layer.weight_scale, layer.logical_widths) + weight = layer.weight + max_w_scale = layer.weight_scale.max() + if not (layer.weight_scale == layer.weight_scale[0]).all(): + max_w_scale, weight = requantize_with_max_scale( + layer.weight, layer.weight_scale, layer.logical_widths) layer.weight = Parameter(weight.t(), requires_grad=False) layer.weight_scale = Parameter(max_w_scale, requires_grad=False) layer.input_scale = Parameter(layer.input_scale.max(), diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 2624981f6a614..2d5cdfa165775 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -57,9 +57,6 @@ def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: "Neuron Quantization is only supported through" " transformers_neuronx.") - def get_scaled_act_names(self) -> List[str]: - return [] - def get_quantization_config(self): from transformers_neuronx.config import QuantizationConfig return QuantizationConfig(quant_dtype=self.quant_dtype, diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 5bc3737520865..2ccd082029610 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -112,9 +112,6 @@ def 
get_quant_method(self, layer: torch.nn.Module, return QQQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class QQQLinearMethod(LinearMethodBase): """Linear method for QQQ. diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index be8235b468f68..605c3a38644ac 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -50,9 +50,6 @@ def get_quant_method(self, layer: Module, return TPUInt8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class TPUInt8LinearMethod(LinearMethodBase): """Int8 Linear method for TPU Quant. """ diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 445117ac99a34..ec73533126ab6 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -7,8 +7,7 @@ # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale -TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \ - if current_platform.is_rocm() else None +TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) def cutlass_fp8_supported() -> bool: @@ -166,8 +165,7 @@ def apply_fp8_linear( # Making sure the dummy tensor is on the same device as the weight global TORCH_DEVICE_IDENTITY - if (TORCH_DEVICE_IDENTITY is not None - and TORCH_DEVICE_IDENTITY.device != weight.device): + if TORCH_DEVICE_IDENTITY.device != weight.device: TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device) # GEMM diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index 8cd938fc85fb2..aae806f6af323 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py @@ -41,6 +40,7 @@ from torch.nn.init import trunc_normal_ from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.quantization import QuantizationConfig DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) @@ -154,15 +154,15 @@ class BaseResampler(nn.Module): A tensor with the shape of (grid_size**2, embed_dim) """ - def __init__( - self, - num_queries: int, - embed_dim: int, - num_heads: int, - kv_dim: Optional[int] = None, - norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - do_post_projection: bool = True, - ) -> None: + def __init__(self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + do_post_projection: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__() self.num_queries = num_queries @@ -172,7 +172,11 @@ def __init__( self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) trunc_normal_(self.query, std=0.02) if kv_dim is not None and kv_dim != embed_dim: - self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False) + self.kv_proj = ReplicatedLinear(kv_dim, + embed_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.kv_proj") else: # Maintain the same return value with ReplicatedLinear.forward 
self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa @@ -209,22 +213,24 @@ class Resampler2(BaseResampler): present in minicpmv2.0, but not qwen-vl. """ - def __init__( - self, - grid_size: int, - embed_dim: int, - num_heads: int, - kv_dim: Optional[int] = None, - norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - adaptive: bool = False, - do_post_projection: bool = True, - ) -> None: + def __init__(self, + grid_size: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + adaptive: bool = False, + do_post_projection: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: super().__init__(grid_size**2, embed_dim, num_heads, kv_dim, norm_layer, - do_post_projection=do_post_projection) + do_post_projection=do_post_projection, + quant_config=quant_config, + prefix=prefix) self.adaptive = adaptive pos_embed_arr = get_2d_sincos_pos_embed(embed_dim, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 2158ad3339673..63ceec63e8317 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.33.2/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -195,6 +194,61 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key + def forward_hpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor, + offsets: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + from habana_frameworks.torch.hpex.kernels import ( + RotaryPosEmbeddingMode, apply_rotary_pos_emb) + positions = positions.flatten() + if offsets is not None: + positions = positions + offsets + num_tokens = positions.shape[0] + cos_sin = self.cos_sin_cache.index_select(0, positions).view( + num_tokens, 1, -1) + cos, sin = cos_sin.chunk(2, dim=-1) + # HPU RoPE kernel requires hidden dimension for cos and sin to be equal + # to query hidden dimension, so the original tensors need to be + # expanded + # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE + # and expansion of cos/sin tensors via concatenation + # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE + # and expansion of cos/sin tensors via repeat_interleave + rope_mode: RotaryPosEmbeddingMode + if self.is_neox_style: + rope_mode = RotaryPosEmbeddingMode.BLOCKWISE + cos = torch.cat((cos, cos), dim=-1) + sin = torch.cat((sin, sin), dim=-1) + else: + rope_mode = RotaryPosEmbeddingMode.PAIRWISE + sin = torch.repeat_interleave(sin, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + cos = torch.repeat_interleave(cos, + 2, + dim=-1, + output_size=cos_sin.shape[-1]) + + query_shape = query.shape + query = query.view(num_tokens, -1, self.head_size) + query_rot = query[..., :self.rotary_dim] + query_pass = query[..., self.rotary_dim:] + query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0, + rope_mode) + query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) + + key_shape = key.shape + key = key.view(num_tokens, -1, self.head_size) + key_rot = key[..., :self.rotary_dim] + key_pass = key[..., self.rotary_dim:] + key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0, rope_mode) + key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) + return query, key + def extra_repr(self) -> str: s 
= f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index f86c6ec362ebe..c10efefea5471 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -30,6 +30,15 @@ else: flashinfer_top_k_top_p_sampling = None + +def get_sampler() -> torch.nn.Module: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.sample.sampler import Sampler as V1Sampler + return V1Sampler() + return Sampler() + + # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index b448557af13b3..52771f50a7a23 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -12,6 +12,7 @@ QuantizationConfig, QuantizeMethodBase, method_has_implemented_embedding) from vllm.model_executor.parameter import BasevLLMParameter from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform DEFAULT_VOCAB_PADDING_SIZE = 64 @@ -382,8 +383,20 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): # Copy the data. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) - param[:loaded_weight.shape[0]].data.copy_(loaded_weight) - param[loaded_weight.shape[0]:].data.fill_(0) + + if current_platform.is_hpu(): + # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here, + # so we're using a workaround. Remove this when fixed in + # HPU PT bridge. + padded_weight = torch.cat([ + loaded_weight, + torch.zeros(param.shape[0] - loaded_weight.shape[0], + *loaded_weight.shape[1:]) + ]) + param.data.copy_(padded_weight) + else: + param[:loaded_weight.shape[0]].data.copy_(loaded_weight) + param[loaded_weight.shape[0]:].data.fill_(0) def forward(self, input_): if self.tp_size > 1: diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index d1ec171c9ec2a..12468997e4653 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -1,27 +1,15 @@ -from typing import Optional - from torch import nn -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.model_executor.model_loader.loader import (BaseModelLoader, get_model_loader) from vllm.model_executor.model_loader.utils import ( get_architecture_class_name, get_model_architecture) -def get_model(*, model_config: ModelConfig, load_config: LoadConfig, - device_config: DeviceConfig, parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig) -> nn.Module: - loader = get_model_loader(load_config) - return loader.load_model(model_config=model_config, - device_config=device_config, - lora_config=lora_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - cache_config=cache_config) +def get_model(*, vllm_config: VllmConfig) -> nn.Module: + loader = get_model_loader(vllm_config.load_config) + return loader.load_model(vllm_config=vllm_config) __all__ = [ diff --git a/vllm/model_executor/model_loader/loader.py 
b/vllm/model_executor/model_loader/loader.py index 79703bb7ded7a..715e6c11f86ce 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -9,8 +9,7 @@ import os from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import (Any, Dict, Generator, Iterable, List, Optional, Tuple, - Type, cast) +from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast import gguf import huggingface_hub @@ -18,18 +17,17 @@ import torch from huggingface_hub import HfApi, hf_hub_download from torch import nn -from transformers import AutoModelForCausalLM, PretrainedConfig +from transformers import AutoModelForCausalLM from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat, - LoRAConfig, ModelConfig, MultiModalConfig, - ParallelConfig, PoolerConfig, SchedulerConfig) +from vllm.config import (LoadConfig, LoadFormat, ModelConfig, ParallelConfig, + VllmConfig) from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) +from vllm.model_executor.layers.linear import (ReplicatedLinear, + RowParallelLinear) from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) @@ -38,11 +36,9 @@ from vllm.model_executor.model_loader.weight_utils import ( download_safetensors_index_file_from_hf, download_weights_from_hf, filter_duplicate_safetensors_files, filter_files_not_needed_for_inference, - get_gguf_extra_tensor_names, get_quant_config, gguf_quant_weights_iterator, + get_gguf_extra_tensor_names, gguf_quant_weights_iterator, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) -from vllm.model_executor.models import (has_inner_state, supports_lora, - supports_multimodal) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -92,105 +88,11 @@ def device_loading_context(module: torch.nn.Module, logger = init_logger(__name__) -def _get_quantization_config( - model_config: ModelConfig, - load_config: LoadConfig) -> Optional[QuantizationConfig]: - """Get the quantization config.""" - if model_config.quantization is not None: - quant_config = get_quant_config(model_config, load_config) - capability_tuple = current_platform.get_device_capability() - - if capability_tuple is not None: - capability = capability_tuple.to_int() - if capability < quant_config.get_min_capability(): - raise ValueError( - f"The quantization method {model_config.quantization} " - "is not supported for the current GPU. " - f"Minimum capability: {quant_config.get_min_capability()}. " - f"Current capability: {capability}.") - supported_dtypes = quant_config.get_supported_act_dtypes() - if model_config.dtype not in supported_dtypes: - raise ValueError( - f"{model_config.dtype} is not supported for quantization " - f"method {model_config.quantization}. 
Supported dtypes: " - f"{supported_dtypes}") - return quant_config - return None - - -def _get_model_initialization_kwargs( - model_class: Type[nn.Module], - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> Dict[str, Any]: - """Get extra kwargs for model initialization.""" - extra_kwargs: Dict[str, Any] = {} - - if supports_lora(model_class): - # lora_config=None is used to disable LoRA - extra_kwargs["lora_config"] = lora_config - elif lora_config: - raise ValueError( - f"Model {model_class.__name__} does not support LoRA, " - "but LoRA is enabled. Support for this model may " - "be added in the future. If this is important to you, " - "please open an issue on github.") - - if supports_multimodal(model_class): - assert multimodal_config is not None - - extra_kwargs["multimodal_config"] = multimodal_config - - if has_inner_state(model_class) and scheduler_config: - extra_kwargs["scheduler_config"] = scheduler_config - if pooler_config: - extra_kwargs["pooler_config"] = pooler_config - return extra_kwargs - - -def build_model(model_class: Type[nn.Module], - hf_config: PretrainedConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - *, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig], - prefix: Optional[str] = None, - pooler_config: Optional[PoolerConfig] = None) -> nn.Module: - extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, - multimodal_config, - scheduler_config, - pooler_config) - if prefix: - extra_kwargs["prefix"] = prefix - - return model_class(config=hf_config, - cache_config=cache_config, - quant_config=quant_config, - **extra_kwargs) - - -def _initialize_model( - model_config: ModelConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, - scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module: +def _initialize_model(vllm_config: VllmConfig) -> nn.Module: """Initialize a model with the given configurations.""" + model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) - - return build_model( - model_class, - model_config.hf_config, - cache_config=cache_config, - quant_config=_get_quantization_config(model_config, load_config), - lora_config=lora_config, - multimodal_config=model_config.multimodal_config, - scheduler_config=scheduler_config, - pooler_config=model_config.pooler_config, - ) + return model_class(vllm_config=vllm_config) class BaseModelLoader(ABC): @@ -205,12 +107,7 @@ def download_model(self, model_config: ModelConfig) -> None: raise NotImplementedError @abstractmethod - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, *, vllm_config: VllmConfig) -> nn.Module: """Load a model with the given configurations.""" raise NotImplementedError @@ -396,18 +293,14 @@ def download_model(self, model_config: ModelConfig) -> None: model_config.revision, fall_back_to_pt=True) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: 
+ def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config + target_device = torch.device(device_config.device) with set_default_torch_dtype(model_config.dtype): with target_device: - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, - scheduler_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights(self._get_all_weights(model_config, model)) @@ -436,17 +329,12 @@ def __init__(self, load_config: LoadConfig): def download_model(self, model_config: ModelConfig) -> None: pass # Nothing to download - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config, - scheduler_config) + model = _initialize_model(vllm_config=vllm_config) # NOTE(woosuk): For accurate performance evaluation, we assign # random values to the weights. initialize_dummy_weights(model) @@ -488,10 +376,7 @@ def _get_weights_iterator( def _load_model_serialized_cpu( self, - model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, + vllm_config: VllmConfig, ) -> nn.Module: """Load a serialized model with tensorizer to the CPU. @@ -500,42 +385,39 @@ def _load_model_serialized_cpu( default HuggingFace loading, but will be slower than loading a vLLM-tensorized model. """ + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights(self._get_weights_iterator()) return model.eval() def _load_model_serialized( self, - model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - cache_config: CacheConfig, + vllm_config: VllmConfig, ) -> nn.Module: """Load a serialized model with tensorizer. Expects a vLLM-tensorized model. 
See the examples/tensorize_vllm_model.py example script for serializing vLLM models.""" + + device_config = vllm_config.device_config + model_config = vllm_config.model_config + with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model_class = get_model_architecture(model_config)[0] - quant_config = _get_quantization_config( - model_config, self.load_config) - extra_kwargs = _get_model_initialization_kwargs( - model_class, lora_config, model_config.multimodal_config) - extra_kwargs["quant_config"] = quant_config - extra_kwargs["cache_config"] = cache_config tensorizer_config = copy.copy(self.tensorizer_config) tensorizer_config.model_class = model_class tensorizer_config.hf_config = model_config.hf_config tensorizer_config.dtype = model_config.dtype - model = load_with_tensorizer(tensorizer_config, **extra_kwargs) + model = load_with_tensorizer(tensorizer_config, + vllm_config=vllm_config) return model.eval() def download_model(self, model_config: ModelConfig) -> None: @@ -544,12 +426,9 @@ def download_model(self, model_config: ModelConfig) -> None: with self.tensorizer_config.open_stream(): pass - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + model_config = vllm_config.model_config + parallel_config = vllm_config.parallel_config self._verify_config(model_config, parallel_config) if parallel_config.tensor_parallel_size > 1: @@ -559,10 +438,8 @@ def load_model(self, *, model_config: ModelConfig, % get_tensor_model_parallel_rank() if is_vllm_tensorized(self.tensorizer_config): - return self._load_model_serialized(model_config, device_config, - lora_config, cache_config) - return self._load_model_serialized_cpu(model_config, device_config, - lora_config, cache_config) + return self._load_model_serialized(vllm_config=vllm_config) + return self._load_model_serialized_cpu(vllm_config=vllm_config) @staticmethod def save_model( @@ -648,12 +525,9 @@ def _prepare_weights(self, model_name_or_path: str, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config from safetensors.torch import safe_open from vllm.distributed import get_tensor_model_parallel_rank @@ -663,8 +537,7 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) if quant_method is not None: @@ -769,6 +642,10 @@ class BitsAndBytesModelLoader(BaseModelLoader): def __init__(self, load_config: LoadConfig): super().__init__(load_config) + # Save the module names without sharding. + self.unsharded_weights_modules: List[str] = [] + # Save the module names that are sharded by column. 
+ self.column_sharded_weights_modules: List[str] = [] # we don't need to quantize the whole model, only the target modules # that are specified in the adapter config file. If the adapter config # file is not provided, we will quantize the default modules. @@ -783,7 +660,7 @@ def __init__(self, load_config: LoadConfig): config_file_path = self._get_config_file(qlora_adapter) - with open(config_file_path, "r") as f: + with open(config_file_path) as f: config = json.load(f) self.target_modules = config["target_modules"] @@ -929,7 +806,7 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if not weight_name.lower().endswith(".scb"): continue - weight_key = weight_name.lower().replace(".scb", ".qweight") + weight_key = weight_name.lower().replace(".scb", ".weight") quant_state_dict[weight_key] = weight_tensor for weight_name, weight_tensor in self._hf_weight_iter( @@ -938,11 +815,9 @@ def _quantized_8bit_generator(self, hf_weights_files, use_safetensors, if self._is_8bit_weight_name(weight_name): continue - qweight_name = weight_name.replace(".weight", ".qweight") - - if qweight_name in quant_state_dict: + if weight_name in quant_state_dict: set_weight_attrs(weight_tensor, {"load_in_8bit": True}) - yield qweight_name, weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -987,9 +862,8 @@ def _parse_quant_state(param_name: str, (f"{weight_name}.quant_state.bitsandbytes__fp4" \ in temp_state_dict): quant_state = _parse_quant_state(weight_name, temp_state_dict) - weight_name = weight_name.replace(".weight", ".qweight") quant_state_dict[weight_name] = quant_state - yield weight_name.replace(".weight", ".qweight"), weight_tensor + yield weight_name, weight_tensor else: yield weight_name, weight_tensor @@ -1004,17 +878,21 @@ def _unquantized_generator(self, hf_weights_files, use_safetensors, if any(target_module in weight_name for target_module in self.target_modules) and weight_name.endswith(".weight"): - weight_name = weight_name.replace(".weight", ".qweight") - - if any(module in weight_name - for module in self.column_parallel_weights_modules): - + # Without sharding + if any( + weight_name.startswith(module) + for module in self.unsharded_weights_modules): + weight_sub_tensor = weight_tensor + # Shard by column + elif any( + weight_name.startswith(module) + for module in self.column_sharded_weights_modules): total_size = weight_tensor.size(-1) start_index = total_size // tp_size * tp_rank end_index = total_size // tp_size * (tp_rank + 1) weight_sub_tensor = weight_tensor[..., start_index:end_index] - + # Shard by row else: total_size = weight_tensor.size(0) start_index = total_size // tp_size * tp_rank @@ -1063,11 +941,16 @@ def _load_weights(self, model_config: ModelConfig, else: self.target_modules = self.default_target_modules - if hasattr(model, 'column_parallel_weights_modules'): - self.column_parallel_weights_modules = \ - model.column_parallel_weights_modules - else: - self.column_parallel_weights_modules = [] + for name, module in model.named_modules(): + # Some modules like `ReplicatedLinear` should not have their weights + # sharded. The reason for implementing it this way is to avoid new + # static variable in the model implementation. 
+ if isinstance(module, (ReplicatedLinear, )): + self.unsharded_weights_modules.append(name) + # In TP, these weights are partitioned along the column + # dimension (dim=-1) + elif isinstance(module, (RowParallelLinear, )): + self.column_sharded_weights_modules.append(name) self.model_type = type(model).__name__ @@ -1108,14 +991,26 @@ def _load_weights(self, model_config: ModelConfig, param_dict = dict(model.named_parameters()) stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + # TODO: Change this lazy import to normal import + # after the checks are updated to run on a new version + from vllm.model_executor.models.utils import is_pp_missing_parameter for quant_param_name in quant_state_dict: + if is_pp_missing_parameter(quant_param_name, model): + continue + non_stacked_param_name = quant_param_name shard_index = 0 for shard_name, ( weight_name, index ) in model.bitsandbytes_stacked_params_mapping.items(): - if shard_name in quant_param_name: + + shard_pos = quant_param_name.find(shard_name) + # Some models, such as MiniCPM V2.5/2.6, contain both + # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' + # from being incorrectly identified as being present in + # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight + if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": shard_index = index quant_param_name = quant_param_name.replace( shard_name, weight_name) @@ -1157,16 +1052,12 @@ def _load_weights(self, model_config: ModelConfig, def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) self._load_weights(model_config, model) @@ -1235,13 +1126,9 @@ def _get_weights_iterator( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model) - def load_model(self, *, model_config: ModelConfig, - device_config: DeviceConfig, - lora_config: Optional[LoRAConfig], - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - cache_config: CacheConfig) -> nn.Module: - + def load_model(self, vllm_config: VllmConfig) -> nn.Module: + device_config = vllm_config.device_config + model_config = vllm_config.model_config local_model_path = self._prepare_weights(model_config.model) gguf_weights_map = self._get_gguf_weights_map(model_config) # we can only know if tie word embeddings after mapping weights @@ -1251,8 +1138,7 @@ def load_model(self, *, model_config: ModelConfig, with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): - model = _initialize_model(model_config, self.load_config, - lora_config, cache_config) + model = _initialize_model(vllm_config=vllm_config) model.load_weights( self._get_weights_iterator(local_model_path, gguf_weights_map)) return model diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 573f2a04895d9..e6299295c85a2 100644 --- 
a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -190,7 +190,7 @@ def get_model( kv_cache_dtype: ov.Type, **kwargs, ) -> torch.nn.Module: - lora_config = kwargs.get("lora_config", None) + lora_config = kwargs.get("lora_config") ov_core = kwargs.get("ov_core") if lora_config: raise ValueError( diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 36f33d6d139ee..c48b287ed181a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -17,8 +17,6 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.utils import FlexibleArgumentParser @@ -268,8 +266,7 @@ class TensorizerAgent: in vllm/model_executor/model_loader/weight_utils.py """ - def __init__(self, tensorizer_config: TensorizerConfig, - quant_config: QuantizationConfig, **extra_kwargs): + def __init__(self, tensorizer_config: TensorizerConfig, vllm_config): if tensorizer_error_msg is not None: raise ImportError( "Tensorizer is not installed. Please install tensorizer " @@ -279,11 +276,7 @@ def __init__(self, tensorizer_config: TensorizerConfig, self.tensorizer_config = tensorizer_config self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) - self.extra_kwargs = extra_kwargs - if extra_kwargs.get("quant_config", None) is not None: - self.quant_config = extra_kwargs["quant_config"] - else: - self.quant_config = quant_config + self.vllm_config = vllm_config self.model = self._init_model() def _init_model(self): @@ -293,9 +286,7 @@ def _init_model(self): assert self.tensorizer_config.model_class is not None with no_init_or_tensor(): return self.tensorizer_config.model_class( - config=model_args, - quant_config=self.quant_config, - **self.extra_kwargs) + vllm_config=self.vllm_config, ) def _resize_lora_embeddings(self): """Modify LoRA embedding layers to use bigger tensors @@ -380,8 +371,7 @@ def tensorizer_weights_iterator( stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params) with TensorDeserializer(stream, **deserializer_args, device="cpu") as state: - for name, param in state.items(): - yield name, param + yield from state.items() del state diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0c51314bc90df..9488d54edf365 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -188,7 +188,7 @@ def get_quant_config(model_config: ModelConfig, f"{quant_config_files}") quant_config_file = quant_config_files[0] - with open(quant_config_file, "r") as f: + with open(quant_config_file) as f: config = json.load(f) if model_config.quantization == "bitsandbytes": @@ -306,7 +306,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], # Iterate through the weight_map (weight_name: safetensors files) # to identify weights that we should use. 
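For the safetensors case, the index file maps each weight name to the shard file that stores it, and only files referenced by that map should be kept. A minimal sketch of that filtering step, assuming the standard Hugging Face model.safetensors.index.json layout and simplified to bare file names:

import json

def shard_files_in_index(index_file_name: str) -> set:
    # "weight_map" maps weight names to the .safetensors file holding them;
    # the set of its values is the set of shard files actually needed.
    with open(index_file_name) as f:
        weight_map = json.load(f)["weight_map"]
    return set(weight_map.values())

# Duplicate or stale shards on disk can then be dropped, e.g.:
# hf_weights_files = [f for f in hf_weights_files
#                     if os.path.basename(f) in shard_files_in_index(index)]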
- with open(index_file_name, "r") as f: + with open(index_file_name) as f: weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: @@ -382,7 +382,7 @@ def np_cache_weights_iterator( with open(weight_names_file, "w") as f: json.dump(weight_names, f) - with open(weight_names_file, "r") as f: + with open(weight_names_file) as f: weight_names = json.load(f) for name in weight_names: @@ -423,8 +423,7 @@ def pt_weights_iterator( bar_format=_BAR_FORMAT, ): state = torch.load(bin_file, map_location="cpu") - for name, param in state.items(): - yield name, param + yield from state.items() del state torch.cuda.empty_cache() diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index fd29d4ccc59d8..7d4b9654b54ab 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -6,7 +6,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -34,7 +34,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -48,7 +49,7 @@ def __init__(self, is_residual_mlp: bool = False, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMLP, self).__init__() + super().__init__() self.hidden_size = config.hidden_size self.expert_id = expert_id self.layer_id = layer_id @@ -89,7 +90,7 @@ def __init__(self, params_dtype: Optional[torch.dtype] = None, quant_config: Optional[QuantizationConfig] = None, reduce_results: bool = True): - super(ArcticMoE, self).__init__() + super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() self.hidden_size = config.hidden_size @@ -364,14 +365,13 @@ def forward( @support_torch_compile class ArcticModel(nn.Module): - def __init__( - self, - config: ArcticConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -415,14 +415,13 @@ def forward( class ArcticForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: ArcticConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - **kwargs) -> None: + def __init__(self, vllm_config: VllmConfig, 
prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config - self.model = ArcticModel(config, cache_config, quant_config) + self.model = ArcticModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.vocab_size, @@ -436,7 +435,7 @@ def __init__(self, self.unpadded_vocab_size = config.vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index f2cfdf8ffd30a..aabbd31192a40 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. # # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX @@ -27,7 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -38,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -254,13 +253,18 @@ def forward( @support_torch_compile class BaiChuanModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - position_embedding: str, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + position_embedding: str = "ROPE", + ) -> None: super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -333,27 +337,29 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PretrainedConfig, - position_embedding: str, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + *, + vllm_config: VllmConfig, + prefix: str = "", + position_embedding: str = "ROPE", ): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = BaiChuanModel(config, position_embedding, cache_config, - quant_config) + self.model = BaiChuanModel(vllm_config=vllm_config, + prefix=prefix, + 
position_embedding=position_embedding) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -438,19 +444,16 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has a lower case 'c'. """ - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config if config.hidden_size == 4096: # baichuan2 7b - super().__init__(config, "ROPE", cache_config, quant_config, - lora_config) + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") else: # baichuan 13b, baichuan2 13b - super().__init__(config, "ALIBI", cache_config, quant_config, - lora_config) + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ALIBI") class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): @@ -458,12 +461,7 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has an upper case 'C'. """ - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): - super().__init__(config, "ROPE", cache_config, quant_config, - lora_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index cbdacf779b089..a50a5a5b018e1 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -25,7 +25,7 @@ from transformers.utils import logging from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -34,13 +34,15 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .utils import maybe_prefix + logger = logging.get_logger(__name__) @@ -393,8 +395,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) ffn_hidden_size = self.embed_dim 
ffn_intermediate_size = config.encoder_ffn_dim @@ -405,7 +406,7 @@ def __init__( bias=ffn_has_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, ffn_intermediate_size) + self.act = get_act_fn("gelu") self.fc2 = RowParallelLinear( ffn_intermediate_size, ffn_hidden_size, @@ -473,8 +474,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) ''' @@ -624,8 +624,6 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, Decoder output torch.Tensor """ # retrieve input_ids and inputs_embeds - - input_ids = input_ids.view(-1, input_ids.shape[-1]) inputs_embeds = self.embed_tokens(input_ids) embed_pos = self.embed_positions( @@ -743,13 +741,14 @@ class BartModel(nn.Module): "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" ] - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id @@ -814,20 +813,16 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class BartForConditionalGeneration(nn.Module): base_model_prefix = "model" - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config # currently all existing BART models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.config = config - self.model = BartModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = BartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -842,7 +837,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index bfed2929d57d2..7dbc7fa0aaba4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,8 +5,7 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.backends.xformers import XFormersImpl -from vllm.config import CacheConfig, PoolerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -21,6 +20,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .utils import maybe_prefix + class BertEmbedding(nn.Module): @@ -216,11 +217,6 @@ def __init__( 
quant_config=quant_config, prefix=f"{prefix}.attn") - if not isinstance(self.attn.impl, XFormersImpl): - raise ValueError( - "Encoder-only models currently require XFORMERS attention " - "backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.") - def forward( self, hidden_states: torch.Tensor, @@ -309,12 +305,13 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embeddings = BertEmbedding(config) self.encoder = BertEncoder(config, cache_config, @@ -382,15 +379,11 @@ class BertEmbeddingModel(nn.Module): _pooler: An instance of Pooler used for pooling operations. """ - def __init__( - self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.model = BertModel(config, cache_config, quant_config) + pooler_config = vllm_config.model_config.pooler_config + self.model = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.CLS, diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 1f2d7384076ed..e612010677364 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -98,6 +98,11 @@ def input_processor_for_blip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. 
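Input processors now return the placeholder ranges produced while expanding image tokens, and they skip re-processing when `multi_modal_placeholders` is already populated. A simplified sketch of how expanding a placeholder token can also yield its (offset, length) range; the helper below is illustrative and omits the per-item sizes and padding handled by `repeat_and_pad_placeholder_tokens`:

from typing import Dict, List, Tuple

def repeat_placeholder(
    token_ids: List[int], placeholder_id: int, repeat_count: int
) -> Tuple[List[int], List[Dict[str, int]]]:
    new_ids: List[int] = []
    ranges: List[Dict[str, int]] = []
    for tok in token_ids:
        if tok == placeholder_id:
            # Record where this item's tokens start and how many there are.
            ranges.append({"offset": len(new_ids), "length": repeat_count})
            new_ids.extend([placeholder_id] * repeat_count)
        else:
            new_ids.append(tok)
    return new_ids, ranges

# [1, 9, 2] with placeholder 9 repeated 3x ->
# ([1, 9, 9, 9, 2], [{'offset': 1, 'length': 3}])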
+ return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -105,7 +110,7 @@ def input_processor_for_blip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -116,7 +121,8 @@ def input_processor_for_blip( # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index c3b3cc8a4ddb6..03dc1d15ab697 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -8,21 +8,22 @@ apply_chunking_to_forward) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import CacheConfig, VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from .blip import (BlipVisionModel, dummy_image_for_blip, get_max_blip_image_tokens) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo @@ -425,7 +426,11 @@ def dummy_seq_data_for_blip2( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_data_for_blip2(ctx: InputContext, seq_len: int, @@ -434,7 +439,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int, vision_config = hf_config.vision_config num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_blip2( + seq_data, ranges = dummy_seq_data_for_blip2( hf_config, seq_len, num_images, @@ -444,7 +449,7 @@ def dummy_data_for_blip2(ctx: InputContext, seq_len: int, if isinstance(vision_config, Blip2VisionConfig): mm_data = dummy_image_for_blip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -478,14 +483,13 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) class Blip2ForConditionalGeneration(nn.Module, 
SupportsMultiModal, SupportsPP): - def __init__(self, - config: Blip2Config, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -508,9 +512,8 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -520,7 +523,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 77ab7de6165fb..2c14519fb9e0e 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py # Copyright 2023 The vLLM team. @@ -25,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -34,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -43,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -147,7 +147,7 @@ def __init__( 4 * hidden_size, quant_config=quant_config, ) - self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.gelu_impl = get_act_fn("gelu") self.dense_4h_to_h = RowParallelLinear( 4 * hidden_size, hidden_size, @@ -222,14 +222,13 @@ def forward( @support_torch_compile class BloomModel(nn.Module): - def __init__( - self, - config: BloomConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embed_dim = config.hidden_size # 
Embedding + LN Embedding @@ -284,14 +283,17 @@ class BloomForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: BloomConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = BloomModel(config, cache_config, quant_config) + self.transformer = BloomModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.word_embeddings else: @@ -299,7 +301,7 @@ def __init__( self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index aaf559ca386cc..7b59c818e0b60 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -9,10 +9,10 @@ from transformers import ChameleonConfig, ChameleonVQVAEConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -21,7 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -30,13 +30,15 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. 
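The dummy-data builders (blip2 above, chameleon and clip below) follow the same convention: when profiling, they fabricate num_images back-to-back placeholder runs and report their ranges alongside the sequence data. A sketch of the consecutive-range computation, assuming ranges are plain offset/length pairs as in the sketch earlier:

from typing import Dict, List

def consecutive_ranges(num_items: int, item_size: int,
                       initial_offset: int = 0) -> List[Dict[str, int]]:
    # Item i occupies item_size tokens starting right after item i - 1.
    return [{"offset": initial_offset + i * item_size, "length": item_size}
            for i in range(num_items)]

# consecutive_ranges(2, 32) ->
# [{'offset': 0, 'length': 32}, {'offset': 32, 'length': 32}]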
@@ -73,7 +75,11 @@ def dummy_seq_data_for_chameleon( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_chameleon( @@ -97,14 +103,14 @@ def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_chameleon( + seq_data, ranges = dummy_seq_data_for_chameleon( seq_len, num_images, image_token_id=CHAMELEON_IMAGE_TOKEN_ID, ) mm_data = dummy_image_for_chameleon(num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def input_processor_for_chameleon(ctx: InputContext, @@ -120,9 +126,14 @@ def input_processor_for_chameleon(ctx: InputContext, if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + model_config = ctx.model_config tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -821,14 +832,13 @@ def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: class ChameleonModel(nn.Module): - def __init__( - self, - config: ChameleonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -914,17 +924,14 @@ def forward( class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - config: ChameleonConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config - self.model = ChameleonModel(config, cache_config, quant_config) + self.model = ChameleonModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.unpadded_vocab_size, @@ -936,7 +943,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index ca90d10e9f9fb..6ec2d5a2a3909 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only ChatGLM model compatible 
with THUDM weights.""" @@ -12,10 +11,10 @@ from torch.nn import LayerNorm from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -25,15 +24,14 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) -from vllm.multimodal.base import MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -41,7 +39,8 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -55,8 +54,9 @@ def mm_input_mapper_for_glmv( data: MultiModalData[object], ) -> Dict: model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer, - trust_remote_code=True) + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) if tokenizer is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -75,7 +75,7 @@ def mm_input_mapper_for_glmv( raise pixel_values = raw_batch_data['images'] - return MultiModalInputs({'pixel_values': pixel_values}) + return MultiModalKwargs({'pixel_values': pixel_values}) def merge_glm_vision_embeddings( @@ -117,16 +117,15 @@ def get_max_glmv_image_tokens(ctx: InputContext): raise NotImplementedError(msg) -def dummy_data_for_glmv( - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] -) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: +def dummy_data_for_glmv(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]) -> DummyData: hf_config = ctx.get_hf_config(ChatGLMConfig) vision_config = getattr(hf_config, 'vision_config', None) if vision_config is None: token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len) seq_data = SequenceData(token_ids) - return seq_data, None + return DummyData(seq_data, None) elif isinstance(vision_config, dict): image_size = vision_config["image_size"] image_placeholder_length = calculate_image_placeholder(vision_config) @@ -141,7 +140,7 @@ def 
dummy_data_for_glmv( "image": Image.new("RGB", (image_size, image_size), color=0) } - return seq_data, mm_data + return DummyData(seq_data, mm_data) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -483,14 +482,13 @@ def forward( class ChatGLMModel(nn.Module): - def __init__( - self, - config: ChatGLMConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embedding = VocabParallelEmbedding(config.padded_vocab_size, @@ -527,7 +525,7 @@ def _parse_and_validate_image_input( elif isinstance(pixel_values, list): return torch.concat(pixel_values) else: - raise TypeError("""pixel_values must be a torch.Tensor + raise TypeError("""pixel_values must be a torch.Tensor or a list of torch.Tensor """) return GLMImagePixelInputs(pixel_values=pixel_values) @@ -597,14 +595,14 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def __init__( self, - config: ChatGLMConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.lora_config = lora_config self.multimodal_config = multimodal_config @@ -612,13 +610,15 @@ def __init__( self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", 8192) - self.transformer = ChatGLMModel(config, cache_config, quant_config) + self.transformer = ChatGLMModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.transformer.output_layer.weight = ( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index a3293020c042e..2d81b9266826b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -19,6 +19,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData @@ -49,14 +50,13 @@ def get_max_clip_image_tokens(hf_config: CLIPVisionConfig) -> int: return get_clip_image_feature_size(hf_config) -def dummy_seq_data_for_clip( - hf_config: CLIPVisionConfig, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): +def dummy_seq_data_for_clip(hf_config: CLIPVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, + mm_key: str = "image"): if image_feature_size_override is None: image_feature_size = 
get_clip_image_feature_size(hf_config) else: @@ -65,7 +65,11 @@ def dummy_seq_data_for_clip( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_clip( @@ -117,6 +121,11 @@ def input_processor_for_clip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -130,7 +139,7 @@ def input_processor_for_clip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -141,7 +150,8 @@ def input_processor_for_clip( # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 348e6d20f3297..cd5c1d6844716 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 Cohere and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -38,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -50,7 +49,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) @torch.compile @@ -254,15 +254,14 @@ def forward( @support_torch_compile class CohereModel(nn.Module): - def __init__( - self, - config: CohereConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -333,14 +332,11 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {"embed_tokens": "input_embeddings"} embedding_padding_modules = [] - def __init__( - self, - config: CohereConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # currently all existing command R models have `tie_word_embeddings` # enabled @@ -352,11 +348,9 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=config.logit_scale) - self.model = CohereModel(config, - cache_config, - quant_config, - lora_config=lora_config) - self.sampler = Sampler() + self.model = CohereModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index aae7ab7370b74..d5f9b903183d4 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,11 +1,10 @@ -# coding=utf-8 from typing import Iterable, List, Optional, Tuple, Union import torch import torch.nn as nn from vllm.attention import Attention, AttentionMetadata -from 
vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -15,7 +14,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -26,7 +25,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DbrxRouter(nn.Module): @@ -295,14 +295,13 @@ def forward( class DbrxModel(nn.Module): - def __init__( - self, - config: DbrxConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.wte = VocabParallelEmbedding( config.vocab_size, config.d_model, @@ -353,18 +352,21 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: DbrxConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config if config.tie_word_embeddings: raise ValueError( "tie_word_embeddings is not supported for Dbrx models.") self.quant_config = quant_config self.unpadded_vocab_size = config.vocab_size - self.transformer = DbrxModel(config, cache_config, quant_config) + self.transformer = DbrxModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead( config.vocab_size, config.d_model, @@ -374,7 +376,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 7ed2b96e65c49..b38fd9fa49c21 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 DeciAI Research Team. All rights reserved. @@ -23,13 +22,11 @@ # limitations under the License. 
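Throughout the patch, child modules are built with `maybe_prefix(prefix, "model")` (or "transformer") so parameter names remain addressable when one model is nested inside another. Presumably the helper just joins the two names when a prefix is present; a sketch under that assumption:

def maybe_prefix(prefix: str, name: str) -> str:
    # Assumed behaviour: return the bare name at the top level, otherwise
    # a dotted path rooted at the caller's prefix.
    return name if not prefix else f"{prefix}.{name}"

# maybe_prefix("", "model") -> "model"
# maybe_prefix("language_model", "model") -> "language_model.model"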
"""Inference-only DeciLM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Tuple +from typing import Iterable, Tuple import torch -from transformers import LlamaConfig -from vllm.config import CacheConfig, LoRAConfig -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.config import VllmConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaForCausalLM @@ -54,19 +51,11 @@ class DeciLMForCausalLM(LlamaForCausalLM): instead. """ - def __init__( - self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config config.num_key_value_heads = max(config.num_key_value_heads_per_layer) delattr(config, "num_key_value_heads_per_layer") - super().__init__(config=config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + super().__init__(vllm_config=vllm_config) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 5b4db8f258711..a9bf1440c4d60 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -28,7 +27,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -42,7 +41,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -51,7 +50,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekMLP(nn.Module): @@ -327,14 +327,13 @@ class DeepseekModel(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -384,23 +383,21 @@ def forward( class DeepseekForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: 
PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = DeepseekModel(config, cache_config, quant_config) + self.model = DeepseekModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index d4ad0c6b5c99e..4fb1eed15a2e7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -43,7 +42,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -52,7 +51,8 @@ from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekV2MLP(nn.Module): @@ -409,14 +409,13 @@ class DeepseekV2Model(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -480,24 +479,19 @@ def forward( class DeepseekV2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - 
self.model = DeepseekV2Model(config, - cache_config, - quant_config, - prefix="model") + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index a87e1c0228627..c902829994c7c 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -4,6 +4,7 @@ import torch.nn as nn from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -12,7 +13,8 @@ from vllm.model_executor.models import ModelRegistry from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.eagle import EAGLEConfig + +from .utils import maybe_prefix class EAGLE(nn.Module): @@ -34,14 +36,16 @@ class EAGLE(nn.Module): in the draft checkpoint (using key token_map). Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, config: EAGLEConfig, *args, **kwargs) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config self.config = config architectures = getattr(self.config.model, "architectures", []) model_cls, _ = ModelRegistry.resolve_model_cls(architectures) - self.model = model_cls(self.config.model, *args, **kwargs) + self.model = model_cls(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, bias=getattr(self.config, "eagle_fc_bias", False)) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 22f194c776b69..cd3e7da657e0e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/modeling_exaone.py # Copyright 2024 The LG U+ CTO AI Tech Lab. 
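The EAGLE wrapper above no longer forwards *args/**kwargs to the draft model; it resolves the target class from the architectures listed in the draft config and instantiates it with the same vllm_config/prefix pair. A toy sketch of that lookup pattern; the registry and class here are hypothetical, not vLLM's ModelRegistry:

_MODEL_REGISTRY = {}

def register_model(name):
    def deco(cls):
        _MODEL_REGISTRY[name] = cls
        return cls
    return deco

def resolve_model_cls(architectures):
    # Return the first architecture name we know how to build.
    for arch in architectures:
        if arch in _MODEL_REGISTRY:
            return _MODEL_REGISTRY[arch], arch
    raise ValueError(f"No registered model class for {architectures}")

@register_model("ToyForCausalLM")
class ToyForCausalLM:
    def __init__(self, *, vllm_config=None, prefix: str = ""):
        self.prefix = prefix

model_cls, arch = resolve_model_cls(["ToyForCausalLM"])
draft = model_cls(vllm_config=None, prefix="model")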
@@ -30,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -43,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -55,7 +54,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class ExaoneGatedMLP(nn.Module): @@ -315,15 +315,14 @@ def forward( @support_torch_compile class ExaoneModel(nn.Module): - def __init__( - self, - config: ExaoneConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -439,24 +438,18 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "c_fc_1": ("gate_up_proj", 1), } - def __init__( - self, - config: ExaoneConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.transformer = ExaoneModel( - config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size @@ -479,7 +472,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 467a33505ee12..562ee5517e7f1 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py # Copyright 2023 The vLLM team. 
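Every `self.sampler = Sampler()` call site above becomes `self.sampler = get_sampler()`. The snippet below only illustrates the general factory idea behind such a change (one central place that decides which sampler to construct); the registry and `kind` parameter are hypothetical and do not reflect vLLM's actual `get_sampler()` internals.

from typing import Callable, Dict


class DefaultSampler:

    def __call__(self, logits):
        # Toy greedy pick over a list of scores.
        return max(range(len(logits)), key=logits.__getitem__)


_SAMPLER_FACTORY: Dict[str, Callable[[], object]] = {"default": DefaultSampler}


def get_sampler(kind: str = "default"):
    # Call sites stay unchanged while the concrete sampler implementation
    # can be swapped centrally (e.g. for an alternative execution path).
    return _SAMPLER_FACTORY[kind]()


if __name__ == "__main__":
    sampler = get_sampler()
    print(sampler([0.1, 0.7, 0.2]))  # -> 1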
@@ -27,7 +26,8 @@ from transformers import FalconConfig as HF_FalconConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -38,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -48,7 +48,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -212,7 +213,7 @@ def __init__( bias=config.bias, skip_bias_add=True, quant_config=quant_config) - self.act = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.act = get_act_fn("gelu") self.reduce_row_parallel_results = not (config.new_decoder_architecture or config.parallel_attn) self.dense_4h_to_h = RowParallelLinear( @@ -329,16 +330,16 @@ def forward( return output +@support_torch_compile class FalconModel(nn.Module): - def __init__( - self, - config: FalconConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -399,19 +400,20 @@ class FalconForCausalLM(nn.Module, SupportsPP): ".dense_h_to_4h.", ".dense_4h_to_h.", ] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".dense_4h_to_h.", ".dense."] def __init__( self, - config: FalconConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = FalconModel(config, cache_config, quant_config) + self.transformer = FalconModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) # only Falcon-11B doesn't share lm_head weight with word embeddings # and previous Falcon model doesn't have tie_word_embeddings config # so we set tie_word_embeddings to True by default @@ -427,7 +429,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 
6840ac8b9e303..971a71180164b 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,14 +3,11 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, BartParallelLMHead, @@ -23,11 +20,13 @@ class Florence2LanguageModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id @@ -93,15 +92,14 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class Florence2LanguageForConditionalGeneration(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + self.config = config - self.model = Florence2LanguageModel(config, - cache_config=cache_config, - quant_config=quant_config) + self.model = Florence2LanguageModel(vllm_config=vllm_config, + prefix=prefix) embed_scale = math.sqrt( config.d_model) if config.scale_embedding else 1.0 @@ -112,7 +110,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, @@ -189,17 +187,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): class Florence2ForConditionalGeneration(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config # TODO(Isotr0py): Add vision backbone self.language_model = Florence2LanguageForConditionalGeneration( - config=config.text_config, - cache_config=cache_config, - quant_config=quant_config) + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=prefix, + ) @property def sampler(self): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 358d1dd288c49..b39dfe706e0df 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/fuyu/modeling_fuyu.py # Copyright 2023 The vLLM team. # Copyright 2023 HuggingFace Inc. team. All rights reserved. 
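The Florence-2 wrapper above re-targets the shared config at its text backbone via `vllm_config.with_hf_config(config.text_config)` before building the inner language model. Below is a simplified sketch of that idiom using stand-in dataclasses; it is not the real vllm.config implementation, only the "copy the config with a different HF sub-config" behavior.

from dataclasses import dataclass, replace


@dataclass(frozen=True)
class _ModelConfig:          # simplified stand-in
    hf_config: object


@dataclass(frozen=True)
class _VllmConfig:           # simplified stand-in
    model_config: _ModelConfig

    def with_hf_config(self, hf_config: object) -> "_VllmConfig":
        # Return a copy pointing at a different HF config; the remaining
        # settings (cache/quant/lora in the real class) carry over.
        return replace(self,
                       model_config=replace(self.model_config,
                                            hf_config=hf_config))


@dataclass
class _TextConfig:
    vocab_size: int = 32000


outer = _VllmConfig(model_config=_ModelConfig(hf_config=object()))
inner = outer.with_hf_config(_TextConfig())
assert inner.model_config.hf_config.vocab_size == 32000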
@@ -23,23 +22,23 @@ import torch.nn as nn import torch.utils.checkpoint from PIL import Image -from transformers import FuyuConfig, FuyuImageProcessor +from transformers import FuyuImageProcessor from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) +from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings @@ -103,7 +102,11 @@ def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids) + return SequenceData(token_ids), { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_fuyu( @@ -119,15 +122,15 @@ def dummy_image_for_fuyu( def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) + seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) mm_data = dummy_image_for_fuyu(num_images, image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: Image.Image): + data: List[Image.Image]): image_encoding = image_processor.preprocess(data, return_tensors="pt") batch_images = torch.stack([img[0] for img in image_encoding["images"] ]).unsqueeze(1) @@ -158,8 +161,10 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): model_config = ctx.model_config image_data = multi_modal_data["image"] new_multi_modal_data = {} + image_list = image_data if isinstance(image_data, list) else [image_data] + # process image data - if isinstance(image_data, Image.Image): + if is_list_of(image_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( model_config.model) @@ -171,7 +176,7 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): ]) new_multi_modal_data["image"] = image_patches - elif isinstance(image_data, torch.Tensor): + elif is_list_of(image_list, torch.Tensor): raise NotImplementedError("Embeddings input is not supported 
yet") else: raise TypeError(f"Invalid image type: {type(image_data)}") @@ -198,19 +203,20 @@ def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): def input_mapper_for_fuyu(ctx: InputContext, data: object): model_config = ctx.model_config - if isinstance(data, Image.Image): + data_list = data if isinstance(data, list) else [data] + if is_list_of(data_list, Image.Image): # Fuyu's image_processor can also finish token padding image_processor: FuyuImageProcessor = cached_get_image_processor( model_config.model) - model_image_input = _fuyu_image_preprocess(image_processor, data) + model_image_input = _fuyu_image_preprocess(image_processor, data_list) data = torch.stack([ image_patch[0] for image_patch in model_image_input["image_patches"] ]) # image has been processed with prompt in input processor - return MultiModalInputs({"pixel_values": data}) + return MultiModalKwargs({"pixel_values": data}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @@ -219,12 +225,11 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): @INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: FuyuConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -239,9 +244,8 @@ def __init__(self, quant_config=quant_config, gather_output=True, ) - self.language_model = PersimmonForCausalLM(config.text_config, - cache_config=cache_config, - quant_config=quant_config) + self.language_model = PersimmonForCausalLM( + vllm_config.with_hf_config(config.text_config)) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 57b2b43c82f89..55baba809e58f 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The vLLM team. # Copyright (c) Google Inc. 
# @@ -23,7 +22,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -34,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -259,14 +258,13 @@ def forward( @support_torch_compile class GemmaModel(nn.Module): - def __init__( - self, - config: GemmaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_tokens = VocabParallelEmbedding( @@ -350,7 +348,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "gate_up_proj", "down_proj", ] - # BitandBytes specific attributes default_bitsandbytes_target_modules = [ ".gate_proj.", @@ -361,8 +358,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj.", ".o_proj.", ] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -376,15 +371,11 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - config: GemmaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # currently all existing Gemma models have `tie_word_embeddings` enabled @@ -392,12 +383,10 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = GemmaModel(config, - cache_config, - quant_config, + self.model = GemmaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 693f32160a289..eeb3fd98a7eac 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. # Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. 
# @@ -22,7 +21,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -34,7 +33,7 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -44,7 +43,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -244,14 +244,11 @@ def forward( @support_torch_compile class Gemma2Model(nn.Module): - def __init__( - self, - config: Gemma2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.embed_tokens = VocabParallelEmbedding( @@ -390,8 +387,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj.", ".o_proj.", ] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -401,23 +396,21 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - config: Gemma2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config del lora_config # Unused. super().__init__() self.config = config # currently all existing Gemma models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.quant_config = quant_config - self.model = Gemma2Model(config, cache_config, quant_config) + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -471,16 +464,13 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP): _pooler: An instance of Pooler used for pooling operations. 
""" - def __init__( - self, - pooler_config: Optional[PoolerConfig] = None, - **kwargs, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.model = Gemma2Model(**kwargs) + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( - pooler_config, + vllm_config.model_config.pooler_config, pooling_type=PoolingType.LAST, normalize=True, softmax=False) diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 3213a8b29a104..025615b0920fd 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/GLM-4 """Inference-only GLM-4v model visual encoder compatible with THUDM weights.""" diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 3330d84021368..adf2a7a51f737 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. @@ -25,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -34,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -43,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPT2Attention(nn.Module): @@ -124,8 +124,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -186,14 +185,13 @@ def forward( @support_torch_compile class GPT2Model(nn.Module): - def __init__( - self, - config: GPT2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx @@ -218,9 +216,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: 
Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -244,27 +244,30 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): def __init__( self, - config: GPT2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, - cache_config, - quant_config, - prefix="transformer") + self.transformer = GPT2Model(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -272,9 +275,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 24c79a8855475..ae1495ebd7914 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py # Copyright 2023 The vLLM team. 
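A minimal sketch of the `inputs_embeds` plumbing added to GPT-2 above: the wrapper exposes `get_input_embeddings()`, and the core model only performs the token lookup when no precomputed embeddings are supplied (useful when another component injects embeddings directly). This is a toy module under those assumptions, not the vLLM GPT2Model.

from typing import Optional

import torch
import torch.nn as nn


class TinyModel(nn.Module):

    def __init__(self, vocab_size: int = 16, hidden_size: int = 8):
        super().__init__()
        self.wte = nn.Embedding(vocab_size, hidden_size)

    def forward(self,
                input_ids: torch.Tensor,
                inputs_embeds: Optional[torch.Tensor] = None) -> torch.Tensor:
        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)   # default path: token lookup
        return inputs_embeds                      # downstream layers omitted


model = TinyModel()
ids = torch.tensor([1, 2, 3])
assert torch.equal(model(ids), model(ids, inputs_embeds=model.wte(ids)))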
@@ -26,7 +25,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -34,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -136,8 +135,7 @@ def __init__( bias=True, quant_config=quant_config, ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -191,15 +189,14 @@ def forward( @support_torch_compile class GPTBigCodeModel(nn.Module): - def __init__( - self, - config: GPTBigCodeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config assert not config.add_cross_attention @@ -262,19 +259,20 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: GPTBigCodeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.transformer = GPTBigCodeModel(config, cache_config, quant_config, - lora_config) + self.transformer = GPTBigCodeModel(vllm_config=vllm_config, + prefix=prefix) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: @@ -287,7 +285,7 @@ def __init__( self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 0451d16b6c738..610795b084b44 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py # Copyright 2023 The vLLM team. 
@@ -24,7 +23,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -33,16 +32,18 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTJAttention(nn.Module): @@ -130,8 +131,7 @@ def __init__( hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc_in(hidden_states) @@ -178,14 +178,13 @@ def forward( @support_torch_compile class GPTJModel(nn.Module): - def __init__( - self, - config: GPTJConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_dim = config.n_embd self.wte = VocabParallelEmbedding( @@ -232,15 +231,18 @@ class GPTJForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: GPTJConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config assert not config.tie_word_embeddings - self.transformer = GPTJModel(config, cache_config, quant_config) + self.transformer = GPTJModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead( config.vocab_size, config.n_embd, @@ -248,7 +250,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -308,6 +310,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue # Skip 
loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1bccef7a5f173..f5603772e9862 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py # Copyright 2023 The vLLM team. @@ -24,7 +23,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -33,7 +32,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -42,7 +41,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTNeoXAttention(nn.Module): @@ -129,8 +129,7 @@ def __init__( config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.dense_h_to_4h(hidden_states) @@ -191,14 +190,13 @@ def forward( @support_torch_compile class GPTNeoXModel(nn.Module): - def __init__( - self, - config: GPTNeoXConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_in = VocabParallelEmbedding( @@ -246,14 +244,16 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: GPTNeoXConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.gpt_neox = GPTNeoXModel(config, cache_config, quant_config) + self.gpt_neox = GPTNeoXModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "gpt_neox")) self.embed_out = ParallelLMHead( config.vocab_size, config.hidden_size, @@ -262,7 +262,7 @@ def __init__( if self.config.tie_word_embeddings: self.embed_out.weight = self.gpt_neox.embed_in.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = 
( self.gpt_neox.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c968817747754..d1e6e31f2b8d1 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -43,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -53,7 +52,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from .utils import (PPMissingLayer, is_pp_missing_parameter, make_layers, + maybe_prefix) class GraniteMLP(nn.Module): @@ -258,15 +258,14 @@ def forward( @support_torch_compile class GraniteModel(nn.Module): - def __init__( - self, - config: GraniteConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -371,23 +370,17 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - config: GraniteConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -412,7 +405,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git 
a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 5307bb21adb96..2ed115c56af45 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -40,7 +39,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -48,7 +47,7 @@ from . import mixtral from .interfaces import SupportsLoRA, SupportsPP -from .utils import make_layers +from .utils import make_layers, maybe_prefix class GraniteMoeMoE(nn.Module): @@ -248,15 +247,14 @@ def forward( @support_torch_compile class GraniteMoeModel(nn.Module): - def __init__( - self, - config: GraniteMoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -334,23 +332,17 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: GraniteMoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteMoeModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -372,7 +364,7 @@ def __init__( scale=1 / self.config.logits_scaling) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py new file mode 100644 index 0000000000000..df7e768fe14d3 --- /dev/null +++ 
b/vllm/model_executor/models/h2ovl.py @@ -0,0 +1,400 @@ +# adapted from https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/modeling_h2ovl_chat.py +# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/image_process.py +# -------------------------------------------------------- +# H2OVL-Mississippi +# Copyright (c) 2024 H2O.AI +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- +from functools import partial +from typing import List, Optional, Tuple + +import torch +from PIL import Image +from transformers import PretrainedConfig + +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, + token_inputs) +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.utils import is_list_of + +from .intern_vit import InternVisionModel +from .internvl import (IMG_CONTEXT, IMG_END, IMG_START, InternVLChatModel, + InternVLInputPipeline, build_transform, + find_closest_aspect_ratio, get_internvl_num_patches) + + +# modified to include blocks generated in second pass +def calculate_num_blocks( + orig_width: int, + orig_height: int, + min_num: int, + max_num: int, + image_size: int, + use_thumbnail: bool, + prior_aspect_ratio=None, +) -> Tuple[int, int, int, Tuple[int, int]]: + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) + for i in range(1, n + 1) for j in range(1, n + 1) + if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # if prior_aspect_ratio is provided, filter the target ratios + if prior_aspect_ratio is not None: + target_ratios = [ + ratio for ratio in target_ratios if prior_aspect_ratio[0] % + ratio[0] != 0 and prior_aspect_ratio[1] % ratio[1] != 0 + ] + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, + target_ratios, orig_width, + orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + # add thumbnail image if num_blocks > 1 + if use_thumbnail and blocks > 1: + blocks += 1 + return blocks, target_width, target_height, target_aspect_ratio + + +# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B +# refactored to handle prior_aspect_ratio as optional +def dynamic_preprocess( + image: Image.Image, + min_num: int, + max_num: int, + image_size: int, + use_thumbnail: bool, + prior_aspect_ratio: Optional[Tuple[int, int]] = None, +) -> Tuple[List[Image.Image], Tuple[int, int]]: + orig_width, orig_height = image.size + + # calculate the number of blocks based on prior aspect ratio if available + blocks, target_width, target_height, target_aspect_ratio = ( + calculate_num_blocks( + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False, + prior_aspect_ratio=prior_aspect_ratio, + )) + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // 
image_size)) + 1) * image_size, + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images, target_aspect_ratio + + +def load_image( + image: Image.Image, + input_size=448, + min_num=1, + max_num=6, + use_thumbnail=True, + prior_aspect_ratio: Optional[Tuple[int, int]] = None, +) -> Tuple[torch.Tensor, Tuple[int, int]]: + transform = build_transform(input_size=input_size) + images, target_aspect_ratio = dynamic_preprocess( + image, + image_size=input_size, + use_thumbnail=use_thumbnail, + min_num=min_num, + max_num=max_num, + prior_aspect_ratio=prior_aspect_ratio, + ) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values, target_aspect_ratio + + +# refactored to use the combined load_image function +def image_to_pixel_values( + image: Image.Image, + input_size: int, + min_num: int, + max_num: int, + use_thumbnail: bool, + use_MSAC: bool, +) -> torch.Tensor: + # when MSAC is turned on, we need to process the image twice + if use_MSAC: + # first pass + pixel_values, target_aspect_ratio = load_image( + image, + input_size=input_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=True, + ) + # second pass + pixel_values2, _ = load_image( + image, + input_size=input_size, + min_num=min_num, + max_num=max_num, + prior_aspect_ratio=target_aspect_ratio, + ) + # combine pixel values + pixel_values = torch.cat( + [pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0) + + else: + pixel_values, _ = load_image( + image, + input_size=input_size, + min_num=min_num, + max_num=max_num, + use_thumbnail=use_thumbnail, + ) + + return pixel_values + + +def image_to_pixel_values_wrapper(hf_config: PretrainedConfig, + max_dynamic_patch: Optional[int] = None, + use_MSAC: Optional[bool] = None): + image_size = hf_config.vision_config.image_size + min_num = hf_config.min_dynamic_patch + if max_dynamic_patch is None: + max_dynamic_patch = hf_config.max_dynamic_patch + if use_MSAC is None: + use_MSAC = hf_config.use_msac + use_thumbnail = hf_config.use_thumbnail + return partial( + image_to_pixel_values, + input_size=image_size, + min_num=min_num, + max_num=max_dynamic_patch, + use_thumbnail=use_thumbnail, + use_MSAC=use_MSAC, + ) + + +def get_max_internvl_image_tokens(ctx: InputContext, + *, + max_dynamic_patch: Optional[int] = None): + """ + Calculate the maximum number of tokens with/without MSAC and thumbnail + """ + hf_config = ctx.get_hf_config() + use_thumbnail = hf_config.use_thumbnail + use_MSAC = hf_config.use_msac + + if max_dynamic_patch is None: + max_dynamic_patch = hf_config.max_dynamic_patch + + num_patches = get_internvl_num_patches(hf_config) + + coefficient = 2 if use_MSAC else 1 + num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0) + + return num_blocks * num_patches + + +class H2OVLInputPipeline(InternVLInputPipeline): + """ + Input pipeline for processing image and text data for the H2OVL model. 
+ """ + + def input_processor( + self, + ctx: InputContext, + inputs: DecoderOnlyInputs, + *, + max_dynamic_patch: Optional[int] = None, + ) -> DecoderOnlyInputs: + # get multi_modal_data + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config() + use_MSAC = hf_config.use_msac + + image_data = multi_modal_data["image"] + num_patches = get_internvl_num_patches(hf_config) + + image_pixel_values_mapper = image_to_pixel_values_wrapper( + hf_config, max_dynamic_patch=max_dynamic_patch) + + # single image + if isinstance(image_data, Image.Image): + pixel_values = image_pixel_values_mapper(image_data, + use_MSAC=use_MSAC) + num_blocks = pixel_values.shape[0] + image_feature_sizes = [num_blocks * num_patches] + pixel_values = pixel_values.unsqueeze(0) + + # multi images + elif is_list_of(image_data, Image.Image): + # Do not use MSAC for multi images + image_feature_sizes = [] + pixel_values = [ + image_pixel_values_mapper(image, use_MSAC=False) + for image in image_data + ] + for pixel_value in pixel_values: + num_blocks = pixel_value.shape[0] + image_feature_sizes.append(num_blocks * num_patches) + + # image embeddings as input + elif isinstance(image_data, torch.Tensor): + _, image_feature_size, _ = image_data.shape + image_feature_sizes = [image_feature_size] + pixel_values = None + + # multi-image image embeddings + elif is_list_of(image_data, torch.Tensor): + + image_feature_sizes = [] + for image_embed in image_data: + _, image_feature_size, _ = image_embed.shape + image_feature_sizes.append(image_feature_size) + pixel_values = None + + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + + prompt = inputs.get("prompt") + prompt_token_ids = inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + + new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, + num_patches) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + # Wrap image processing in input_processor to avoid duplication + image_token_id = tokenizer.encode( + self.img_context_token, + add_special_tokens=False, + return_tensors="pt", + )[0] + + # Update multi_modal_data to return + if pixel_values is not None: + multi_modal_data = { + "image": { + "pixel_values": pixel_values, + "image_token_id": image_token_id, + } + } + else: + multi_modal_data = {"image": {"image_embeds": image_data}} + + return token_inputs( + prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data, + ) + + def input_mapper( + self, + ctx: InputContext, + data: object, + *, + max_dynamic_patch: Optional[int] = None, + ) -> MultiModalKwargs: + + # NOTE: Preprocessing for the image data is done in the + # 'input_processor' function during actual inference. + if isinstance(data, dict): + return MultiModalKwargs(data) + + # The section below is only used with dummy data during + # memory profiling. 
+ hf_config = ctx.get_hf_config() + + image_pixel_values_mapper = image_to_pixel_values_wrapper( + hf_config, max_dynamic_patch) + + if isinstance(data, Image.Image): + pixel_values = image_pixel_values_mapper(data) + pixel_values = pixel_values.unsqueeze(0) + + elif is_list_of(data, Image.Image): + hf_config.use_msac = False + pixel_values = [image_pixel_values_mapper(img) for img in data] + + else: + return MultiModalKwargs({"image_embeds": data}) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + image_token_id = tokenizer.encode( + self.img_context_token, + add_special_tokens=False, + return_tensors="pt", + )[0] + + return MultiModalKwargs({ + "pixel_values": pixel_values, + "image_token_id": image_token_id + }) + + +input_pipeline = H2OVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) +@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) +class H2OVLChatModel(InternVLChatModel): + + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = (config.vision_config.num_hidden_layers + + vision_feature_layer + 1) + else: + num_hidden_layers = vision_feature_layer + 1 + + return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + prefix=prefix, + ) + else: + msg = "Monolith mode is not applicable to H2OVL" + raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 43f4f29814e6d..b21bc2a3f9ce1 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -1,5 +1,3 @@ -# coding=utf-8 - # adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py # Copyright 2024 The vLLM team. # Copyright 2024 the HuggingFace Inc. team. All rights reserved. @@ -17,7 +15,7 @@ # limitations under the License. 
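A worked example of the image-token budget computed by `get_max_internvl_image_tokens` in the new h2ovl.py: with MSAC the image is preprocessed twice, so the dynamic-patch budget doubles, and the thumbnail adds one extra block. The concrete numbers below (max_dynamic_patch=6, num_patches=256) are illustrative only, not values read from any real H2OVL config.

def max_image_tokens(max_dynamic_patch: int, num_patches: int, *,
                     use_msac: bool, use_thumbnail: bool) -> int:
    coefficient = 2 if use_msac else 1
    num_blocks = coefficient * max_dynamic_patch + (1 if use_thumbnail else 0)
    return num_blocks * num_patches


assert max_image_tokens(6, 256, use_msac=False, use_thumbnail=True) == 7 * 256
assert max_image_tokens(6, 256, use_msac=True, use_thumbnail=True) == 13 * 256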
"""PyTorch Idefics2 model.""" -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -31,6 +29,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader class Idefics2VisionEmbeddings(nn.Module): @@ -331,3 +330,25 @@ def forward( encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py new file mode 100644 index 0000000000000..85f23a1da533b --- /dev/null +++ b/vllm/model_executor/models/idefics3.py @@ -0,0 +1,711 @@ +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Idefics3 model compatible with HuggingFace weights.""" + +import math +from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, + Optional, Tuple, TypedDict, Union) + +import torch +import torch.utils.checkpoint +from PIL import Image +from torch import nn +# Temporary solution for transformers below 4.46.0. 
+from transformers import ProcessorMixin as Idefics3ImageProcessor + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.image import cached_get_image_processor +from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of + +# yapf: disable +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable +from .interfaces import SupportsLoRA, SupportsMultiModal +from .llama import LlamaModel +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) + +logger = init_logger(__name__) + + +class Idefics3ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + """ + pixel_attention_mask: Optional[torch.BoolTensor] + + +class Idefics3ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. + """ + + +class Idefics3ProcessorSize(NamedTuple): + """Hashable wrapper for unhashable `size` dict of Idefics3Processor.""" + # NOTE: cached_get_processor/cached_get_image_processor uses lru_cache, + # we need to use NamedTuple instead of TypedDict to avoid hashing issues. 
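[Editor's note] Why the NOTE above insists on a NamedTuple: the cached processor getters key their lru_cache on the keyword arguments, and a dict-valued `size` is unhashable, so the wrapper must be a hashable value type. A tiny illustration with throwaway names:

from functools import lru_cache
from typing import NamedTuple


class _Size(NamedTuple):
    longest_edge: int


@lru_cache(maxsize=None)
def _cached_lookup(size: _Size) -> int:
    # lru_cache hashes its arguments; passing a plain dict here would
    # raise "unhashable type: 'dict'".
    return size.longest_edge * 2


assert _cached_lookup(_Size(longest_edge=364)) == 728
assert _cached_lookup(_Size(longest_edge=364)) == 728  # served from the cache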
+ longest_edge: int + + def __contains__(self, key: str) -> bool: + return key in self._asdict() and getattr(self, key) is not None + + def __getitem__(self, key: str) -> int: + return getattr(self, key) + + +ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] + + +def get_mm_processor_kwargs(size: Optional[Dict[str, int]] = None) -> Dict: + mm_processor_kwargs = {} + if size: + mm_processor_kwargs["size"] = Idefics3ProcessorSize(**size) + return mm_processor_kwargs + + +def input_mapper_for_idefics3( + ctx: InputContext, + data: object, + *, + size: Optional[Dict[str, int]] = None, +): + model_config = ctx.model_config + mm_processor_kwargs = get_mm_processor_kwargs(size) + image_processor = cached_get_image_processor( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + if isinstance(data, Image.Image): + images = [[data]] + elif is_list_of(data, Image.Image): + images = [data] + else: + raise TypeError(f"Invalid image type: {type(data)}") + + try: + batch_data = image_processor(images, + return_tensors="pt", + return_row_col_info=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalKwargs(batch_data) + + +def _resize_output_size(height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> Tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) + + # Ensure both width and height are even (if needed) + height += 1 if height % 2 != 0 else 0 + width += 1 if width % 2 != 0 else 0 + + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) + + return height, width + + +def _get_resize_output_image_size( + image_size: Tuple[int, int], + resolution_max_side: int, + max_image_size: int = 1820, +) -> Tuple[int, int]: + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_size + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = _resize_output_size(height, + width, + max_len=resolution_max_side) + + return height, width + + +def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, + fake_token_around_image: str, image_token: str, + global_img_token: str) -> str: + """ + Prompt with expanded image tokens for when the image is split + into patches. 
+ """ + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += (fake_token_around_image + + f"" + + image_token * image_seq_len) + text_split_images += "\n" + + text_split_images += "\n" + _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token) + return text_split_images + + +def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + """Prompt with expanded image tokens for a single image.""" + return (fake_token_around_image + global_img_token + + image_token * image_seq_len + fake_token_around_image) + + +def _get_image_prompt_string(image_rows: int, image_cols: int, + image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image(image_seq_len, image_rows, image_cols, + fake_token_around_image, image_token, + global_img_token) + + +def input_processor_for_idefics3(ctx: InputContext, + inputs: DecoderOnlyInputs, + *, + size: Optional[Dict[str, int]] = None): + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = cached_get_processor(model_config.model, **mm_processor_kwargs) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_list = [image_data] + elif is_list_of(image_data, Image.Image): + image_list = image_data + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_rows = [] + image_cols = [] + for image in image_list: + height, width = _get_resize_output_image_size(image.size, size) + + rows = math.ceil(height / max_image_size) + cols = math.ceil(width / max_image_size) + image_rows.append(rows) + image_cols.append(cols) + image_rows = [image_rows] + image_cols = [image_cols] + + n_images_in_text = [] + + text = inputs.get("prompt") + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, + image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError( + "The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) + + +def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + resized_height, resized_width = size, size + + grid_h = resized_height // max_image_size + grid_w = resized_width // max_image_size + return (grid_h * grid_w + 1) + + +def get_max_idefics3_image_tokens(ctx: InputContext, + *, + size: Optional[Dict[str, + int]] = None) -> int: + model_config = ctx.model_config + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = cached_get_processor(model_config.model, **mm_processor_kwargs) + image_seq_len = processor.image_seq_len + image_processor = processor.image_processor + + max_num_image_patches = _get_max_num_image_patch(image_processor) + + return max_num_image_patches * image_seq_len + + +def dummy_data_for_idefics3( + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], + *, + size: Optional[Dict[str, int]] = None) -> DummyData: + hf_config = ctx.get_hf_config() + num_images = mm_counts["image"] + + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = cached_get_processor(ctx.model_config.model, + **mm_processor_kwargs) + max_num_image_patches = _get_max_num_image_patch(processor.image_processor) + image_seq_len = processor.image_seq_len + max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images + + if seq_len - max_llm_image_tokens < 0: + raise RuntimeError( + f"Idefics3 cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") + + seq_data = SequenceData.from_prompt_token_counts( + (hf_config.image_token_id, max_llm_image_tokens), + (0, seq_len - max_llm_image_tokens)) + + width = height = hf_config.vision_config.image_size + image = Image.new("RGB", (width, height), color=0) + mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + + return DummyData(seq_data, mm_data) + + +class Idefics3SimpleMLP(nn.Module): + + def __init__(self, config): + super().__init__() + input_size = config.vision_config.hidden_size * 
(config.scale_factor** + 2) + output_size = config.text_config.hidden_size + self.proj = ReplicatedLinear(input_size, output_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _ = self.proj(x) + return out + + +class Idefics3Connector(nn.Module): + + def __init__(self, config): + super().__init__() + self.scale_factor = config.scale_factor + self.modality_projection = Idefics3SimpleMLP(config) + + def pixel_shuffle(self, + x: torch.Tensor, + scale_factor: int = 2) -> torch.Tensor: + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), + embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), + embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor: + image_hidden_states = self.pixel_shuffle(image_hidden_states, + self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + + self.config = config + self.padding_idx = self.config.text_config.pad_token_id + self.vocab_size = self.config.text_config.vocab_size + self.vision_model = Idefics3VisionTransformer(config.vision_config, + quant_config) + self.connector = Idefics3Connector(config) + self.text_model = LlamaModel( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "text_model"), + ) + + self.image_seq_len = int( + ((config.vision_config.image_size // + config.vision_config.patch_size)**2) / (config.scale_factor**2)) + self.image_token_id = self.config.image_token_id + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) + + if pixel_values is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return Idefics3ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return Idefics3ImagePixelInputs(type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, + concat=True)), + pixel_attention_mask=flatten_bn( + pixel_attention_mask, + concat=True)) + + raise AssertionError("This line should be unreachable.") + + def _image_pixels_to_features( + self, + pixel_values: torch.Tensor, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to( + dtype=self.vision_model.embeddings.patch_embedding.weight.dtype + ) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, + *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. + nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3)) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=(pixel_values.size(0), pixel_values.size(2), + pixel_values.size(3)), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask.view( + batch_size * num_images, *pixel_attention_mask.shape[2:]) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold(dimension=1, + size=patch_size, + step=patch_size) + patches_subgrid = patches_subgrid.unfold(dimension=2, + size=patch_size, + step=patch_size) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + return image_hidden_states + + def _process_image_pixels( + self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + pixel_attention_mask = inputs["pixel_attention_mask"] + + return self._image_pixels_to_features(pixel_values, + pixel_attention_mask) + + def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + return self.connector(image_features) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + else: + inputs_embeds = 
self.text_model.get_input_embeddings(input_ids) + input_ids = None + + hidden_states = self.text_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) +@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + # vision_model + "fc1", + "fc2", + "out_proj", + # text_model + "qkv_proj", # same name with vision encoder + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + + self.model = Idefics3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self.image_token_id = self.config.image_token_id + + self.lm_head = ParallelLMHead( + config.text_config.vocab_size, + config.text_config.hidden_size, + quant_config=quant_config, + ) + if self.config.text_config.tie_word_embeddings: + self.lm_head.weight = self.model.text_model.wte.weight + self.logits_processor = LogitsProcessor(config.text_config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + **kwargs, + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model.text_model", + connector="model.connector", + tower_model="model.vision_model") diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 8d2d422f9891c..7bb43beff255c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -11,9 +11,8 @@ if TYPE_CHECKING: from vllm.attention import AttentionMetadata - from vllm.config import CacheConfig + from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import PoolerOutput - from 
vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -39,10 +38,8 @@ class VllmModel(Protocol[C_co, T_co]): def __init__( self, - config: C_co, - *, - cache_config: Optional["CacheConfig"], - quant_config: Optional["QuantizationConfig"], + vllm_config: "VllmConfig", + prefix: str = "", ) -> None: ... @@ -58,20 +55,7 @@ def forward( def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: model_init = model.__init__ - vllm_kws = ("cache_config", "quant_config") - missing_kws = tuple(kw for kw in vllm_kws - if not supports_kw(model_init, kw)) - - if missing_kws and (isinstance(model, type) - and issubclass(model, nn.Module)): - logger.warning( - "The model (%s) is missing " - "vLLM-specific keywords from its initializer: %s", - model, - missing_kws, - ) - - return len(missing_kws) == 0 + return supports_kw(model_init, "vllm_config") def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 313d98b649b48..21fa6983063b8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from functools import partial from typing import Any, Dict, Iterable, List, Optional, Tuple, Union @@ -8,7 +7,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -21,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -251,14 +250,13 @@ def forward( @support_torch_compile class InternLM2Model(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -318,19 +316,13 @@ def forward( class InternLM2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = 
InternLM2Model(config, - cache_config, - quant_config, + self.model = InternLM2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, config.hidden_size, @@ -339,7 +331,7 @@ def __init__( if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index edd867e4b6457..34889d691a934 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- from typing import List, Optional, Tuple, Union import torch @@ -6,7 +5,7 @@ from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig @@ -105,14 +104,13 @@ def forward( class InternLM2VEModel(InternLM2Model): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, cache_config, quant_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: InternLM2VEDecoderLayer( @@ -160,14 +158,13 @@ def forward( class InternLM2VEForCausalLM(InternLM2ForCausalLM): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, cache_config, quant_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.model = InternLM2VEModel(config, cache_config, quant_config, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 1c1fde5b30983..07165ea688f94 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -16,17 +16,16 @@ from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.quantization import (AWQConfig, QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from 
vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -35,7 +34,7 @@ get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) IMG_START = '' IMG_END = '' @@ -346,7 +345,7 @@ def input_mapper( # we can't stack here because images may have different num_patches data = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -355,7 +354,7 @@ def input_mapper( add_special_tokens=False, return_tensors="pt")[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": data, "image_token_id": image_token_id }) @@ -379,7 +378,7 @@ def dummy_data( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( hf_config.vision_config, seq_len, num_images, @@ -398,7 +397,7 @@ def dummy_data( image_height_override=max_image_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) @@ -410,13 +409,13 @@ def dummy_data( @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config self.multimodal_config = multimodal_config self._patch_quant_config(config, quant_config) @@ -435,14 +434,13 @@ def __init__(self, config, quant_config=quant_config, is_mono=self.is_mono, - prefix="vision_model", + prefix=maybe_prefix(prefix, "vision_model"), ) self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.mlp1 = self._init_mlp1(config) @@ -467,7 +465,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _init_vision_model( self, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index b947f24a693b5..4dc9271703a8d 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/inceptionai/jais-30b-chat-v3/blob/main/modeling_jais.py # Copyright 2023 The vLLM team. 
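[Editor's note] The same mechanical change repeats in every model touched from here on (internvl, jais, jamba, llama, llava, ...): the old `(config, cache_config, quant_config, ...)` constructors collapse into a single keyword-only `vllm_config` plus a `prefix`, with sub-configs unpacked inside the body. A minimal sketch of the new contract, using simplified stand-in config classes rather than the real vLLM types:

from dataclasses import dataclass, field
from typing import Any, Optional


@dataclass
class _ModelConfig:
    hf_config: Any = None


@dataclass
class _VllmConfig:
    model_config: _ModelConfig = field(default_factory=_ModelConfig)
    cache_config: Optional[Any] = None
    quant_config: Optional[Any] = None
    lora_config: Optional[Any] = None


class ExampleForCausalLM:

    def __init__(self, *, vllm_config: _VllmConfig, prefix: str = "") -> None:
        # Sub-configs are pulled out of the single vllm_config object instead
        # of being separate positional parameters, so every model class can
        # share one constructor signature.
        self.config = vllm_config.model_config.hf_config
        self.quant_config = vllm_config.quant_config
        self.prefix = prefix


model = ExampleForCausalLM(vllm_config=_VllmConfig())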
@@ -27,7 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -35,7 +34,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -45,7 +44,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class SwiGLUActivation(nn.Module): @@ -216,14 +216,13 @@ def forward( @support_torch_compile class JAISModel(nn.Module): - def __init__( - self, - config: JAISConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx @@ -289,14 +288,17 @@ class JAISLMHeadModel(nn.Module, SupportsPP): def __init__( self, - config: JAISConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = JAISModel(config, cache_config, quant_config) + self.transformer = JAISModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: @@ -309,7 +311,7 @@ def __init__( config.mup_width_scale) self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size, scale=self.output_logits_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index fddd39fb8c85b..88fb8d5cf555a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """Inference-only Jamba model.""" from typing import Iterable, List, Optional, Tuple @@ -8,212 +7,33 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import 
(ColumnParallelLinear, - MergedColumnParallelLinear, - QKVParallelLinear, +from vllm.model_executor.layers.linear import (QKVParallelLinear, ReplicatedLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( - causal_conv1d_fn, causal_conv1d_update) -from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( - selective_scan_fn, selective_state_update) +from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import ( - composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) from .interfaces import HasInnerState, SupportsLoRA +from .utils import maybe_prefix KVCache = Tuple[torch.Tensor, torch.Tensor] -# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer -class JambaMambaMixer(nn.Module): - """ - Compute ∆, A, B, C, and D the state space parameters and compute - the `contextualized_states`. A, D are input independent - (see Mamba paper [1] Section 3.5.2 "Interpretation of A" - for why A isn't selective) ∆, B, C are input-dependent - (this is a key difference between Mamba and the linear time - invariant S4, and is why Mamba is called - **selective** state spaces) - """ - - def __init__(self, config: JambaConfig): - super().__init__() - self.config = config - self.hidden_size = config.hidden_size - self.ssm_state_size = config.mamba_d_state - self.conv_kernel_size = config.mamba_d_conv - self.intermediate_size = config.mamba_expand * config.hidden_size - self.time_step_rank = config.mamba_dt_rank - self.use_conv_bias = config.mamba_conv_bias - self.use_bias = config.mamba_proj_bias - self.conv1d = ColumnParallelLinear( - input_size=self.conv_kernel_size, - output_size=self.intermediate_size, - bias=self.use_conv_bias, - ) - # unsqueeze to fit conv1d weights shape into the linear weights shape. - # Can't do this in `weight_loader` since it already exists in - # `ColumnParallelLinear` and `set_weight_attrs` - # doesn't allow to override it - self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) - - self.in_proj = MergedColumnParallelLinear(self.hidden_size, - [self.intermediate_size] * 2, - bias=self.use_bias) - # selective projection used to make dt, B and C input dependent - self.x_proj = RowParallelLinear( - self.intermediate_size, - self.time_step_rank + self.ssm_state_size * 2, - bias=False, - ) - # time step projection (discretization) - - # In the forward we need to apply dt_proj without the bias, - # as the bias is added in the selective scan kernel. 
- self.dt_proj = ColumnParallelLinear(self.time_step_rank, - self.intermediate_size, - bias=True, - skip_bias_add=True) - - tp_size = get_tensor_model_parallel_world_size() - self.A = nn.Parameter( - torch.empty( - self.intermediate_size // tp_size, - self.ssm_state_size, - dtype=torch.float32, - )) - self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) - - set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) - a_weight_loader = composed_weight_loader( - sharded_weight_loader(0), lambda x: -torch.exp(x.float())) - set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) - - self.out_proj = RowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=self.use_bias, - input_is_parallel=True, - ) - self.activation = config.hidden_act - - self.dt_layernorm = RMSNorm(self.time_step_rank, - eps=config.rms_norm_eps) - self.b_layernorm = RMSNorm(self.ssm_state_size, - eps=config.rms_norm_eps) - self.c_layernorm = RMSNorm(self.ssm_state_size, - eps=config.rms_norm_eps) - - def forward(self, hidden_states: torch.Tensor, - attn_metadata: AttentionMetadata, - mamba_cache_params: MambaCacheParams): - - # 1. Gated MLP's linear projection - projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) - hidden_states, gate = projected_states.chunk(2, dim=-2) - - # 2. Convolution sequence transformation - conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), - self.conv1d.weight.size(2)) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - hidden_states = causal_conv1d_fn( - hidden_states, - conv_weights, - self.conv1d.bias, - activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=attn_metadata.context_lens_tensor > 0, - cache_indices=mamba_cache_params.state_indices_tensor, - query_start_loc=attn_metadata.query_start_loc) - else: - hidden_states = causal_conv1d_update( - hidden_states.transpose(0, 1), - mamba_cache_params.conv_state, - conv_weights, - self.conv1d.bias, - self.activation, - conv_state_indices=mamba_cache_params.state_indices_tensor) - hidden_states = hidden_states.transpose(0, 1) - - # 3. State Space Model sequence transformation - # 3.a. 
input varying initialization of time_step, B and C - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] - - time_step, B, C = torch.split( - ssm_parameters, - [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], - dim=-1, - ) - time_step = self.dt_layernorm(time_step.contiguous()) - B = self.b_layernorm(B.contiguous()) - C = self.c_layernorm(C.contiguous()) - - discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) - # 3.c perform the recurrence y ← SSM(A, B, C)(x) - time_proj_bias = (self.dt_proj.bias.float() if hasattr( - self.dt_proj, "bias") else None) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - scan_outputs = selective_scan_fn( - hidden_states, - mamba_cache_params.ssm_state, - discrete_time_step, - self.A, - B.transpose(-2, -1), - C.transpose(-2, -1), - self.D.float(), - gate, - time_proj_bias, - delta_softplus=True, - cache_indices=mamba_cache_params.state_indices_tensor, - has_initial_state=attn_metadata.context_lens_tensor > 0, - query_start_loc=attn_metadata.query_start_loc) - else: - scan_outputs = selective_state_update( - mamba_cache_params.ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), - self.A, - B, - C, - self.D, - gate.transpose(0, 1), - time_proj_bias, - dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor) - scan_outputs = scan_outputs.transpose(0, 1) - - # 4. Final linear projection - contextualized_states = self.out_proj(scan_outputs.transpose(-2, - -1))[0] - return contextualized_states - - class JambaMoE(nn.Module): def __init__(self, @@ -284,9 +104,18 @@ def __init__(self, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None) -> None: super().__init__() - self.layer_idx = layer_idx self.config = config - self.mamba = JambaMambaMixer(config) + self.mamba = MambaMixer(hidden_size= config.hidden_size, + ssm_state_size = config.mamba_d_state, + conv_kernel_size = config.mamba_d_conv, + intermediate_size = config.mamba_expand *\ + config.hidden_size, + time_step_rank = config.mamba_dt_rank, + use_conv_bias = config.mamba_conv_bias, + use_bias = config.mamba_proj_bias, + use_rms_norm=True, + rms_norm_eps=config.rms_norm_eps, + activation=config.hidden_act) num_experts = config.layers_num_experts[layer_idx] ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP @@ -430,14 +259,14 @@ def forward( class JambaModel(nn.Module): - def __init__( - self, - config: JambaConfig, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -520,24 +349,19 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: JambaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = 
vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Jamba currently does not support prefix caching" super().__init__() self.config = config self.scheduler_config = scheduler_config - self.model = JambaModel(config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + self.model = JambaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -555,7 +379,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8a9e5203972be..8aed0fead18f9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -43,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -272,15 +271,14 @@ def forward( @support_torch_compile class LlamaModel(nn.Module): - def __init__( - self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -464,8 +462,6 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj.", ".o_proj.", ] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -495,24 +491,16 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "norm": "model.norm" } - def __init__( - self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - 
lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - pooler_config: Optional[PoolerConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config - self.model = LlamaModel(config, - cache_config, - quant_config, - lora_config=lora_config, + self.model = LlamaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size @@ -539,7 +527,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( @@ -550,6 +538,9 @@ def __init__( normalize=False, softmax=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -557,9 +548,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( @@ -630,7 +623,7 @@ def permute(w: torch.Tensor, n_heads: int): return name, loaded_weight -class LlamaEmbeddingModel(nn.Module, SupportsPP): +class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): """ A model that uses Llama with additional embedding functionalities. @@ -641,15 +634,27 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP): model: An instance of LlamaModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. """ + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } - def __init__( - self, - pooler_config: Optional[PoolerConfig] = None, - **kwargs, - ) -> None: + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + } + embedding_padding_modules = [] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.model = LlamaModel(**kwargs) + pooler_config = vllm_config.model_config.pooler_config + + self.model = LlamaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.LAST, @@ -682,3 +687,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) + + # LRUCacheWorkerLoRAManager instantiation requires model config. 
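[Editor's note] The property added just below exists only so that the wrapper class still exposes a `config` attribute after the refactor, for the LoRA manager named in the comment above. A toy illustration of the delegation (class names here are ours):

class _InnerModel:

    def __init__(self) -> None:
        self.config = {"hidden_size": 4096}


class _EmbeddingWrapper:
    """Owns an inner model and forwards its config, mirroring the pattern."""

    def __init__(self) -> None:
        self.model = _InnerModel()

    @property
    def config(self):
        # Callers that expect `wrapper.config` keep working unchanged.
        return self.model.config


assert _EmbeddingWrapper().config["hidden_size"] == 4096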
+ @property + def config(self): + return self.model.config diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index eda99c029881f..005ae5e03cfed 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -9,13 +9,15 @@ PretrainedConfig, SiglipVisionConfig) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -31,7 +33,7 @@ dummy_seq_data_for_siglip, get_max_siglip_image_tokens, input_processor_for_siglip) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) class LlavaImagePixelInputs(TypedDict): @@ -111,7 +113,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, image_feature_size = get_max_llava_image_tokens(ctx) if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_images, @@ -120,9 +122,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_clip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -131,9 +133,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_siglip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, PixtralVisionConfig): - seq_data = dummy_seq_data_for_pixtral_hf( + seq_data, ranges = dummy_seq_data_for_pixtral_hf( vision_config, seq_len, num_images, @@ -142,7 +144,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_pixtral_hf(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -257,13 +259,13 @@ def init_vision_tower_for_llava( @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config self.multimodal_config = multimodal_config @@ -281,7 
+283,7 @@ def __init__(self, config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, @@ -289,9 +291,8 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -301,7 +302,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size @@ -448,6 +449,25 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -455,6 +475,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LLaVA-1.5. 
@@ -493,24 +514,13 @@ def forward( :class:`LlavaImageInputs` """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index e8540d85ff565..0b621a23ec980 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -11,11 +11,11 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig -from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext) from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -31,7 +31,7 @@ dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, - init_vllm_registered_model) + init_vllm_registered_model, maybe_prefix) class LlavaNextImagePixelInputs(TypedDict): @@ -180,7 +180,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, max_feat_height, max_feat_width = pinpoint if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_images, @@ -195,9 +195,9 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, image_height_override=max_feat_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -212,7 +212,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, image_height_override=max_feat_height, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -281,13 +281,12 @@ def input_processor_for_llava_next(ctx: InputContext, class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaNextConfig, - multimodal_config: MultiModalConfig, - cache_config: 
Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -297,7 +296,7 @@ def __init__(self, config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( @@ -307,9 +306,8 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) # The same model class supports both language generation and embedding # because the architecture name is the same @@ -326,7 +324,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -605,7 +603,6 @@ def forward( :class:`LlavaNextImageInputs` """ if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: image_input = self._parse_and_validate_image_input(**kwargs) @@ -617,9 +614,14 @@ def forward( self.language_model.model.get_input_embeddings, lambda _: self._process_image_input(image_input), ) - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.language_model.model.get_input_embeddings( + input_ids) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b8051d5fc6ae2..b030c2f5fdc47 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -10,12 +10,11 @@ SiglipVisionConfig) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -30,7 +29,7 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip) from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # For profile run _MAX_FRAMES_PER_VIDEO = 32 @@ -108,33 +107,35 @@ def dummy_data_for_llava_next_video(ctx: InputContext, 
seq_len: int, video_feature_size = frames_per_video * tokens_per_frame if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, + mm_key="video", ) pil_frame = dummy_image_for_clip(vision_config, num_images=1) np_frame = np.array(pil_frame["image"]) mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) mm_data = {"video": mm_data_per_video} - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, + mm_key="video", ) pil_frame = dummy_image_for_siglip(vision_config, num_images=1) np_frame = np.array(pil_frame["image"]) mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) mm_data = {"video": mm_data_per_video} - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -145,6 +146,12 @@ def input_processor_for_llava_next_video(ctx: InputContext, multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "video" not in multi_modal_data: return inputs + + if "multi_modal_placeholders" in inputs and "video" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + video_data = multi_modal_data["video"] model_config = ctx.model_config @@ -160,7 +167,7 @@ def input_processor_for_llava_next_video(ctx: InputContext, tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -170,7 +177,8 @@ def input_processor_for_llava_next_video(ctx: InputContext, return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) elif is_list_of(video_data, np.ndarray): raise NotImplementedError( @@ -245,12 +253,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaNextVideoConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -260,7 +267,7 @@ def __init__(self, config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, @@ -268,9 +275,8 @@ def __init__(self, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( 
config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) @@ -280,7 +286,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_video_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index a0cf208a65f36..c129f140d8d12 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -14,12 +14,11 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import (cached_get_tokenizer, @@ -36,7 +35,7 @@ dummy_video_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # Result in the max possible feature size (2x2 grid of 336x336px tiles) MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 @@ -218,31 +217,31 @@ def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) if isinstance(vision_config, CLIPVisionConfig): - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, - ) + mm_key="video") mm_data = dummy_video_for_clip(vision_config, num_frames=num_frames, num_videos=num_videos) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) elif isinstance(vision_config, SiglipVisionConfig): - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_videos, image_token_id=hf_config.video_token_index, image_feature_size_override=video_feature_size, - ) + mm_key="video") mm_data = dummy_video_for_siglip(vision_config, num_frames=num_frames, num_videos=num_videos) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -320,7 +319,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, 
inputs.get("prompt"), inputs["prompt_token_ids"], @@ -330,7 +329,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) elif is_list_of(video_data, np.ndarray): video_feature_size = [] @@ -340,7 +340,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, get_llava_onevision_video_tokens(ctx, num_frames)) tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -349,7 +349,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, ) return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) else: raise TypeError(f"Invalid video type: {type(video_data)}") @@ -403,12 +404,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaOnevisionConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -418,13 +418,12 @@ def __init__(self, config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) @@ -436,7 +435,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 9f4f391a6682e..55c575e22a0f6 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,4 +1,3 @@ -# coding=utf-8 """PyTorch MAMBA model.""" from typing import Iterable, List, Optional, Tuple @@ -7,225 +6,52 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (ColumnParallelLinear, - MergedColumnParallelLinear, - RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( - causal_conv1d_fn, 
causal_conv1d_update) -from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( - selective_scan_fn, selective_state_update) +from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import ( - composed_weight_loader, default_weight_loader, sharded_weight_loader) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import (HasInnerState, IsAttentionFree) from vllm.model_executor.models.mamba_cache import (MambaCacheManager, MambaCacheParams) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.model_executor.utils import set_weight_attrs from vllm.sequence import IntermediateTensors from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) -KVCache = Tuple[torch.Tensor, torch.Tensor] - - -# Adapted from transformers.models.mamba.modeling_mamba.MambaMixer -class MambaMixer(nn.Module): - """ - Compute ∆, A, B, C, and D the state space parameters and compute - the `contextualized_states`. A, D are input independent - (see Mamba paper [1] Section 3.5.2 "Interpretation of A" - for why A isn't selective) ∆, B, C are input-dependent - (this is a key difference between Mamba and the linear time - invariant S4, and is why Mamba is called - **selective** state spaces) - """ +from .utils import maybe_prefix - def __init__(self, config: MambaConfig, layer_idx): - super().__init__() - self.config = config - self.layer_idx = layer_idx - self.hidden_size = config.hidden_size - self.ssm_state_size = config.state_size - self.conv_kernel_size = config.conv_kernel - self.intermediate_size = config.intermediate_size - self.time_step_rank = int(config.time_step_rank) - self.is_falcon_mamba = config.model_type == "falcon_mamba" - self.conv1d = ColumnParallelLinear( - input_size=self.conv_kernel_size, - output_size=self.intermediate_size, - bias=config.use_conv_bias, - ) - # unsqueeze to fit conv1d weights shape into the linear weights shape. - # Can't do this in `weight_loader` since it already exists in - # `ColumnParallelLinear` and `set_weight_attrs` - # doesn't allow to override it - self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) - - self.in_proj = MergedColumnParallelLinear(self.hidden_size, - [self.intermediate_size] * 2, - bias=config.use_bias) - # selective projection used to make dt, B and C input dependent - self.x_proj = RowParallelLinear( - self.intermediate_size, - self.time_step_rank + self.ssm_state_size * 2, - bias=False, - ) - # time step projection (discretization) - - # In the forward we need to apply dt_proj without the bias, - # as the bias is added in the selective scan kernel. 
- self.dt_proj = ColumnParallelLinear(self.time_step_rank, - self.intermediate_size, - bias=True, - skip_bias_add=True) - - tp_size = get_tensor_model_parallel_world_size() - self.A = nn.Parameter( - torch.empty( - self.intermediate_size // tp_size, - self.ssm_state_size, - dtype=torch.float32, - )) - self.D = nn.Parameter(torch.ones(self.intermediate_size // tp_size)) - - set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)}) - a_weight_loader = composed_weight_loader( - sharded_weight_loader(0), lambda x: -torch.exp(x.float())) - set_weight_attrs(self.A, {"weight_loader": a_weight_loader}) - - self.out_proj = RowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=config.use_bias, - input_is_parallel=True, - ) - self.activation = config.hidden_act - if self.is_falcon_mamba: - self.dt_layernorm = RMSNorm(self.time_step_rank, - eps=config.mixer_rms_eps) - self.b_layernorm = RMSNorm(self.ssm_state_size, - eps=config.mixer_rms_eps) - self.c_layernorm = RMSNorm(self.ssm_state_size, - eps=config.mixer_rms_eps) - - def forward(self, hidden_states: torch.Tensor, - attn_metadata: AttentionMetadata, - mamba_cache_params: MambaCacheParams): - - # 1. Gated MLP's linear projection - projected_states = self.in_proj(hidden_states)[0].transpose(-2, -1) - hidden_states, gate = projected_states.chunk(2, dim=-2) - - # 2. Convolution sequence transformation - conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), - self.conv1d.weight.size(2)) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ---------------------| - # |-- query_len ---| - hidden_states = causal_conv1d_fn( - hidden_states, - conv_weights, - self.conv1d.bias, - activation=self.activation, - conv_states=mamba_cache_params.conv_state, - has_initial_state=attn_metadata.context_lens_tensor > 0, - cache_indices=mamba_cache_params.state_indices_tensor, - query_start_loc=attn_metadata.query_start_loc) - else: - hidden_states = causal_conv1d_update( - hidden_states.transpose(0, 1), - mamba_cache_params.conv_state, - conv_weights, - self.conv1d.bias, - self.activation, - conv_state_indices=mamba_cache_params.state_indices_tensor) - hidden_states = hidden_states.transpose(0, 1) - - # 3. State Space Model sequence transformation - # 3.a. input varying initialization of time_step, B and C - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] - - time_step, B, C = torch.split( - ssm_parameters, - [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], - dim=-1, - ) - # Note that Jamba and FalconMamba normalizes B, C, and time_step here - # but Mamba doesn't. 
- if self.is_falcon_mamba: - time_step = self.dt_layernorm(time_step.contiguous()) - B = self.b_layernorm(B.contiguous()) - C = self.c_layernorm(C.contiguous()) - - discrete_time_step = self.dt_proj(time_step)[0].transpose(-2, -1) - # 3.c perform the recurrence y ← SSM(A, B, C)(x) - time_proj_bias = (self.dt_proj.bias.float() if hasattr( - self.dt_proj, "bias") else None) - - if attn_metadata.query_start_loc is not None \ - and attn_metadata.context_lens_tensor is not None: - scan_outputs = selective_scan_fn( - hidden_states, - mamba_cache_params.ssm_state, - discrete_time_step, - self.A, - B.transpose(-2, -1), - C.transpose(-2, -1), - self.D.float(), - gate, - time_proj_bias, - delta_softplus=True, - cache_indices=mamba_cache_params.state_indices_tensor, - has_initial_state=attn_metadata.context_lens_tensor > 0, - query_start_loc=attn_metadata.query_start_loc) - else: - scan_outputs = selective_state_update( - mamba_cache_params.ssm_state, - hidden_states.transpose(0, 1), - discrete_time_step.transpose(0, 1), - self.A, - B, - C, - self.D, - gate.transpose(0, 1), - time_proj_bias, - dt_softplus=True, - state_batch_indices=mamba_cache_params.state_indices_tensor) - scan_outputs = scan_outputs.transpose(0, 1) - - # 4. Final linear projection - contextualized_states = self.out_proj(scan_outputs.transpose(-2, - -1))[0] - return contextualized_states +KVCache = Tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): def __init__(self, config: MambaConfig, - layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None) -> None: super().__init__() - self.layer_idx = layer_idx self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" - self.mixer = MambaMixer(config, layer_idx) + mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None + self.mixer = MambaMixer(hidden_size=config.hidden_size, + ssm_state_size=config.state_size, + conv_kernel_size=config.conv_kernel, + intermediate_size=config.intermediate_size, + time_step_rank=config.time_step_rank, + use_conv_bias=config.use_conv_bias, + use_bias=config.use_bias, + use_rms_norm=self.is_falcon_mamba, + rms_norm_eps=mixer_rms_eps, + activation=config.hidden_act) + self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) def forward( @@ -249,14 +75,14 @@ def forward( class MambaModel(nn.Module): - def __init__( - self, - config: MambaConfig, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -274,7 +100,6 @@ def __init__( for i in range(config.num_hidden_layers): decoder_layers.append( MambaDecoderLayer(config, - layer_idx=i, cache_config=cache_config, quant_config=quant_config)) self.layers = nn.ModuleList(decoder_layers) @@ -307,24 +132,19 @@ def forward( class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): - def __init__( - self, - config: MambaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, - ) 
-> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Mamba does not support prefix caching" super().__init__() self.config = config self.scheduler_config = scheduler_config - self.backbone = MambaModel(config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + self.backbone = MambaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "backbone")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -346,7 +166,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 619a5cd00d6b6..4cb1b4a929b9f 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -3,13 +3,13 @@ import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.transformers_utils.configs.medusa import MedusaConfig class ResidualBlock(nn.Module): @@ -44,7 +44,8 @@ class Medusa(nn.Module): in the draft checkpoint (using key token_map). Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, config: MedusaConfig, **_) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + config = vllm_config.model_config.hf_config super().__init__() self.config = config self.blocks = nn.ModuleList([ diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 03fb036020f2f..2db953329fd91 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
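Every model touched by this patch repeats the same constructor migration: the per-component arguments (config, cache_config, quant_config, lora_config, ...) are replaced by a single keyword-only vllm_config, and Sampler() is swapped for get_sampler(). Below is a minimal sketch of the target shape, not part of the patch; MyModel is a hypothetical stand-in, and only names that appear in the hunks above are used.

import torch.nn as nn

from vllm.config import VllmConfig
from vllm.model_executor.layers.sampler import get_sampler


class MyModel(nn.Module):

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        super().__init__()
        # Sub-configs are unpacked from the single VllmConfig argument,
        # mirroring the Mamba/Medusa/MiniCPM hunks above.
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        lora_config = vllm_config.lora_config

        self.config = config
        self.cache_config = cache_config
        self.quant_config = quant_config
        self.lora_config = lora_config

        # get_sampler() replaces direct Sampler() construction so the
        # sampler implementation can be selected centrally.
        self.sampler = get_sampler()

One practical consequence, visible in the hunks above, is that adding a new engine-level config no longer requires touching every model signature: models simply read what they need from vllm_config.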
@@ -30,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -44,7 +43,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -54,7 +53,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MiniCPMMoE(nn.Module): @@ -352,15 +352,14 @@ def forward( @support_torch_compile class MiniCPMModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.cache_config = cache_config self.quant_config = quant_config @@ -462,22 +461,22 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.prefix = prefix + self.vllm_config = vllm_config self.config = config self.lora_config = lora_config self.cache_config = cache_config self.quant_config = quant_config self.num_experts = getattr(self.config, "num_experts", 0) - self._init_model() + self._init_model(vllm_config=vllm_config, prefix=prefix) unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -497,15 +496,13 @@ def __init__( self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def _init_model(self): - self.model = MiniCPMModel(config=self.config, - cache_config=self.cache_config, - quant_config=self.quant_config, - lora_config=self.lora_config) + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): + self.model = MiniCPMModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) def forward( self, diff --git 
a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 3b5fd95328d74..278c4bbe6e563 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2024 The ModelBest team. @@ -29,7 +28,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -41,7 +40,7 @@ MiniCPMForCausalLM, MiniCPMModel) -from .utils import make_layers +from .utils import make_layers, maybe_prefix class MiniCPM3Attention(nn.Module): @@ -239,8 +238,6 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM - def _init_model(self): - self.model = MiniCPM3Model(config=self.config, - cache_config=self.cache_config, - quant_config=self.quant_config, - lora_config=self.lora_config) + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): + self.model = MiniCPM3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index a270282d87bc8..999739ccd98bf 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
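MiniCPM3 above now only overrides _init_model, relying on maybe_prefix (imported from .utils) to thread the module prefix down to its submodules. The helper's body is not shown in this patch; the sketch below is an assumed equivalent, included only to illustrate the weight-name scheme the prefix= plumbing is meant to produce.

def maybe_prefix(prefix: str, name: str) -> str:
    # Assumed behaviour: join with a dot only when an outer prefix exists.
    return name if not prefix else f"{prefix}.{name}"


# Standalone model: parameters keep names like "model.layers.0.self_attn...".
assert maybe_prefix("", "model") == "model"
# The same model wrapped inside a multimodal model: names become
# "language_model.model.layers.0.self_attn...", which is what keeps
# quantization and LoRA target matching consistent after the refactor.
assert maybe_prefix("language_model", "model") == "language_model.model"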
@@ -35,14 +34,14 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -52,15 +51,14 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import is_pp_missing_parameter +from .utils import is_pp_missing_parameter, maybe_prefix _KEYS_TO_MODIFY_MAPPING = { "llm.lm_head": "lm_head", @@ -131,16 +129,22 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): class Resampler2_5(BaseResampler): - def __init__( - self, - num_queries: int, - embed_dim: int, - num_heads: int, - kv_dim: Optional[int] = None, - norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: Tuple[int, int] = (70, 70), - ) -> None: - super().__init__(num_queries, embed_dim, num_heads, kv_dim, norm_layer) + def __init__(self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + max_size: Tuple[int, int] = (70, 70), + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> None: + super().__init__(num_queries, + embed_dim, + num_heads, + kv_dim, + norm_layer, + quant_config=quant_config, + prefix=prefix) self.max_size = max_size self._set_2d_pos_cache(self.max_size) @@ -277,7 +281,7 @@ def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int, seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images) mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data) def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): @@ -369,7 +373,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): batch_data["slice_start_id"] = data[0]["slice_start_id"] batch_data["slice_end_id"] = data[0]["slice_end_id"] - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): @@ -380,11 +384,12 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__( self, - config: 
PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config + quant_config = vllm_config.quant_config super().__init__() # All MiniCPM-V models disable `tie_word_embeddings` but # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot @@ -394,25 +399,30 @@ def __init__( self.multimodal_config = multimodal_config self.version = get_version_by_config(self.config) - self.llm = self.init_llm(config, - cache_config, - quant_config, - prefix="llm") - self.vpm = self.init_vision_module(config, quant_config, prefix="vpm") + self.llm = self.init_llm(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "llm")) + self.vpm = self.init_vision_module(config, + quant_config, + prefix=maybe_prefix(prefix, "vpm")) param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else self.vpm.embeddings.embed_dim) self.embed_dim = self.config.hidden_size - self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.resampler = self.init_resampler(self.embed_dim, + self.vision_dim, + quant_config=quant_config, + prefix=maybe_prefix( + prefix, "resampler")) self.resampler.to(device="cuda", dtype=param_dtype) # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix="llm.lm_head") + prefix=maybe_prefix( + prefix, "llm.lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) @@ -564,8 +574,13 @@ def forward( vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None + output = self.llm( - input_ids=None, + input_ids=input_ids, positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, @@ -646,9 +661,7 @@ def get_mm_mapping(self) -> MultiModelKeys: def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: raise NotImplementedError @@ -661,7 +674,11 @@ def init_vision_module( ) -> nn.Module: raise NotImplementedError - def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: + def init_resampler(self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: raise NotImplementedError def get_vision_embedding( @@ -684,26 +701,18 @@ class MiniCPMV2_0(MiniCPMVBaseModel): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 0) def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: 
VllmConfig, prefix: str = "", ) -> nn.Module: - - return LLMWrapper(MiniCPMModel(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(MiniCPMModel(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -738,16 +747,21 @@ def init_vision_module( def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.embed_tokens(input_ids) - def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: + def init_resampler(self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: with set_default_torch_dtype(torch.float16): - resampler = Resampler2( - embed_dim=embed_dim, - num_heads=embed_dim // 128, - grid_size=int(math.sqrt(self.config.query_num)), - kv_dim=vision_dim, - adaptive=False, - do_post_projection=True, - ) + resampler = Resampler2(embed_dim=embed_dim, + num_heads=embed_dim // 128, + grid_size=int( + math.sqrt(self.config.query_num)), + kv_dim=vision_dim, + adaptive=False, + do_post_projection=True, + quant_config=quant_config, + prefix=prefix) return resampler @@ -810,31 +824,53 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): # resampler "kv_proj", ] + + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + # vision encoder + ".fc1.", + ".fc2.", + # Currently, vllm does not support BNB quantization for the `out_proj` + # of the resampler, so it's necessary to distinguish between the + # vision encoder and the resampler's out_proj. The same applies to + # MiniCPMV2_6. + ".self_attn.out_proj.", # vision encoder out_proj + # resampler + ".kv_proj.", + ] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + embedding_modules = {} embedding_padding_modules = [] def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 5) def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(LlamaModel(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(LlamaModel(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -850,14 +886,18 @@ def init_vision_module( model.encoder.layers = model.encoder.layers[:-1] return model - def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: + def init_resampler(self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: with set_default_torch_dtype(torch.float16): - resampler = Resampler2_5( - num_queries=self.config.query_num, - embed_dim=embed_dim, - num_heads=embed_dim // 128, - kv_dim=vision_dim, - ) + resampler = Resampler2_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + 
num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) return resampler def get_vision_embedding( @@ -931,31 +971,48 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): "kv_proj", ] + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + # vision encoder + ".fc1.", + ".fc2.", + ".self_attn.out_proj.", + # resampler + ".kv_proj.", + ] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + embedding_modules = {} embedding_padding_modules = [] def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 6) def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - - return LLMWrapper(Qwen2Model(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(Qwen2Model(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -971,15 +1028,19 @@ def init_vision_module( model.encoder.layers = model.encoder.layers[:-1] return model - def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: + def init_resampler(self, + embed_dim: int, + vision_dim: int, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "") -> nn.Module: with set_default_torch_dtype(torch.float16): # The resampler in 2.6 remains consistent with the one in 2.5. 
- resampler = Resampler2_5( - num_queries=self.config.query_num, - embed_dim=embed_dim, - num_heads=embed_dim // 128, - kv_dim=vision_dim, - ) + resampler = Resampler2_5(num_queries=self.config.query_num, + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + quant_config=quant_config, + prefix=prefix) return resampler def get_vision_embedding( @@ -1056,12 +1117,8 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __new__(cls, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: version = (2, 0) @@ -1075,5 +1132,4 @@ def __new__(cls, if instance_class is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(config, multimodal_config, cache_config, - quant_config) + return instance_class(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1514243ad59c9..3eb2f60fd4fc7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -39,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -49,7 +48,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MixtralMoE(nn.Module): @@ -249,15 +249,14 @@ def forward( @support_torch_compile class MixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = 
(lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -333,23 +332,16 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = MixtralModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = MixtralModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -367,7 +359,7 @@ def __init__( self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 63e2c60a84271..95cfb6f54dc10 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. @@ -30,7 +29,7 @@ from transformers import MixtralConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -41,7 +40,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -50,7 +49,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MixtralMLP(nn.Module): @@ -294,14 +294,13 @@ def forward( class MixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -351,23 +350,21 @@ def forward( class MixtralForCausalLM(nn.Module, SupportsPP): 
fall_back_to_pt_during_load = False - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, cache_config, quant_config) + self.model = MixtralModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 5cf5272cae878..db7ee7b2d8537 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 the HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -33,11 +32,13 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.attention.backends.xformers import XFormersMetadata from vllm.attention.ops.paged_attn import PagedAttention -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputContext) +from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, + InputContext, TokenInputs, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -45,17 +46,19 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SequenceData +from vllm.utils import is_list_of from .clip import CLIPMLP from .interfaces import SupportsMultiModal from .llama import LlamaDecoderLayer, LlamaMLP +from .utils import maybe_prefix logger = init_logger(__name__) MLLAMA_IMAGE_TOKEN_ID = 128256 @@ -86,41 +89,58 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: return num_images -def input_processor_for_mllama(ctx: InputContext, - inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): - # move encoder_prompt to prompt - if inputs.get("prompt") is None: - inputs["prompt"] = inputs["encoder_prompt"] - inputs["prompt_token_ids"] = 
inputs["encoder_prompt_token_ids"] +def input_processor_for_mllama( + ctx: InputContext, + inputs: EncoderDecoderInputs, +) -> EncoderDecoderInputs: + # Example input to processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000], + # }, + # } + + # move encoder prompt to decoder + dec_inputs = TokenInputs(**inputs["encoder"]) + + multi_modal_data = dec_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + # text-only + return EncoderDecoderInputs( + encoder=token_inputs([]), + decoder=dec_inputs, + ) - # process multi-modal data - multi_modal_data = inputs.get("encoder_multi_modal_data") + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_data = [image_data] - if multi_modal_data is None or "image" not in multi_modal_data \ - or multi_modal_data["image"] is None: - # text-only - inputs["encoder_prompt"] = "" - inputs["encoder_prompt_token_ids"] = [] - inputs["encoder_multi_modal_data"] = {} - return inputs + assert is_list_of(image_data, Image.Image) - if isinstance(multi_modal_data['image'], Image.Image): - multi_modal_data['image'] = [multi_modal_data['image']] # Since only the last group of consecutive images # are attended by the decoded tokens, we only need to # get the number of tiles for those images. num_decode_images = _get_num_image_in_last_group( - inputs["prompt_token_ids"]) + dec_inputs["prompt_token_ids"]) + hf_config = ctx.model_config.hf_config + vision_config = hf_config.vision_config + num_tiles = 0 - for image in multi_modal_data["image"][::-1]: + for image in image_data[::-1]: width, height = image.size - tile_size = hf_config.vision_config.image_size + tile_size = vision_config.image_size canvas_height, canvas_width = get_optimal_tiled_canvas( image_height=height, image_width=width, - max_image_tiles=hf_config.vision_config.max_num_tiles, + max_image_tiles=vision_config.max_num_tiles, tile_size=tile_size, ) num_tiles_height = canvas_height // tile_size @@ -133,14 +153,34 @@ def input_processor_for_mllama(ctx: InputContext, # Set encoder prompt length based on the number of tiles. # This tells the block manager to allocate correct number # of slots for encoder tokens. 
- assert hf_config.vision_config.image_size % 14 == 0, \ + assert vision_config.image_size % 14 == 0, \ "chunk size should be multiple of 14" - token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1 + token_per_chunk = (vision_config.image_size // 14)**2 + 1 num_tokens = num_tiles * token_per_chunk - inputs["encoder_prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens - inputs["encoder_prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens - return inputs + # Example output from processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128256, 128256, ..., 128256], + # 'prompt': '<|image|><|image|>...<|image|>', + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # } + return EncoderDecoderInputs( + encoder=token_inputs( + prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens, + prompt=MLLAMA_IMAGE_TOKEN * num_tokens, + multi_modal_data=multi_modal_data, + ), + decoder=dec_inputs, + ) def get_max_mllama_image_tokens(ctx: InputContext) -> int: @@ -176,13 +216,14 @@ def dummy_image(num_images: int, ): def dummy_decoder_data_for_mllama(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - return dummy_decoder_seq_data(seq_len, num_images), None + return DummyData(dummy_decoder_seq_data(seq_len, num_images)) def dummy_encoder_data_for_mllama(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_images = mm_counts["image"] - return dummy_encoder_seq_data(ctx, num_images), dummy_image(num_images) + return DummyData(dummy_encoder_seq_data(ctx, num_images), + dummy_image(num_images)) def _prepare_aspect_ratio_attention_mask( @@ -760,12 +801,13 @@ def forward( q = self.q_norm(q) if attention_mask is not None: - output = self.attention_with_mask(q, k, v, kv_cache, - attention_mask, - kv_range_for_decode, - attn_metadata) + output = self._attention_with_mask(q, k, v, kv_cache, + attention_mask, + kv_range_for_decode, + attn_metadata) else: - output = self.attn(q, + output = self.attn(q.view(-1, + self.num_local_heads * self.head_dim), k, v, kv_cache, @@ -774,7 +816,7 @@ def forward( out, _ = self.o_proj(output) return out - def attention_with_mask( + def _attention_with_mask( self, q: torch.Tensor, k: torch.Tensor, @@ -785,14 +827,35 @@ def attention_with_mask( attn_metadata: AttentionMetadata, ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. - if len(kv_cache.shape) == 3: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_local_key_value_heads, self.head_dim) - cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) - cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) - PagedAttention.write_to_paged_cache( - cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + if len(kv_cache.shape) > 1: + if isinstance(attn_metadata, FlashAttentionMetadata): + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + torch.ops._C_cache_ops.reshape_and_cache_flash( + cached_k, + cached_v, + kv_cache[0], + kv_cache[1], + attn_metadata. 
+ cross_slot_mapping, # type: ignore[union-attr] + "auto", + 1.0, + 1.0, + ) + elif isinstance(attn_metadata, XFormersMetadata): + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_local_key_value_heads, self.head_dim) + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + PagedAttention.write_to_paged_cache( + cached_k, cached_v, key_cache, value_cache, + attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + else: + raise ValueError( + f"Unsupported AttentionMetadata {type(attn_metadata)} " + f"class found. Expected the AttentionMetadata to " + f"be either XFormersMetadata or FlashAttentionMetadata.") + # We have to call torch.sdpa for prefill when using a # custom cross-attention mask. Because the mask is not a # standard causal mask, neither a block diagonal mask which @@ -901,15 +964,13 @@ class MllamaTextModel(nn.Module): config_class = config_mllama.MllamaTextConfig base_model_prefix = "model" - def __init__( - self, - config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config.text_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, @@ -991,18 +1052,14 @@ class MllamaForCausalLM(nn.Module): "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" ] - def __init__( - self, - config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config.text_config + quant_config = vllm_config.quant_config + self.vocab_size = config.vocab_size - self.model = MllamaTextModel(config, - cache_config, - quant_config, + self.model = MllamaTextModel(vllm_config=vllm_config, prefix=f"{prefix}.model") self.lm_head = ParallelLMHead( config.vocab_size, @@ -1055,9 +1112,12 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): ".k_proj.", ".v_proj.", ".o_proj.", + ".fc1.", + ".fc2.", + # The `multi_modal_projector` is at the top level of the model, + # so we can't add a dot in front of it. + "multi_modal_projector." 
] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -1067,12 +1127,10 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): "up_proj": ("gate_up_proj", 1), } - def __init__(self, - config: config_mllama.MllamaConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size self.max_num_tiles = config.vision_config.max_num_tiles @@ -1083,12 +1141,11 @@ def __init__(self, self.vision_model = MllamaVisionModel(config.vision_config, quant_config, - prefix="vision_model") + prefix=maybe_prefix( + prefix, "vision_model")) self.language_model = MllamaForCausalLM( - config.text_config, - cache_config=cache_config, - quant_config=quant_config, - prefix="language_model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), ) self.multi_modal_projector = ColumnParallelLinear( config.vision_config.vision_output_dim, @@ -1096,11 +1153,11 @@ def __init__(self, bias=True, quant_config=quant_config, gather_output=True, - prefix="multi_modal_projector", + prefix=maybe_prefix(prefix, "multi_modal_projector"), ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def compute_logits( self, @@ -1121,7 +1178,7 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by - # MultiModalInputs.batch, so pixel_values here can be: + # MultiModalKwargs.batch, so pixel_values here can be: # - List[List[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) # - List[torch.Tensor]: diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 42ccd01298169..6aa43f22f4c93 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -4,13 +4,13 @@ import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.transformers_utils.configs import MLPSpeculatorConfig SQRT2 = 2**0.5 @@ -37,7 +37,7 @@ def __init__( eps=1e-06, elementwise_scale_and_shift=True, ): - super(MLPSpeculatorLayerNorm, self).__init__() + super().__init__() self.elementwise_scale_and_shift = elementwise_scale_and_shift if self.elementwise_scale_and_shift: self.weight = nn.Parameter(torch.empty(normalized_shape)) @@ -65,8 +65,9 @@ class MLPSpeculator(nn.Module): https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite """ - def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: + def __init__(self, 
vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config self.n_predict = config.n_predict self.vocab_size = config.vocab_size self.emb_dim = config.emb_dim @@ -137,7 +138,7 @@ def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: self.config = config self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0) - self.sampler = Sampler() + self.sampler = get_sampler() def generate_proposals( self, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3c34227767e05..035a1e2ab7b02 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,8 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from typing import (Any, Iterable, List, Mapping, Optional, Tuple, TypedDict, - Union) +from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union import torch from einops import rearrange @@ -15,13 +14,14 @@ from vllm.attention import Attention, AttentionMetadata from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, MultiModalConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -32,11 +32,11 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -44,7 +44,8 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (get_vit_attn_backend, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) # TODO: hard-coded for now. Consider making it configurable. 
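# The MLPSpeculator hunk above is one of many in this patch that swap a
# hard-coded `Sampler()` for the `get_sampler()` factory imported alongside
# SamplerOutput. A hedged sketch of the pattern only; SpecHeadSketch is a
# made-up module for illustration, and the factory is called exactly as the
# patched code calls it (no arguments).
import torch.nn as nn
from vllm.model_executor.layers.sampler import get_sampler


class SpecHeadSketch(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        # Before this patch: self.sampler = Sampler()
        self.sampler = get_sampler()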
VIT_LAYERS = [-2, -9] @@ -713,16 +714,16 @@ def forward( return image_features +@support_torch_compile class MolmoModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embedding_size = config.embedding_size or config.vocab_size @@ -842,9 +843,10 @@ def get_max_tokens(max_crops: int, crop_patches: int, left_margin: int, def get_max_molmo_image_tokens(ctx: InputContext) -> int: - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor max_llm_image_tokens = get_max_tokens( image_processor.max_crops, @@ -863,14 +865,15 @@ def image_input_mapper_for_molmo( ctx: InputContext, data: object, ): - return MultiModalInputs(data) + return MultiModalKwargs(data) def dummy_data_for_molmo(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=ctx.model_config.trust_remote_code, + revision=ctx.model_config.code_revision) image_processor = processor.image_processor base_image_input_d = image_processor.image_patch_size @@ -911,7 +914,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long) - return dummy_seqdata, {"image": dummy_imgdata} + return DummyData(dummy_seqdata, {"image": dummy_imgdata}) def pad_images( @@ -933,11 +936,11 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): multi_modal_data = inputs.get("multi_modal_data") image = None if multi_modal_data is None else multi_modal_data.get("image") - processor = cached_get_processor(ctx.model_config.model, - trust_remote_code=True, - revision=ctx.model_config.code_revision) - model_config = ctx.model_config + processor = cached_get_processor( + ctx.model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=ctx.model_config.code_revision) tokenizer = cached_get_tokenizer( model_config.tokenizer, trust_remote_code=model_config.trust_remote_code) @@ -1021,22 +1024,19 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - config: PretrainedConfig, - multimodal_config: Optional[MultiModalConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[Mapping[str, Any]] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = 
multimodal_config vision_config = VisionBackboneConfig() self.vision_backbone = MolmoVisionBackbone(config, vision_config, quant_config) - self.model = MolmoModel(config, cache_config, quant_config) + self.model = MolmoModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if self.config.weight_tying: self.lm_head = self.model.transformer.wte @@ -1049,7 +1049,7 @@ def __init__( self.logits_processor = LogitsProcessor(config.embedding_size or config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -1117,9 +1117,9 @@ def _merge_multimodal_embeddings( batch_size * num_image * num_patch, -1).contiguous() image_input_idx = image_input_idx * valid.to(image_input_idx.dtype) - offset = torch.cat( - [seq_len.new_zeros( - (1)), seq_len.cumsum(dim=0)[:-1]], dim=0)[:, None] + offset = torch.cat([seq_len.new_zeros(1), + seq_len.cumsum(dim=0)[:-1]], + dim=0)[:, None] image_input_idx = image_input_idx + offset.to(image_input_idx.dtype) image_input_idx = image_input_idx.flatten()[:, None] mat = image_input_idx == torch.arange( @@ -1141,7 +1141,6 @@ def forward( **kwargs: object, ) -> SamplerOutput: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None else: image_input = self._parse_and_validate_image_input(**kwargs) @@ -1156,10 +1155,13 @@ def forward( image_input["image_input_idx"], image_input["seq_len"], ) - - input_ids = None else: - inputs_embeds = None + inputs_embeds = self.model.embed_tokens(input_ids) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None hidden_states = self.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index ee802030a5ef3..e15c0fe8db060 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math from typing import Iterable, List, Optional, Tuple, Union @@ -8,7 +7,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -17,7 +16,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -27,7 +26,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes( @@ -154,7 +154,7 @@ def __init__( bias=not config.no_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, intermediate_size) + self.act = get_act_fn("gelu") self.down_proj = RowParallelLinear( intermediate_size, hidden_size, @@ -208,14 +208,13 
@@ def forward( @support_torch_compile class MPTModel(nn.Module): - def __init__( - self, - config: MPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + assert config.embedding_fraction == 1.0 assert config.norm_type == "low_precision_layernorm" @@ -268,21 +267,19 @@ def forward( class MPTForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: MPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config assert config.tie_word_embeddings self.quant_config = quant_config - self.transformer = MPTModel(config, cache_config, quant_config) + self.transformer = MPTModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "transformer")) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 72a09129fed63..e09d7088a69ce 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
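# The MPT hunk above builds its backbone with
# prefix=maybe_prefix(prefix, "transformer"), and the other models in this
# patch use maybe_prefix(prefix, "model"). Below is a local stand-in with the
# behaviour those call sites rely on; it is an assumption about the helper in
# vllm/model_executor/models/utils.py, written out only for illustration.
def maybe_prefix_sketch(prefix: str, name: str) -> str:
    """Join a dotted parameter-name prefix with a child-module name."""
    return name if not prefix else f"{prefix}.{name}"


assert maybe_prefix_sketch("", "transformer") == "transformer"
assert maybe_prefix_sketch("language_model", "model") == "language_model.model"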
@@ -28,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -37,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -48,7 +47,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) # The architecture is pretty similar to Llama, with these changes: # - There is no gate_proj, just up_proj @@ -294,15 +294,14 @@ def forward( @support_torch_compile class NemotronModel(nn.Module): - def __init__( - self, - config: NemotronConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -402,25 +401,18 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "v_proj": ("qkv_proj", 2), } - def __init__( - self, - config: NemotronConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config assert isinstance(config, NemotronConfig) self.config = config self.lora_config = lora_config - self.model = NemotronModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = NemotronModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: @@ -442,7 +434,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 90ab8abcb84b4..3467ae5896494 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # 
https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py # Copyright 2024 The vLLM team. @@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -38,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -47,7 +46,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OlmoAttention(nn.Module): @@ -225,12 +225,13 @@ def forward( @support_torch_compile class OlmoModel(nn.Module): - def __init__(self, - config: OlmoConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_tokens = VocabParallelEmbedding(config.vocab_size, @@ -292,13 +293,13 @@ class OlmoForCausalLM(nn.Module, SupportsPP): Extremely barebones HF model wrapper. 
""" - def __init__(self, - config: OlmoConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config - self.model = OlmoModel(config, cache_config, quant_config) + self.model = OlmoModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if config.tie_word_embeddings: self.lm_head = self.model.embed_tokens else: @@ -310,7 +311,7 @@ def __init__(self, quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 374cbb8df1fcd..3d31919edd862 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -18,7 +18,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -28,7 +28,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -38,7 +38,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OlmoeMoE(nn.Module): @@ -243,14 +244,13 @@ def forward( @support_torch_compile class OlmoeModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -309,21 +309,19 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = OlmoeModel(config, cache_config, quant_config) + self.model = OlmoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, 
config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 10cca8b56268a..997fe642439e6 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py # Copyright 2023 The vLLM team. @@ -25,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -34,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -148,8 +147,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.fc1", ) - self.activation_fn = get_act_fn(config.activation_function, - quant_config, config.ffn_dim) + self.activation_fn = get_act_fn(config.activation_function) self.fc2 = RowParallelLinear( config.ffn_dim, self.embed_dim, @@ -295,14 +293,13 @@ def forward( @support_torch_compile class OPTModel(nn.Module): - def __init__( - self, - config: OPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.decoder = OPTDecoder(config, cache_config, quant_config, @@ -343,22 +340,15 @@ class OPTForCausalLM(nn.Module, SupportsPP): default_bitsandbytes_target_modules = [ ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2." 
] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".out_proj.", ".fc2."] - def __init__( - self, - config: OPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config super().__init__() self.config = config self.quant_config = quant_config - self.model = OPTModel(config, - cache_config, - quant_config, + self.model = OPTModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if self.config.tie_word_embeddings: self.lm_head = self.model.decoder.embed_tokens @@ -366,10 +356,13 @@ def __init__( self.lm_head = ParallelLMHead(config.vocab_size, config.word_embed_proj_dim) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -377,9 +370,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 055407587c598..38821c8288347 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/modeling_orion.py # Copyright (c) OrionStar Inc. 
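# The OPT hunk above adds get_input_embeddings() and an optional
# `inputs_embeds` argument to forward(), matching the Molmo change earlier in
# this patch that always routes input through embeddings so the torch.compile
# graph stays consistent. A self-contained toy with the same
# embed-or-passthrough contract; TinyDecoder is illustrative, not vLLM code.
from typing import Optional

import torch
import torch.nn as nn


class TinyDecoder(nn.Module):

    def __init__(self, vocab_size: int = 16, hidden_size: int = 8) -> None:
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Honour caller-provided embeddings (e.g. with multimodal features
        # already merged in); otherwise embed the token ids here.
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings(input_ids)
        return inputs_embeds


# Both call paths yield the same tensor for a plain text prompt.
ids = torch.tensor([[1, 2, 3]])
decoder = TinyDecoder()
assert torch.equal(decoder(ids),
                   decoder(ids, decoder.get_input_embeddings(ids)))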
@@ -12,7 +11,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -21,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -30,7 +29,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OrionMLP(nn.Module): @@ -209,14 +209,13 @@ def forward( @support_torch_compile class OrionModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -269,23 +268,21 @@ def forward( class OrionForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = OrionModel(config, cache_config, quant_config) + self.model = OrionModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8e29c6079b994..eea229359255e 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -6,13 +6,11 @@ from transformers import PaliGemmaConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger -from vllm.model_executor.layers.quantization 
import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.gemma import GemmaForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import cached_get_tokenizer @@ -21,7 +19,8 @@ from .interfaces import SupportsMultiModal, SupportsPP from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens) -from .utils import AutoWeightsLoader, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -58,7 +57,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int, vision_config = hf_config.vision_config num_images = mm_counts["image"] - seq_data = dummy_seq_data_for_siglip( + seq_data, ranges = dummy_seq_data_for_siglip( vision_config, seq_len, num_images, @@ -66,7 +65,7 @@ def dummy_data_for_paligemma(ctx: InputContext, seq_len: int, ) mm_data = dummy_image_for_siglip(vision_config, num_images) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) def input_processor_for_paligemma(ctx: InputContext, @@ -132,28 +131,28 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PaliGemmaConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.vision_tower = SiglipVisionModel(config.vision_config, quant_config, - prefix="vision_tower") + prefix=maybe_prefix( + prefix, "vision_tower")) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) self.quant_config = quant_config - self.language_model = GemmaForCausalLM(config.text_config, - cache_config, - quant_config, - prefix="language_model") + config.text_config.architectures = ["GemmaForCausalLM"] + self.language_model = init_vllm_registered_model( + config.text_config, + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index fc9ef15db26c0..2e34a7cc30873 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -1,4 +1,3 @@ -# coding=utf-8 # adapted from https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/persimmon/modeling_persimmon.py # Copyright 2023 The vLLM team. # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
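# dummy_data_for_paligemma above (like the mllama, molmo and phi3v factories
# elsewhere in this patch) stops returning a bare (seq_data, mm_data) tuple
# and wraps the result in DummyData together with the placeholder ranges now
# produced by dummy_seq_data_for_siglip. Schematic of that migration with a
# local stand-in type; the field names are an assumption based only on the
# three positional values visible in this diff.
from typing import Any, NamedTuple, Optional


class DummyDataSketch(NamedTuple):
    seq_data: Any
    multi_modal_data: Optional[dict] = None
    multi_modal_placeholders: Optional[dict] = None


def dummy_data_sketch(seq_data: Any, mm_data: dict,
                      ranges: dict) -> DummyDataSketch:
    # before: return seq_data, mm_data
    # after:  return DummyData(seq_data, mm_data, ranges)
    return DummyDataSketch(seq_data, mm_data, ranges)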
@@ -28,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -37,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -46,7 +45,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PersimmonMLP(nn.Module): @@ -61,7 +61,7 @@ def __init__(self, self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size, quant_config=quant_config) - self.act = get_act_fn(config.hidden_act, quant_config) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states) -> torch.Tensor: hidden_states, _ = self.dense_h_to_4h(hidden_states) @@ -213,12 +213,13 @@ def forward( @support_torch_compile class PersimmonModel(nn.Module): - def __init__(self, - config: PersimmonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size, @@ -266,21 +267,18 @@ def forward( class PersimmonForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: PersimmonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config self.config = config self.vocab_size = config.vocab_size - self.model = PersimmonModel(config, - cache_config=cache_config, - quant_config=quant_config) + self.model = PersimmonModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=False) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index ec20cb249ba9b..262f6996fc374 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/microsoft/phi-1_5/blob/main/modeling_phi.py # Copyright 2023 The vLLM team. 
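# The Persimmon hunk above (and the earlier MPT/OPT hunks; Phi follows with
# the same change below) trims get_act_fn() to a single argument, the
# activation name; quant_config and the intermediate size are no longer
# passed. A toy name-to-module lookup of the same shape; the table is an
# assumption for illustration, not vLLM's activation registry.
import torch.nn as nn

_ACTIVATIONS = {
    "gelu": nn.GELU,
    "relu": nn.ReLU,
    "silu": nn.SiLU,
}


def get_act_fn_sketch(act_name: str) -> nn.Module:
    return _ACTIVATIONS[act_name.lower()]()


# Matches the patched call sites, e.g. self.act = get_act_fn(config.hidden_act)
act = get_act_fn_sketch("relu")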
@@ -42,7 +41,8 @@ from transformers import PhiConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -51,7 +51,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -60,7 +60,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PhiAttention(nn.Module): @@ -152,7 +153,7 @@ def __init__(self, config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, n_inner) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.fc1(hidden_states) @@ -193,14 +194,16 @@ def forward( return hidden_states +@support_torch_compile class PhiModel(nn.Module): - def __init__(self, - config: PhiConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding(config.vocab_size, @@ -272,21 +275,15 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): default_bitsandbytes_target_modules = [ ".q_proj.", ".k_proj.", ".v_proj.", ".fc1.", ".fc2.", ".dense." 
] - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".fc2.", ".dense."] embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - config: PhiConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # lm_head use bias, cannot share word embeddings assert not config.tie_word_embeddings @@ -294,14 +291,15 @@ def __init__( self.quant_config = quant_config - self.model = PhiModel(config, cache_config, quant_config) + self.model = PhiModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=True, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3.py b/vllm/model_executor/models/phi3.py index 02b2ff01c3832..34141511ea791 100644 --- a/vllm/model_executor/models/phi3.py +++ b/vllm/model_executor/models/phi3.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from llama.py """Inference-only Phi3 model code inherit from Llama.py""" diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 3a7afc606bb9a..8a5fb6d303e60 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -6,7 +6,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -15,7 +15,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -24,7 +24,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def load_column_parallel_weight(param: torch.nn.Parameter, @@ -299,14 +300,13 @@ def forward( class Phi3SmallModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + 
self.config = config self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size) @@ -363,17 +363,14 @@ def forward( class Phi3SmallForCausalLM(nn.Module, SupportsPP): _tied_weights_keys = ["lm_head.weight"] - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Phi3SmallModel(config, cache_config, quant_config) + self.model = Phi3SmallModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.mup_width_multiplier = config.mup_width_multiplier self.lm_head = ParallelLMHead( @@ -386,7 +383,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 0fc4556831fd7..4db65edc174f1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The vLLM team. # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. # @@ -26,14 +25,13 @@ from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig, - PoolerConfig) -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import ModelConfig, VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.clip import CLIPVisionModel @@ -41,13 +39,14 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -380,7 +379,7 @@ def dummy_data_for_phi3v(ctx: InputContext, image_feature_size = get_max_phi3v_image_tokens(ctx, num_crops=num_crops) - seq_data = dummy_seq_data_for_clip( + seq_data, ranges = dummy_seq_data_for_clip( 
CLIP_VIT_LARGE_PATCH14_336_CONFIG, seq_len, num_images, @@ -394,7 +393,7 @@ def dummy_data_for_phi3v(ctx: InputContext, image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, ) - return seq_data, mm_data + return DummyData(seq_data, mm_data, ranges) @lru_cache @@ -502,15 +501,20 @@ def input_processor_for_phi3v(ctx: InputContext, # TODO: Move this to utils or integrate with clip. new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_idx = 0 while merged_token_ids: token_id = merged_token_ids.pop(0) if token_id == _IMAGE_TOKEN_ID: - new_token_ids.extend( - repeat_and_pad_token( - _IMAGE_TOKEN_ID, - repeat_count=image_feature_size[placeholder_idx], - )) + replacement_ids = repeat_and_pad_token( + _IMAGE_TOKEN_ID, + repeat_count=image_feature_size[placeholder_idx], + ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) + new_token_ids.extend(replacement_ids) placeholder_idx += 1 else: new_token_ids.append(token_id) @@ -518,7 +522,8 @@ def input_processor_for_phi3v(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() @@ -527,14 +532,12 @@ def input_processor_for_phi3v(ctx: InputContext, @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.image_token_id = _IMAGE_TOKEN_ID @@ -544,17 +547,19 @@ def __init__(self, config.hidden_size, org_num_embeddings=config.vocab_size, quant_config=quant_config, - prefix="model.embed_tokens", + prefix=maybe_prefix(prefix, "model.embed_tokens"), ) # TODO: Optionally initializes this for supporting input embeddings. 
self.vision_embed_tokens = Phi3HDImageEmbedding( - config, quant_config, prefix="model.vision_embed_tokens") + config, + quant_config, + prefix=maybe_prefix(prefix, "model.vision_embed_tokens")) # The prefix is empty intentionally because default prefix of # LlamaForCausalLM is "model" - self.language_model = LlamaForCausalLM(config, cache_config, - quant_config) + self.language_model = LlamaForCausalLM(vllm_config=vllm_config, + prefix="") # The same model class supports both language generation and embedding # because the architecture name is the same @@ -571,7 +576,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) @@ -671,28 +676,42 @@ def _process_image_input( return image_embeds + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - input_ids = None - else: - inputs_embeds = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index bb8a9327b4ac8..6d71a8949111b 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
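The Phi-3V change above reshapes the multimodal path: vision features are computed once by process_mm_inputs, merged into the token embeddings by get_input_embeddings, and the language model is always fed inputs_embeds so the computation graph looks the same with or without images. Below is a minimal, self-contained sketch of that control flow; the classes and the "pixel_features" keyword are toy stand-ins for illustration, not vLLM's real modules.

from typing import Optional

import torch
import torch.nn as nn


class ToyMultiModalLM(nn.Module):
    """Toy stand-in mirroring the inputs_embeds flow of the Phi-3V refactor."""

    def __init__(self, vocab_size: int = 128, hidden_size: int = 16,
                 image_token_id: int = 3) -> None:
        super().__init__()
        self.image_token_id = image_token_id
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.backbone = nn.Linear(hidden_size, hidden_size)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        vision_embeddings: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)
        if vision_embeddings is not None:
            # Overwrite the image-placeholder positions with vision features.
            mask = (input_ids == self.image_token_id).unsqueeze(-1)
            inputs_embeds = inputs_embeds.masked_scatter(
                mask, vision_embeddings.to(inputs_embeds.dtype))
        return inputs_embeds

    def forward(self,
                input_ids: torch.Tensor,
                inputs_embeds: Optional[torch.Tensor] = None,
                **mm_kwargs: torch.Tensor) -> torch.Tensor:
        if inputs_embeds is None:
            # Always route through inputs_embeds so the graph is identical
            # whether or not multimodal inputs are present.
            vision_embeddings = mm_kwargs.get("pixel_features")
            inputs_embeds = self.get_input_embeddings(input_ids,
                                                      vision_embeddings)
        return self.backbone(inputs_embeds)


if __name__ == "__main__":
    model = ToyMultiModalLM()
    ids = torch.tensor([[5, 3, 3, 7]])  # two image placeholders
    feats = torch.randn(2, 16)          # one feature row per placeholder
    print(model(ids, pixel_features=feats).shape)  # torch.Size([1, 4, 16])

Keeping a single inputs_embeds entry point is also what makes the path friendlier to graph capture, the same motivation behind the @support_torch_compile decorators added to the text-only models in this diff.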
@@ -29,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -39,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -49,7 +48,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PhiMoEConfig(PretrainedConfig): @@ -433,15 +433,14 @@ def forward( @support_torch_compile class PhiMoEModel(nn.Module): - def __init__( - self, - config: PhiMoEConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0) @@ -530,22 +529,15 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: PhiMoEConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = PhiMoEModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = PhiMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -563,7 +555,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6b53bf5660096..a3e30ea2dd299 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -9,32 +9,35 @@ import torch.nn.functional as F from mistral_common.protocol.instruct.messages import ImageChunk from PIL import Image -from transformers import PixtralVisionConfig, PretrainedConfig +from transformers import PixtralVisionConfig from 
transformers.models.pixtral.image_processing_pixtral import ( _num_image_tokens) from transformers.models.pixtral.modeling_pixtral import ( PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) -from vllm.model_executor.layers.activation import get_act_fn +from vllm.config import ModelConfig, VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) +from vllm.model_executor.layers.activation import get_act_and_mul_fn from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs -from vllm.multimodal.utils import cached_get_tokenizer +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import init_vllm_registered_model +from .utils import init_vllm_registered_model, maybe_prefix try: from xformers import ops as xops @@ -81,12 +84,17 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, ) mm_data = {"image": num_images * [image]} - return seq_data, mm_data + mm_placeholders = { + "image": + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } + return DummyData(seq_data, mm_data, mm_placeholders) def input_mapper_for_pixtral(ctx: InputContext, - data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). + data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -94,7 +102,7 @@ def input_mapper_for_pixtral(ctx: InputContext, to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. 
""" # Early exit if we have provided an image to a language only Qwen model @@ -112,7 +120,7 @@ def input_mapper_for_pixtral(ctx: InputContext, dtype=torch.float16) images.append(image) - return MultiModalInputs({"images": images}) + return MultiModalKwargs({"images": images}) def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): @@ -127,11 +135,11 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): if image_token_id not in inputs['prompt_token_ids']: raise ValueError( - (f"You've passed {inputs=} without {image_token_id=}" - " Make sure to process your input via mistral_common's" - " tokenizer or pass a chat completion request. For more" - " For more info, see: " - "https://github.com/vllm-project/vllm/issues/8411.")) + f"You've passed {inputs=} without {image_token_id=}" + " Make sure to process your input via mistral_common's" + " tokenizer or pass a chat completion request. For more" + " For more info, see: " + "https://github.com/vllm-project/vllm/issues/8411.") return inputs @@ -143,13 +151,10 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -165,9 +170,8 @@ def __init__(self, # init MistralForCausalLM self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( @@ -181,7 +185,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def forward( self, @@ -630,13 +634,13 @@ def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: def dummy_seq_data_for_pixtral_hf( - hf_config: PixtralVisionConfig, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): + hf_config: PixtralVisionConfig, + seq_len: int, + num_images: int, + *, + image_token_id: int, + image_feature_size_override: Optional[int] = None, + mm_key: str = "image"): if image_feature_size_override is None: image_feature_size = get_max_pixtral_hf_image_feature_size(hf_config) else: @@ -645,7 +649,11 @@ def dummy_seq_data_for_pixtral_hf( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_pixtral_hf( @@ -788,20 +796,24 @@ def __init__( super().__init__() assert config.intermediate_size is not None - # TODO: Use quant_config and prefix after optimizing this - self.gate_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.up_proj = nn.Linear(config.hidden_size, - config.intermediate_size, - bias=False) - self.down_proj = 
nn.Linear(config.intermediate_size, - config.hidden_size, - bias=False) - self.act = get_act_fn(config.hidden_act) + self.gate_up_proj = MergedColumnParallelLinear( + input_size=config.hidden_size, + output_sizes=[config.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(input_size=config.intermediate_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.down_proj") + self.act_and_mul = get_act_and_mul_fn(config.hidden_act) def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x)) + gate_up, _ = self.gate_up_proj(x) + x = self.act_and_mul(gate_up) + x, _ = self.down_proj(x) + return x class PixtralHFAttention(nn.Module): @@ -820,21 +832,21 @@ def __init__( self.n_heads = config.num_attention_heads self.head_dim = config.hidden_size // config.num_attention_heads - self.scale = self.head_dim**-0.5 - - # TODO: Use quant_config and prefix after optimizing this - self.q_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.k_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.v_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) - self.o_proj = nn.Linear(config.hidden_size, - config.hidden_size, - bias=False) + self.qkv_proj = QKVParallelLinear( + hidden_size=config.hidden_size, + head_size=self.head_dim, + total_num_heads=self.n_heads, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.o_proj = RowParallelLinear( + input_size=config.hidden_size, + output_size=config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) def forward( self, @@ -844,13 +856,13 @@ def forward( ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() - q = self.q_proj(hidden_states) - k = self.k_proj(hidden_states) - v = self.v_proj(hidden_states) + qkv_states, _ = self.qkv_proj(hidden_states) + q, k, v = qkv_states.chunk(3, dim=-1) # Transpose q and k to apply HF's Rotary Position Embedding q = q.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) k = k.view(batch, patches, self.n_heads, self.head_dim).transpose(1, 2) + v = v.view(batch, patches, self.n_heads, self.head_dim) cos, sin = position_embeddings q, k = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=0) @@ -858,22 +870,21 @@ def forward( # Transpose q and k back for attention q = q.transpose(1, 2).contiguous() k = k.transpose(1, 2).contiguous() - v = v.reshape(batch, patches, self.n_heads, self.head_dim) out = xops.memory_efficient_attention(q, k, v, attn_bias=attention_mask) else: - v = v.reshape(batch, patches, self.n_heads, - self.head_dim).transpose(1, 2) + v = v.transpose(1, 2) out = nn.functional.scaled_dot_product_attention( q, k, v, attn_mask=attention_mask) out = out.transpose(1, 2) - out = out.reshape(batch, patches, self.n_heads * self.head_dim) + out = out.view(batch, patches, self.n_heads * self.head_dim) + attn_output, _ = self.o_proj(out) - return self.o_proj(out) + return attn_output, None class PixtralHFTransformerBlock(nn.Module): @@ -902,9 +913,9 @@ def forward( attention_mask: torch.Tensor, position_embeddings: torch.Tensor, ) -> torch.Tensor: - r = self.attention.forward(self.attention_norm(hidden_states), - attention_mask=attention_mask, - position_embeddings=position_embeddings) + r, _ = 
self.attention.forward(self.attention_norm(hidden_states), + attention_mask=attention_mask, + position_embeddings=position_embeddings) h = hidden_states + r r = self.feed_forward.forward(self.ffn_norm(h)) out = h + r @@ -1043,10 +1054,24 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - stacked_params_mapping = [] + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] params_dict = dict(self.named_parameters()) + layer_count = len(self.transformer.layers) for name, loaded_weight in weights: + # omit layers when num_hidden_layers_override is set + if name.startswith("transformer.layers"): + layer_idx = int(name.split(".")[2]) + if layer_idx >= layer_count: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 0a1b40927e9f9..3d26ede722dd1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py # Copyright (c) Alibaba Cloud. @@ -20,10 +19,11 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -36,21 +36,21 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -203,7 
+203,7 @@ def __init__( intermediate_size, bias=True, quant_config=quant_config) - self.act_fn = get_act_fn("gelu", quant_config, intermediate_size) + self.act_fn = get_act_fn("gelu") self.c_proj = RowParallelLinear( intermediate_size, hidden_size, @@ -549,16 +549,16 @@ def forward( return hidden_states, residual +@support_torch_compile class QWenModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.vocab_size = config.vocab_size @@ -721,8 +721,8 @@ def input_processor_for_qwen(ctx: InputContext, multi_modal_data=multi_modal_data) -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). +def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -730,7 +730,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ # Early exit if we have provided an image to a language only Qwen model @@ -739,7 +739,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: logger.warning( "Images were provided but this model has no visual config; " "multimodal inputs will not be forwarded to the model.") - return MultiModalInputs() + return MultiModalKwargs() model_config = ctx.model_config tokenizer = cached_get_tokenizer( @@ -783,7 +783,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: data = [data] transformed_images = [transform(datum) for datum in data] pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalInputs({"pixel_values": pixel_values}) + return MultiModalKwargs({"pixel_values": pixel_values}) def build_normalization_transform(image_size: int) -> transforms.Compose: @@ -808,7 +808,7 @@ def dummy_data_for_qwen( ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int], -) -> Tuple[SequenceData, Optional[Dict]]: +) -> DummyData: """Build dummy data for warming up Qwen models; this will only contain text matching the defaults for VLLM unless the model has a visual config. 
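For the Pixtral HF vision encoder above, separate q/k/v and gate/up projections become fused QKVParallelLinear and MergedColumnParallelLinear layers, and load_weights gains a stacked_params_mapping that redirects the original per-projection checkpoint names into shards of the fused parameters. A small self-contained sketch of that rename-and-stack idea follows; the load_fused helper and the fixed shard layout are assumptions for illustration only, since the real copy is delegated to each parameter's weight_loader.

from typing import Dict, Iterable, List, Tuple

import torch

# (fused param name, checkpoint name, shard id) -- same shape as the
# stacked_params_mapping added to the Pixtral vision model's load_weights.
STACKED_PARAMS_MAPPING: List[Tuple[str, str, str]] = [
    (".qkv_proj", ".q_proj", "q"),
    (".qkv_proj", ".k_proj", "k"),
    (".qkv_proj", ".v_proj", "v"),
]


def load_fused(params: Dict[str, torch.Tensor],
               weights: Iterable[Tuple[str, torch.Tensor]],
               hidden_size: int) -> None:
    """Copy per-projection checkpoint weights into fused q/k/v shards."""
    shard_offsets = {"q": 0, "k": hidden_size, "v": 2 * hidden_size}
    for name, loaded in weights:
        for fused_name, ckpt_name, shard_id in STACKED_PARAMS_MAPPING:
            if ckpt_name not in name:
                continue
            # Rewrite the checkpoint name to the fused parameter's name,
            # then copy into the shard that this projection owns.
            target = params[name.replace(ckpt_name, fused_name)]
            start = shard_offsets[shard_id]
            target[start:start + hidden_size].copy_(loaded)
            break


if __name__ == "__main__":
    hidden = 4
    params = {"attn.qkv_proj.weight": torch.zeros(3 * hidden, hidden)}
    ckpt = [(f"attn.{p}.weight", torch.full((hidden, hidden), float(i)))
            for i, p in enumerate(["q_proj", "k_proj", "v_proj"])]
    load_fused(params, ckpt, hidden)
    print(params["attn.qkv_proj.weight"][:, 0])  # 0s, then 1s, then 2s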
@@ -827,7 +827,7 @@ def dummy_data_for_qwen( if not hasattr(hf_config, "visual"): seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) mm_data = None - return seq_data, mm_data + return DummyData(seq_data, mm_data) # We have a visual component - use images to warm up num_images = mm_counts["image"] @@ -859,31 +859,29 @@ def dummy_data_for_qwen( # the data will get resized and the # of tokens per image is constant image = Image.new("RGB", (224, 224), color=0) mm_data = {"image": image if num_images == 1 else [image] * num_images} - return seq_data, mm_data + return DummyData(seq_data, mm_data) class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - def __init__( - self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config - self.transformer = QWenModel(config, cache_config, quant_config) + self.transformer = QWenModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) @@ -1048,7 +1046,7 @@ def get_mm_mapping(self) -> MultiModelKeys: @MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) -class QWenLMHeadModel(QWenBaseModel): +class QWenLMHeadModel(QWenBaseModel, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not conducive to the current integration logic of LoRA in vLLM. Therefore, it @@ -1063,17 +1061,13 @@ class QWenLMHeadModel(QWenBaseModel): def __new__( cls, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: + config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(config, multimodal_config, cache_config, - quant_config, lora_config) + return QWenVL(vllm_config=vllm_config) # Initialize LLM else: - return QWenLLM(config, multimodal_config, cache_config, - quant_config, lora_config) + return QWenLLM(vllm_config=vllm_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index db1029345a8ac..b623c576bb673 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2/modeling_qwen2.py # Copyright 2024 The Qwen team. 
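QWenLMHeadModel keeps its __new__-based dispatch but now receives the whole VllmConfig: when the HF config carries a "visual" section it constructs the multimodal QWenVL, otherwise the text-only QWenLLM. A toy illustration of that pattern with stand-in classes (the Toy* names are placeholders, not vLLM's):

class _ToyConfig:
    """Stand-in for the HF config; `visual` marks a vision-language model."""

    def __init__(self, visual: bool = False) -> None:
        if visual:
            self.visual = {"image_size": 448}


class ToyQWenLLM:

    def __init__(self, config: _ToyConfig) -> None:
        self.config = config


class ToyQWenVL(ToyQWenLLM):
    pass


class ToyQWenLMHeadModel:
    """Facade whose __new__ hands back the right concrete model."""

    def __new__(cls, config: _ToyConfig):
        if hasattr(config, "visual"):
            return ToyQWenVL(config)
        return ToyQWenLLM(config)


if __name__ == "__main__":
    print(type(ToyQWenLMHeadModel(_ToyConfig(visual=True))).__name__)   # ToyQWenVL
    print(type(ToyQWenLMHeadModel(_ToyConfig(visual=False))).__name__)  # ToyQWenLLM

Because the returned object is not an instance of the facade class, Python skips the facade's __init__, which is why __new__ can return a fully constructed QWenVL or QWenLLM.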
@@ -29,7 +28,8 @@ from transformers import Qwen2Config from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -39,7 +39,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -237,16 +237,16 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2Model(nn.Module): - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -393,9 +393,6 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ".v_proj.", ".o_proj.", ] - - # in TP, these weights are partitioned along the column dimension (dim=-1) - column_parallel_weights_modules = [".down_proj.", ".o_proj."] bitsandbytes_stacked_params_mapping = { # shard_name, weight_name, index "q_proj": ("qkv_proj", 0), @@ -405,35 +402,29 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, - cache_config, - quant_config, + self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if config.tie_word_embeddings: @@ -446,10 +437,13 @@ def __init__( prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -457,9 +451,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 3d049eeb920b7..d30950361ad89 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. @@ -27,23 +26,22 @@ import numpy as np import torch import torch.nn as nn -from transformers import Qwen2AudioConfig, Qwen2AudioEncoder +from transformers import Qwen2AudioEncoder from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData from .interfaces import SupportsMultiModal, SupportsPP @@ -85,7 +83,8 @@ def forward(self, audio_features): def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int]): num_audios = mm_counts["audio"] - max_llm_audio_tokens = get_max_qwen2_audio_audio_tokens(ctx) * num_audios + max_tokens_per_audio = get_max_qwen2_audio_audio_tokens(ctx) + max_llm_audio_tokens = max_tokens_per_audio * 
num_audios if seq_len - max_llm_audio_tokens - 2 < 0: raise RuntimeError( f"Qwen2-Audio cannot process {num_audios} audios in a prompt, " @@ -99,7 +98,12 @@ def dummy_data_for_qwen2_audio(ctx: InputContext, seq_len: int, (0, seq_len - max_llm_audio_tokens), ) dummy_audio = np.full((max_llm_audio_tokens * 2 * 2 * 160, ), 0.) - return dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios} + return DummyData( + dummy_seqdata, {"audio": [(dummy_audio, 16000)] * num_audios}, { + "audio": + consecutive_placeholder_ranges(num_items=num_audios, + item_size=max_tokens_per_audio) + }) def get_processor( @@ -215,13 +219,13 @@ def input_processor_for_qwen2_audio( def input_mapper_for_qwen2_audio( ctx: InputContext, multi_modal_data: Union[np.ndarray, List[np.ndarray]], -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-Audio.""" if not isinstance(multi_modal_data, list): multi_modal_data = [multi_modal_data] if len(multi_modal_data) == 0: - return MultiModalInputs() + return MultiModalKwargs() processor = cached_get_processor(ctx.model_config.model) audio_feature_extractor = processor.feature_extractor @@ -248,7 +252,7 @@ def input_mapper_for_qwen2_audio( logger.error("Failed to process audio (%s)", multi_modal_data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) @@ -260,13 +264,11 @@ def input_mapper_for_qwen2_audio( class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: Qwen2AudioConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -276,8 +278,9 @@ def __init__(self, self.quant_config = quant_config - self.language_model = Qwen2Model(config.text_config, cache_config, - quant_config) + self.language_model = Qwen2Model( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=prefix) self.unpadded_vocab_size = config.text_config.vocab_size if config.text_config.tie_word_embeddings: self.lm_head = self.language_model.embed_tokens @@ -289,7 +292,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.text_config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 2d6f3e90f761c..020af88aadd98 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 Kakao Corp. 
(Kanana-X Team) @@ -9,19 +8,16 @@ import torch from torch import nn -from transformers import Qwen2Config from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class Qwen2ForSequenceClassification(nn.Module): @@ -47,33 +43,31 @@ class Qwen2ForSequenceClassification(nn.Module): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - pooler_config: Optional[PoolerConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.score = RowParallelLinear(config.hidden_size, config.num_labels, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index d4475b7ca27af..51c0cd5664fd2 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py # Copyright 2024 The Qwen team. 
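Several processors touched above (Phi-3V, Pixtral, Qwen2-Audio) now record where each multimodal item's placeholder tokens sit in the expanded prompt, as a list of {"offset", "length"} ranges. Below is a small sketch of that bookkeeping for evenly sized, back-to-back items, roughly what a helper like consecutive_placeholder_ranges would produce; the helper body is an assumption, only the range format is taken from the diff.

from typing import Dict, List


def consecutive_ranges(num_items: int, item_size: int,
                       initial_offset: int = 0) -> List[Dict[str, int]]:
    """Ranges for num_items placeholders of item_size tokens each,
    laid out back to back starting at initial_offset."""
    return [{
        "offset": initial_offset + i * item_size,
        "length": item_size,
    } for i in range(num_items)]


if __name__ == "__main__":
    # Two images, 5 placeholder tokens each, starting at token index 3.
    print(consecutive_ranges(num_items=2, item_size=5, initial_offset=3))
    # [{'offset': 3, 'length': 5}, {'offset': 8, 'length': 5}]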
@@ -30,7 +29,8 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -44,7 +44,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -54,7 +54,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Qwen2MoeMLP(nn.Module): @@ -312,16 +313,16 @@ def forward( return hidden_states, residual +@support_torch_compile class Qwen2MoeModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -376,23 +377,21 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen2MoeModel(config, cache_config, quant_config) + self.model = Qwen2MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 901b1daaa14a4..89768ec9dff37 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py # Copyright 2024 The Qwen team. 
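The constructor change repeated throughout this diff (Phi, Phi-3-Small, PhiMoE, Qwen, Qwen2, Qwen2-MoE, and the classification and reward heads) follows one shape: each model takes a single keyword-only vllm_config plus a prefix, unpacks only the sub-configs it needs, and threads the prefix into child modules via maybe_prefix. A compressed sketch of that shape, with dataclasses standing in for vLLM's config objects and a guessed-at maybe_prefix (the real helper lives in .utils and is not shown in this diff):

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ToyModelConfig:
    hf_config: dict = field(default_factory=dict)


@dataclass
class ToyVllmConfig:
    """Stand-in for one object that bundles all sub-configs."""
    model_config: ToyModelConfig = field(default_factory=ToyModelConfig)
    cache_config: Optional[dict] = None
    quant_config: Optional[dict] = None
    lora_config: Optional[dict] = None


def maybe_prefix(prefix: str, name: str) -> str:
    # Presumed behavior: join with a dot unless the prefix is empty.
    return name if not prefix else f"{prefix}.{name}"


class ToyModel:

    def __init__(self, *, vllm_config: ToyVllmConfig, prefix: str = "") -> None:
        # Unpack only what this module needs, as the refactored models do.
        self.config = vllm_config.model_config.hf_config
        self.quant_config = vllm_config.quant_config
        self.prefix = prefix


class ToyForCausalLM:

    def __init__(self, *, vllm_config: ToyVllmConfig, prefix: str = "") -> None:
        self.model = ToyModel(vllm_config=vllm_config,
                              prefix=maybe_prefix(prefix, "model"))


if __name__ == "__main__":
    lm = ToyForCausalLM(vllm_config=ToyVllmConfig())
    print(lm.model.prefix)  # "model"

Passing one config object at the call site means new sub-configs (pooler, multimodal, LoRA) no longer force signature changes across every model file.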
@@ -8,20 +7,18 @@ import torch from torch import nn -from transformers import Qwen2Config from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsPP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class ReLU(nn.Module): @@ -58,33 +55,31 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - pooler_config: Optional[PoolerConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): raise ValueError("Sliding window for some but all layers is not " "supported. This model uses sliding window " - "but `max_window_layers` = %s is less than " - "`num_hidden_layers` = %s. Please open an issue " - "to discuss this feature." % ( + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( config.max_window_layers, config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.score = nn.Sequential( ColumnParallelLinear(config.hidden_size, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1e12c2332b65e..2335baf459771 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/19e6e80e10118f855137b90740936c0b11ac397f/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py # Copyright 2024 The Qwen team. @@ -23,8 +22,8 @@ # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, Type, TypedDict, Union) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -41,35 +40,39 @@ from vllm.attention import AttentionMetadata from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, - token_inputs) +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.quantization import (GPTQConfig, + GPTQMarlinConfig, + QuantizationConfig) +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) -from vllm.multimodal.base import MultiModalData +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, + MultiModalKwargs) from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory) + make_empty_intermediate_tensors_factory, maybe_prefix) logger = init_logger(__name__) @@ -78,7 +81,7 @@ class Qwen2VLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor + pixel_values: torch.Tensor """Shape: `(num_patches, num_channels * patch_size * patch_size)` """ @@ -91,9 +94,22 @@ class Qwen2VLImagePixelInputs(TypedDict): class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - `hidden_size` must match the hidden size of language model backbone. + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. 
+ - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. """ @@ -101,7 +117,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): Qwen2VLImageEmbeddingInputs] -class Qwen2VLVideoInputs(TypedDict): +class Qwen2VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] pixel_values_videos: torch.Tensor """Shape: `(num_patches, @@ -115,6 +132,30 @@ class Qwen2VLVideoInputs(TypedDict): """ +class Qwen2VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. + - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, + Qwen2VLVideoEmbeddingInputs] + # === Vision Encoder === # @@ -246,9 +287,8 @@ def forward( q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) batch_size = q.shape[1] - q, k, v = [ - rearrange(x, "s b ... -> b s ...").contiguous() for x in (q, k, v) - ] + q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() + for x in (q, k, v)) if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) @@ -258,7 +298,7 @@ def forward( # flash_attn_varlen_func) from flash_attn import flash_attn_varlen_func - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s ... 
-> (b s) ...") for x in [q, k, v]) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() output = flash_attn_varlen_func(q, @@ -276,7 +316,7 @@ def forward( b=batch_size) elif self.attn_backend == _Backend.TORCH_SDPA: seq_length = q.size(1) - q, k, v = [rearrange(x, "b s h d -> b h s d") for x in [q, k, v]] + q, k, v = (rearrange(x, "b s h d -> b h s d") for x in [q, k, v]) attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool) @@ -560,6 +600,17 @@ def forward( # === Vision input helpers === # +def get_mm_processor_kwargs( + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None) -> Dict[str, int]: + mm_processor_kwargs = {} + if min_pixels: + mm_processor_kwargs["min_pixels"] = min_pixels + if max_pixels: + mm_processor_kwargs["max_pixels"] = max_pixels + return mm_processor_kwargs + + def mm_input_mapper_for_qwen2_vl( ctx: InputContext, data: MultiModalData[object], @@ -567,22 +618,24 @@ def mm_input_mapper_for_qwen2_vl( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): - return MultiModalInputs({ + return MultiModalKwargs({ "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) + if data_type_key == "video" and isinstance(data, dict): + return MultiModalKwargs({ + "video_embeds": data.get("video_embeds"), + "video_grid_thw": data.get("video_grid_thw"), + }) + model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them - mm_processor_kwargs = {} - if min_pixels: - mm_processor_kwargs["min_pixels"] = min_pixels - if max_pixels: - mm_processor_kwargs["max_pixels"] = max_pixels - + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = cached_get_image_processor( model_config.model, trust_remote_code=model_config.trust_remote_code, @@ -608,7 +661,7 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, @@ -685,7 +738,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, *, min_pixels=None, max_pixels=None) -> int: - image_processor = cached_get_image_processor(ctx.model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = cached_get_image_processor(ctx.model_config.model, + **mm_processor_kwargs) max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key=data_type_key, mm_count=1, min_pixels=min_pixels, @@ -707,7 +763,10 @@ def dummy_data_for_qwen2_vl( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: - image_processor = cached_get_image_processor(ctx.model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = cached_get_image_processor(ctx.model_config.model, + **mm_processor_kwargs) num_images = mm_counts["image"] max_resized_height, max_resized_width, max_llm_image_tokens = \ @@ -744,9 +803,10 @@ def dummy_data_for_qwen2_vl( dummy_image = Image.new("RGB", (max_resized_width, max_resized_height), color=0) - return dummy_seqdata, { - 
"image": dummy_image if num_images == 1 else [dummy_image] * num_images - } + return DummyData(dummy_seqdata, { + "image": + dummy_image if num_images == 1 else [dummy_image] * num_images + }) def _get_llm_num_vision_tokens( @@ -876,16 +936,33 @@ def input_processor_for_qwen2_vl( idx for idx, token in enumerate(prompt_token_ids) if token == hf_config.image_token_id ] - image_cnt = len(image_indices) - embed_dim = image_inputs.get('image_embeds').size(0) - assert embed_dim % image_cnt == 0 - num_pad_tokens = embed_dim // image_cnt + + # ensure all image tokens have grid_thw + assert \ + len(image_indices) == image_inputs["image_grid_thw"].size(0), \ + "image token num does not match image_grid_thw.shape" + + image_counter = 0 + pad_token_counter = 0 for idx, token in enumerate(prompt_token_ids): if idx in image_indices: + grid_thw = image_inputs["image_grid_thw"][image_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) prompt_token_ids_with_image.extend([token] * num_pad_tokens) + image_counter += 1 + pad_token_counter += num_pad_tokens else: prompt_token_ids_with_image.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == image_inputs["image_embeds"].size(0), \ + "image_embeds.shape does not match image_grid_thw" + prompt_token_ids = prompt_token_ids_with_image else: prompt_token_ids = _expand_pad_tokens(image_inputs, @@ -898,14 +975,49 @@ def input_processor_for_qwen2_vl( max_pixels=max_pixels) if video_inputs is not None: - prompt_token_ids = _expand_pad_tokens(video_inputs, - hf_config.video_token_id, - make_batched_videos, - "video", - image_processor, - prompt_token_ids, - min_pixels=min_pixels, - max_pixels=max_pixels) + if isinstance(video_inputs, dict): + prompt_token_ids_with_video = [] + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + + # ensure all video tokens have grid_thw + assert \ + len(video_indices) == video_inputs["video_grid_thw"].size(0), \ + "video token num does not match video_grid_thw.shape" + + video_counter = 0 + pad_token_counter = 0 + for idx, token in enumerate(prompt_token_ids): + if idx in video_indices: + grid_thw = video_inputs["video_grid_thw"][video_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) + prompt_token_ids_with_video.extend([token] * + num_pad_tokens) + video_counter += 1 + pad_token_counter += num_pad_tokens + else: + prompt_token_ids_with_video.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == video_inputs["video_embeds"].size(0), \ + "video_embeds.shape does not match video_grid_thw" + + prompt_token_ids = prompt_token_ids_with_video + else: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, + "video", + image_processor, + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) prompt = inputs.get("prompt") if prompt is None: @@ -928,15 +1040,37 @@ def input_processor_for_qwen2_vl( @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", 
+ "up_proj", + ], + } - def __init__(self, - config: Qwen2VLConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: - super().__init__() + # LoRA specific attributes + # TODO Support LoRA for the visual encoder in the future. + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -946,14 +1080,12 @@ def __init__(self, self.visual = Qwen2VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix="visual", + quant_config=self._maybe_ignore_quant_config(quant_config), + prefix=maybe_prefix(prefix, "visual"), ) - self.model = Qwen2Model(config, - cache_config, - quant_config, - prefix="model") + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: if config.tie_word_embeddings: @@ -962,16 +1094,30 @@ def __init__(self, self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix="lm_head") + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. + # See: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + def _validate_and_reshape_mm_tensor(self, mm_input: Union[torch.Tensor, List[torch.Tensor]], @@ -1009,49 +1155,71 @@ def _parse_and_validate_image_input( f"Got type: {type(pixel_values)}") return Qwen2VLImagePixelInputs(type="pixel_values", - data=pixel_values, + pixel_values=pixel_values, image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. 
" f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", - data=image_embeds) + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) - if pixel_values_videos is None: + if pixel_values_videos is None and video_embeds is None: return None - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Qwen2VLVideoInputs( - pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + return Qwen2VLVideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: if image_input["type"] == "image_embeds": - return image_input["data"].type(self.visual.dtype) + return image_input["image_embeds"].type(self.visual.dtype) - pixel_values = image_input["data"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].type(self.visual.dtype) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) video_embeds = self.visual(pixel_values_videos, @@ -1158,6 +1326,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f50ceaccb1bbe..f172c06c4a26a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,31 +94,22 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), + **{ + # Multiple models share the same architecture, so we include them all + k: (mod, arch) for k, (mod, arch) in _TEXT_GENERATION_MODELS.items() + if 
arch == "LlamaForCausalLM" + }, "MistralModel": ("llama", "LlamaEmbeddingModel"), - "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), - "Qwen2ForSequenceClassification": ( - "qwen2_cls", "Qwen2ForSequenceClassification"), - "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), + "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), -} - -def add_embedding_models(base_models, embedding_models): - with_pooler_method_models = {} - embedding_models_name = embedding_models.keys() - for name, (path, arch) in base_models.items(): - if arch in embedding_models_name: - with_pooler_method_models[name] = (path, arch) - return with_pooler_method_models - -_EMBEDDING_MODELS = { - **add_embedding_models(_TEXT_GENERATION_MODELS, _EMBEDDING_MODELS), - **_EMBEDDING_MODELS, + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } _MULTIMODAL_MODELS = { @@ -128,7 +119,9 @@ def add_embedding_models(base_models, embedding_models): "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), + "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 @@ -342,6 +335,11 @@ def register_model( def _raise_for_unsupported(self, architectures: List[str]): all_supported_archs = self.get_supported_archs() + if any(arch in all_supported_archs for arch in architectures): + raise ValueError( + f"Model architectures {architectures} failed " + "to be inspected. Please check the logs for more details.") + raise ValueError( f"Model architectures {architectures} are not supported for now. 
" f"Supported architectures: {all_supported_archs}") @@ -482,4 +480,4 @@ def _run() -> None: if __name__ == "__main__": - _run() + _run() \ No newline at end of file diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 2e7ae32055aaf..acaf4afdecfe5 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -23,6 +23,7 @@ VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) from vllm.sequence import SequenceData @@ -61,6 +62,7 @@ def dummy_seq_data_for_siglip( *, image_token_id: int, image_feature_size_override: Optional[int] = None, + mm_key: str = "image", ): if image_feature_size_override is None: image_feature_size = get_siglip_image_feature_size(hf_config) @@ -70,7 +72,11 @@ def dummy_seq_data_for_siglip( return SequenceData.from_prompt_token_counts( (image_token_id, image_feature_size * num_images), (0, seq_len - image_feature_size * num_images), - ) + ), { + mm_key: + consecutive_placeholder_ranges(num_items=num_images, + item_size=image_feature_size) + } def dummy_image_for_siglip( @@ -122,6 +128,11 @@ def input_processor_for_siglip( if multi_modal_data is None or "image" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "image" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. + return inputs + tokenizer = cached_get_tokenizer(model_config.tokenizer) if image_feature_size_override is None: @@ -135,7 +146,7 @@ def input_processor_for_siglip( else: image_feature_size = image_feature_size_override - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -144,11 +155,10 @@ def input_processor_for_siglip( ) # NOTE: Create a defensive copy of the original inputs - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - ) + return token_inputs(prompt_token_ids=new_token_ids, + prompt=new_prompt, + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": ranges}) # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index e3e7ccb5cf179..4f03ca501fb68 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py # Copyright 2023 The vLLM team. 
@@ -30,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -43,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -54,7 +53,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class SolarMLP(nn.Module): @@ -267,15 +267,14 @@ def forward( @support_torch_compile class SolarModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -410,24 +409,17 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.model = SolarModel( - config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size @@ -450,7 +442,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 083a48588d01a..1125f9e9f9617 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. # All rights reserved. 
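The Solar changes above, and the StableLM, Starcoder2 and XVERSE changes that follow, all apply the same constructor migration: per-field config arguments are replaced by a single keyword-only vllm_config plus a prefix used when resolving checkpoint weight names. A minimal stand-alone sketch of the pattern; the underscore-prefixed stubs and ToyForCausalLM are illustrative stand-ins rather than vLLM classes, and _maybe_prefix reflects the assumed behavior of utils.maybe_prefix.

from dataclasses import dataclass, field

import torch.nn as nn


@dataclass
class _ModelConfig:
    hf_config: object = None


@dataclass
class _VllmConfig:
    # Stand-in for vllm.config.VllmConfig: one container holding the sub-configs.
    model_config: _ModelConfig = field(default_factory=_ModelConfig)
    cache_config: object = None
    quant_config: object = None
    lora_config: object = None


def _maybe_prefix(prefix: str, name: str) -> str:
    # Assumed behavior of maybe_prefix(): join non-empty prefixes with a dot.
    return f"{prefix}.{name}" if prefix else name


class ToyForCausalLM(nn.Module):

    def __init__(self, *, vllm_config: _VllmConfig, prefix: str = ""):
        super().__init__()
        # Unpack the sub-configs from the single container ...
        self.config = vllm_config.model_config.hf_config
        self.quant_config = vllm_config.quant_config
        # ... and pass a dotted prefix down so that checkpoint weight names
        # resolve against the module tree.
        self.model_prefix = _maybe_prefix(prefix, "model")


assert ToyForCausalLM(vllm_config=_VllmConfig()).model_prefix == "model"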
# @@ -26,7 +25,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -35,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -44,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class StablelmMLP(nn.Module): @@ -194,12 +194,13 @@ def forward( class StableLMEpochModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '') -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -246,23 +247,21 @@ def forward( class StablelmForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = StableLMEpochModel(config, cache_config, quant_config) + self.model = StableLMEpochModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 8f0644bca3e2e..ce7a7957f52c4 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX @@ -26,7 +25,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -35,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -44,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Starcoder2Attention(nn.Module): @@ -140,8 +140,7 @@ def __init__(self, bias=config.use_bias, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) @@ -197,12 +196,13 @@ def forward( @support_torch_compile class Starcoder2Model(nn.Module): - def __init__(self, - config: Starcoder2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -247,15 +247,13 @@ def forward( class Starcoder2ForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: Starcoder2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config self.config = config - self.model = Starcoder2Model(config, - cache_config, - quant_config=quant_config) + self.model = Starcoder2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.unpadded_vocab_size = config.vocab_size if config.tie_word_embeddings: @@ -271,7 +269,7 @@ def __init__(self, ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index f08e4aa355086..9fde22c016de0 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -2,7 +2,6 @@ """PyTorch Ultravox model.""" import math -from array import 
array from functools import cached_property, lru_cache from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union, cast) @@ -16,28 +15,27 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig -from vllm.inputs import INPUT_REGISTRY -from vllm.inputs.data import DecoderOnlyInputs, token_inputs -from vllm.inputs.registry import InputContext +from vllm.config import VllmConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + NestedTensors) from vllm.multimodal.utils import (cached_get_tokenizer, + consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) +from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, merge_multimodal_embeddings) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings_from_map) _AUDIO_PLACEHOLDER_TOKEN = 128002 _AUDIO_TOKENS_PER_SECOND = 6.25 @@ -46,13 +44,13 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] data: NestedTensors - """Shape: `(batch_size, num_audios, 80, M)""" + """Shape: `(batch_size, num_audios, 80, M)`""" class UltravoxAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] data: NestedTensors - """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" + """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)`""" UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, @@ -79,17 +77,16 @@ def dummy_seq_data_for_ultravox( seq_len: int, audio_count: int, ): - audio_placeholder = array( - VLLM_TOKEN_ID_ARRAY_TYPE, - [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx) + audio_length = min(get_ultravox_max_audio_tokens(ctx), + seq_len // audio_count) - # Add a separator between each chunk. 
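The reworked Ultravox dummy data in this hunk builds its profiling prompt from counted token runs rather than hand-assembled arrays: the audio placeholder token is repeated audio_length times per clip and the remainder of the sequence is padded with zeros. A rough, self-contained sketch of that layout, assuming SequenceData.from_prompt_token_counts expands (token_id, count) pairs in order:

from itertools import chain
from typing import List, Tuple


def expand_prompt_token_counts(*token_counts: Tuple[int, int]) -> List[int]:
    # Assumed equivalent of SequenceData.from_prompt_token_counts(): each
    # (token_id, count) pair contributes `count` copies of `token_id`, in order.
    return list(chain.from_iterable([tok] * cnt for tok, cnt in token_counts))


_AUDIO_PLACEHOLDER_TOKEN = 128002
seq_len, audio_count = 16, 2
audio_length = 5  # stands in for min(max audio tokens, seq_len // audio_count)

tokens = expand_prompt_token_counts(
    (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count),
    (0, seq_len - audio_length * audio_count),
)
assert len(tokens) == seq_len
assert tokens.count(_AUDIO_PLACEHOLDER_TOKEN) == audio_length * audio_count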
- audio_token_ids = (audio_placeholder + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count - other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - len(audio_token_ids)) - - return SequenceData(audio_token_ids + other_token_ids) + return SequenceData.from_prompt_token_counts( + (_AUDIO_PLACEHOLDER_TOKEN, audio_length * audio_count), + (0, seq_len - audio_length * audio_count)), { + "audio": + consecutive_placeholder_ranges(num_items=audio_count, + item_size=audio_length) + } def dummy_audio_for_ultravox( @@ -107,10 +104,10 @@ def dummy_data_for_ultravox( mm_counts: Mapping[str, int], ): audio_count = mm_counts["audio"] - seq_data = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count) + seq_data, ranges = dummy_seq_data_for_ultravox(ctx, seq_len, audio_count) mm_dict = dummy_audio_for_ultravox(ctx, audio_count) - return (seq_data, mm_dict) + return DummyData(seq_data, mm_dict, ranges) def input_mapper_for_ultravox(ctx: InputContext, data: object): @@ -118,11 +115,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): data = [data] if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() # If the audio inputs are embeddings, no need for preprocessing if is_list_of(data, torch.Tensor, check="all"): - return MultiModalInputs({"audio_embeds": data}) + return MultiModalKwargs({"audio_embeds": data}) audio_features = [] for audio_input in data: @@ -136,9 +133,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): if sr != feature_extractor.sampling_rate: try: import librosa - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc audio = librosa.resample(audio, orig_sr=sr, target_sr=feature_extractor.sampling_rate) @@ -156,7 +153,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Remove the batch dimension because we're wrapping it in a list. audio_features.append(single_audio_features.squeeze(0)) - return MultiModalInputs({"audio_features": audio_features}) + return MultiModalKwargs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): @@ -164,6 +161,11 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): if multi_modal_data is None or "audio" not in multi_modal_data: return inputs + if "multi_modal_placeholders" in inputs and "audio" in inputs[ + "multi_modal_placeholders"]: + # The inputs already have placeholders. 
+ return inputs + feature_extractor = whisper_feature_extractor(ctx) audios = multi_modal_data["audio"] if not isinstance(audios, list): @@ -197,7 +199,7 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -208,7 +210,8 @@ def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"audio": ranges}) class StackAudioFrames(nn.Module): @@ -336,12 +339,10 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_ultravox) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: UltravoxConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional["QuantizationConfig"] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multi_modal_config = multimodal_config assert self.multi_modal_config @@ -349,6 +350,8 @@ def __init__(self, self.secondary_weights = [] self.audio_tower = ModifiedWhisperEncoder(config.audio_config) if config.audio_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source( model_or_path=config.audio_model_id, @@ -358,10 +361,11 @@ def __init__(self, self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, - prefix="language_model") + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) if config.text_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source(model_or_path=config.text_model_id, revision=None, @@ -375,7 +379,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _audio_features_to_embeddings( self, input_features: torch.Tensor) -> torch.Tensor: @@ -472,9 +476,9 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, inputs_embeds = self.language_model.model.get_input_embeddings( input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, audio_embeddings, - _AUDIO_PLACEHOLDER_TOKEN) + merge_multimodal_embeddings_from_map( + inputs_embeds, audio_embeddings, + attn_metadata.multi_modal_placeholder_index_maps["audio"]) input_ids = None else: inputs_embeds = None diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0aecb5d151a45..1fc6c1be4b7bb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -11,14 +11,11 @@ import vllm.envs as envs from vllm.attention.selector import (_Backend, backend_name_to_enum, get_global_forced_attn_backend) -from vllm.config import (CacheConfig, LoRAConfig, 
MultiModalConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.model_loader.loader import build_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry -from vllm.multimodal.base import NestedTensors +from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available @@ -236,12 +233,7 @@ def load_weights( def init_vllm_registered_model( hf_config: PretrainedConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - *, - lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: """ @@ -250,14 +242,8 @@ def init_vllm_registered_model( """ model_class, _ = ModelRegistry.resolve_model_cls(hf_config.architectures) - return build_model( - model_class, - hf_config, - cache_config, - quant_config, - lora_config=lora_config, - multimodal_config=multimodal_config, - scheduler_config=scheduler_config, + return model_class( + vllm_config=vllm_config.with_hf_config(hf_config), prefix=prefix, ) @@ -326,6 +312,22 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) +def merge_multimodal_embeddings_from_map( + inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, + placeholder_map: MultiModalPlaceholderMap.IndexMap) -> torch.Tensor: + """ + Merge ``multimodal_embeddings`` into ``inputs_embeds`` using the provided + placeholder map . + + Note: + This updates ``inputs_embeds`` in place. + """ + flattened_embeddings = _flatten_embeddings(multimodal_embeddings) + inputs_embeds[placeholder_map.dest] = flattened_embeddings[ + placeholder_map.src] + return inputs_embeds + + def _merge_multimodal_embeddings( inputs_embeds: torch.Tensor, is_multimodal: torch.Tensor, diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 036789642d3c4..153527da20d75 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://huggingface.co/xverse/XVERSE-7B/blob/main/modeling_xverse.py # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
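The new merge_multimodal_embeddings_from_map helper in utils.py above reduces to an in-place row scatter: positions listed in the placeholder map's dest indices are overwritten by the selected src rows of the flattened multi-modal embeddings. A toy example with made-up shapes:

import torch

hidden_size = 4
inputs_embeds = torch.zeros(6, hidden_size)  # 6 prompt positions
mm_embeds = torch.arange(3 * hidden_size, dtype=torch.float32).view(3, hidden_size)

src = [0, 1, 2]   # rows of the flattened multi-modal embeddings to take
dest = [1, 2, 4]  # placeholder positions in the prompt that they overwrite

inputs_embeds[dest] = mm_embeds[src]  # in-place, like the helper
assert torch.equal(inputs_embeds[1], mm_embeds[0])
assert torch.equal(inputs_embeds[4], mm_embeds[2])
assert torch.all(inputs_embeds[0] == 0)  # non-placeholder rows are untouched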
@@ -28,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -38,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -47,7 +46,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class XverseMLP(nn.Module): @@ -224,15 +224,12 @@ def forward( @support_torch_compile class XverseModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -315,27 +312,26 @@ class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = XverseModel(config, cache_config, quant_config) + self.model = XverseModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index ee02368bec8a8..84f35f75a0c32 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -284,7 +284,8 @@ def _prepare_seq_groups( else: # Decode prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None else 1 + query_len = query_lens[i] if query_lens is not None and 
len( + query_lens) > 0 else 1 sample_len = len(seq_ids) * query_len if do_sample else 0 if sampling_params.seed is not None and generators is not None: diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 489e1e51f05cb..03a5f3a91f7a1 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,6 +1,8 @@ -from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalInputs, MultiModalPlugin, - NestedTensors) +from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .inputs import (BatchedTensorInputs, MultiModalData, + MultiModalDataBuiltins, MultiModalDataDict, + MultiModalKwargs, MultiModalPlaceholderDict, + NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -14,11 +16,29 @@ __all__ = [ "BatchedTensorInputs", + "MultiModalData", "MultiModalDataBuiltins", "MultiModalDataDict", - "MultiModalInputs", + "MultiModalKwargs", + "MultiModalPlaceholderDict", + "MultiModalPlaceholderMap", "MultiModalPlugin", "NestedTensors", "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 04d71826f29fa..1a230602966d4 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,7 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin + +from .base import MultiModalPlugin +from .inputs import AudioItem, MultiModalData, MultiModalKwargs class AudioPlugin(MultiModalPlugin): @@ -8,8 +10,12 @@ class AudioPlugin(MultiModalPlugin): def get_data_key(self) -> str: return "audio" - def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalInputs: + def _default_input_mapper( + self, + ctx: InputContext, + data: MultiModalData[AudioItem], + **mm_processor_kwargs, + ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 84e71cbf60df7..6eec660e42ac4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,158 +1,26 @@ -import sys from abc import ABC, abstractmethod -from collections import UserDict, defaultdict -from typing import (Any, Callable, Dict, List, Mapping, Optional, Tuple, Type, - TypedDict, TypeVar, Union, cast, final) - -import numpy as np -import torch -import torch.types -from PIL import Image +from collections import defaultdict +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Sequence, Tuple, Type, TypeVar, Union) + from torch import nn -from typing_extensions import TypeAlias -from vllm.config import ModelConfig from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of, - json_map_leaves, resolve_mm_processor_kwargs) - -logger = init_logger(__name__) - -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] -""" -Uses a list instead of a tensor if the 
dimensions of each element do not match. -""" - -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] -""" -A dictionary containing nested tensors which have been batched via -:meth:`MultiModalInputs.batch`. -""" - -if sys.version_info < (3, 9): - # UserDict cannot be subscripted - class _MultiModalInputsBase(UserDict): - pass -else: - - class _MultiModalInputsBase(UserDict[str, NestedTensors]): - pass - - -class MultiModalInputs(_MultiModalInputsBase): - """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - """ - - @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: - """ - Recursively stacks lists of tensors when they all have the same shape. - """ - if isinstance(nested_tensors, torch.Tensor): - return nested_tensors - - if isinstance(nested_tensors, np.ndarray): - return torch.from_numpy(nested_tensors) - - if isinstance(nested_tensors, (int, float)): - return torch.tensor(nested_tensors) - - stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors] - if not is_list_of(stacked, torch.Tensor, check="all"): - # Only tensors (not lists) can be stacked. - return stacked - - tensors_ = cast(List[torch.Tensor], stacked) - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ - - return torch.stack(tensors_) - - @staticmethod - def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: - """ - Batch multiple inputs together into a dictionary. +from vllm.utils import (get_allowed_kwarg_only_overrides, + resolve_mm_processor_kwargs) - The resulting dictionary has the same keys as the inputs. - If the corresponding value from each input is a tensor and they all - share the same shape, the output value is a single batched tensor; - otherwise, the output value is a list containing the original value - from each input. - """ - if len(inputs_list) == 0: - return {} - - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) - - for inputs in inputs_list: - # For models that supports multiple modalities (e.g. Qwen2-VL), - # different modalities will return different data keys, - # so batch() should skip the same key check. - - for k, v in inputs.items(): - item_lists[k].append(v) - - return { - k: MultiModalInputs._try_stack(item_list) - for k, item_list in item_lists.items() - } - - @staticmethod - def as_kwargs( - batched_inputs: BatchedTensorInputs, - *, - device: torch.types.Device, - ) -> BatchedTensorInputs: - json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - - json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), - json_inputs, - ) - - return cast(BatchedTensorInputs, json_mapped) - - -_T = TypeVar("_T") - -MultiModalData: TypeAlias = Union[_T, List[_T]] -""" -Either a single data instance, or a list of data instances. - -The number of data instances allowed per modality is restricted by -`--limit-mm-per-prompt`. 
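The stacking and batching logic removed from base.py here is carried over essentially unchanged into the new MultiModalKwargs class in vllm/multimodal/inputs.py further down in this patch: per-key values whose tensors share a shape are stacked into one batched tensor, while mismatched shapes are kept as a list. A short usage sketch, assuming the post-patch import path from the updated vllm/multimodal/__init__.py:

import torch

from vllm.multimodal import MultiModalKwargs

# Same shapes: the values are stacked into a single (batch, ...) tensor.
batched = MultiModalKwargs.batch([
    MultiModalKwargs({"pixel_values": torch.zeros(2, 3)}),
    MultiModalKwargs({"pixel_values": torch.ones(2, 3)}),
])
assert batched["pixel_values"].shape == (2, 2, 3)

# Mismatched shapes: the values are left as a list instead.
ragged = MultiModalKwargs.batch([
    MultiModalKwargs({"pixel_values": torch.zeros(2, 3)}),
    MultiModalKwargs({"pixel_values": torch.zeros(5, 3)}),
])
assert isinstance(ragged["pixel_values"], list)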
-""" - - -@final -class MultiModalDataBuiltins(TypedDict, total=False): - """Modality types that are predefined by vLLM.""" - - image: MultiModalData[Image.Image] - """The input image(s).""" +if TYPE_CHECKING: + from vllm.config import ModelConfig + from vllm.sequence import SequenceGroupMetadata - audio: MultiModalData[Tuple[np.ndarray, Union[int, float]]] - """The input audio item(s) and corresponding sampling rate(s).""" +from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, + PlaceholderRange) - -MultiModalDataDict = Union[MultiModalDataBuiltins, - Mapping[str, MultiModalData[object]]] -""" -A dictionary containing an item for each modality type to input. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin is registered - through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" +logger = init_logger(__name__) MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], - MultiModalInputs] + MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers @@ -167,6 +35,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): model. This does not include tokens that correspond to the input text. """ +_T = TypeVar("_T") N = TypeVar("N", bound=Type[nn.Module]) @@ -199,9 +68,9 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[Any], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to @@ -234,18 +103,23 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already has an input mapper " "registered to %s. It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) - self._input_mappers[model_cls] = mapper \ - or self._default_input_mapper + self._input_mappers[model_cls] = (mapper + or self._default_input_mapper) return model_cls return wrapper - def map_input(self, model_config: ModelConfig, - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any]) -> MultiModalInputs: + def map_input( + self, + model_config: "ModelConfig", + data: MultiModalData[Any], + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -259,6 +133,7 @@ def map_input(self, model_config: ModelConfig, - :ref:`input_processing_pipeline` - :ref:`enabling_multimodal_inputs` """ + # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture @@ -270,6 +145,9 @@ def map_input(self, model_config: ModelConfig, raise KeyError(f"No input mapper in {self} is registered for " f"model class {model_cls.__name__}.") + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + # In the case of the default mapper, we have to get resource # processor through its HuggingFace autoclass; since this goes # through **kwargs, we can't inspect it the same way, so we allow @@ -320,19 +198,21 @@ def wrapper(model_cls: N) -> N: logger.warning( "Model class %s already calculates maximum number of " "tokens in %s. 
It is overwritten by the new one.", - model_cls, self) + model_cls, + self, + ) if isinstance(max_mm_tokens, int): self._validate_max_multimodal_tokens(max_mm_tokens) - self._max_mm_tokens[model_cls] = max_mm_tokens \ - or self._default_max_multimodal_tokens + self._max_mm_tokens[model_cls] = ( + max_mm_tokens or self._default_max_multimodal_tokens) return model_cls return wrapper - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. @@ -366,3 +246,205 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: self._validate_max_multimodal_tokens(max_mm_tokens) return max_mm_tokens + + +class MultiModalPlaceholderMap: + """ + Relates multi-modal embeddings to their corresponding placeholders. + """ + + class IndexMap(NamedTuple): + src: List[int] + dest: List[int] + + src_ranges: List[range] + """ + The indices of the multi-modal embeddings that will replace the + corresponding placeholder embeddings pointed to by ``dest_ranges``. + """ + + src_len: int + """ + The total number of flattened multi-modal embeddings. + """ + + dest_ranges: List[range] + """ + The indices of the placeholder embeddings that will be replaced by the + multimodal embeddings. + """ + + dest_len: int + """ + The total number of embeddings in the destination tensor. + """ + + def __init__(self): + self.src_ranges = [] + self.src_len = 0 + self.dest_ranges = [] + self.dest_len = 0 + + @classmethod + def from_seq_group( + cls, seq_group: "SequenceGroupMetadata", positions: range + ) -> Tuple[Optional[MultiModalDataDict], Dict[str, + "MultiModalPlaceholderMap"]]: + """ + Returns the multi-modal items that intersect with the portion of a + prompt (``seq_group``) represented by ``positions``, as well as a + ``MultiModalPlaceholderMap`` that relates the multi-modal embedding + vectors to their corresponding placeholders. + + Consider the following scenarios: + + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| + + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | + + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... 
| + + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] + + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| + + images = [] + src_ranges = [] + dest_ranges = [] + """ + if (not seq_group.multi_modal_data + or not seq_group.multi_modal_placeholders): + return seq_group.multi_modal_data, {} + + mm_data = {**seq_group.multi_modal_data} + placeholder_maps: Dict[str, MultiModalPlaceholderMap] = defaultdict( + MultiModalPlaceholderMap) + + for ( + modality, + placeholders, + ) in seq_group.multi_modal_placeholders.items(): + mm_items = mm_data.pop(modality) + if not isinstance(mm_items, list): + mm_items = [mm_items] + + if positions: + intersecting_items = placeholder_maps[ + modality].append_items_from_seq_group( + positions, mm_items, placeholders) + + if intersecting_items: + mm_data[modality] = intersecting_items + + return mm_data, placeholder_maps + + def append_items_from_seq_group( + self, + positions: range, + multi_modal_items: List[_T], + multi_modal_placeholders: Sequence[PlaceholderRange], + ) -> List[_T]: + """ + Adds the multi-modal items that intersect ```positions`` to this + placeholder map and returns the intersecting items. + """ + intersecting_items = [] + + if len(multi_modal_items) != len(multi_modal_placeholders): + raise ValueError( + "Multi-modal placeholders and items must have the same length." + ) + for placeholder_dict, mm_item in zip(multi_modal_placeholders, + multi_modal_items): + placeholder = range( + placeholder_dict["offset"], + placeholder_dict["offset"] + placeholder_dict["length"], + ) + intersection = range( + max(positions.start, placeholder.start), + min(positions.stop, placeholder.stop), + ) + + if not intersection: + # Skip this multi-modal item. + continue + + token_embedding_range = range( + intersection.start - positions.start, + intersection.stop - positions.start, + ) + + multimodal_embedding_range = range( + intersection.start - placeholder.start + self.src_len, + intersection.stop - placeholder.start + self.src_len, + ) + + intersecting_items.append(mm_item) + self.dest_ranges.append(token_embedding_range) + self.src_ranges.append(multimodal_embedding_range) + self.src_len += len(placeholder) + + self.dest_len += len(positions) + return intersecting_items + + def extend(self, other: "MultiModalPlaceholderMap"): + """ + Adds the placeholders from another ``MultiModalPlaceholderMap`` to this + instance based on the source and destination tensors being + concatenated. + """ + + self.src_ranges.extend( + range(self.src_len + r.start, self.src_len + r.stop) + for r in other.src_ranges) + self.src_len += other.src_len + self.dest_ranges.extend( + range(self.dest_len + r.start, self.dest_len + r.stop) + for r in other.dest_ranges) + self.dest_len += other.dest_len + + def index_map(self) -> "IndexMap": + """ + Finalizes the placeholder map into lists of indices that can be used to + index the source and destination tensors. + """ + + src_indices = [i for r in self.src_ranges for i in r] + dest_indices = [i for r in self.dest_ranges for i in r] + + if len(src_indices) != len(dest_indices): + raise ValueError( + f"The number of source ({len(src_indices)}) and destination " + f"indices ({len(dest_indices)}) must be the same.") + + return MultiModalPlaceholderMap.IndexMap(src=src_indices, + dest=dest_indices) + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. 
" + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 5f74bcea65ce2..97bbce1ce1570 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,17 +1,19 @@ from functools import lru_cache -from typing import Any, Dict, Optional +from typing import TYPE_CHECKING, Any, Dict, Optional import torch from PIL import Image -from transformers.image_processing_base import BatchFeature -from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs, MultiModalPlugin +from .base import MultiModalPlugin +from .inputs import ImageItem, MultiModalData, MultiModalKwargs + +if TYPE_CHECKING: + from vllm.config import ModelConfig logger = init_logger(__name__) @@ -26,7 +28,7 @@ def get_data_key(self) -> str: def _get_hf_image_processor( self, - model_config: ModelConfig, + model_config: "ModelConfig", mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if mm_processor_kwargs is None: @@ -39,15 +41,11 @@ def _get_hf_image_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[ImageItem], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config - # Processed by input processor - if isinstance(data, BatchFeature): - return MultiModalInputs(data.data) - # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): image_processor = self._get_hf_image_processor( @@ -76,11 +74,11 @@ def _default_input_mapper( type(image_processor).__name__) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) # Image embedding elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) raise TypeError(f"Invalid image type: {type(data)}") diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py new file mode 100644 index 0000000000000..64a4c58d5509c --- /dev/null +++ b/vllm/multimodal/inputs.py @@ -0,0 +1,225 @@ +from collections import UserDict, defaultdict +from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, + TypedDict, TypeVar, Union, cast, final) + +import numpy as np +import torch +import torch.types +from PIL.Image import Image +from typing_extensions import TypeAlias + +from vllm.utils import JSONTree, is_list_of, json_map_leaves + +_T = TypeVar("_T") + +# yapf: disable +ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image, +which can be passed to a HuggingFace :code:`ImageProcessor`. +""" + +VideoItem: TypeAlias = Union[ + List[Image], + np.ndarray, + torch.Tensor, + List[np.ndarray], + List[torch.Tensor], +] +""" + +A :class:`transformers.image_utils.VideoInput` representing a single video, +which can be passed to a HuggingFace :code:`VideoProcessor`. 
+""" + +AudioItem: TypeAlias = Union[ + np.ndarray, + List[float], + Tuple[np.ndarray, float], # DEPRECATED: Use mm_processor_kwargs instead +] +""" +Represents a single audio that can be inputted to a HuggingFace +:code:`AudioProcessor`. +""" +# yapf: enable + +MultiModalData: TypeAlias = Union[_T, List[_T]] +""" +Either a single data item, or a list of data items. + +The number of data items allowed per modality is restricted by +:code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalData[ImageItem] + """The input image(s).""" + + video: MultiModalData[VideoItem] + """The input video(s).""" + + audio: MultiModalData[AudioItem] + """The input audio(s).""" + + +MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + For example: + Prompt: AAAA BBBB What is in these images? + Images A and B will have: + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] +""" +Uses a list instead of a tensor if the dimensions of each element do not match. +""" + +BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] +""" +A dictionary containing nested tensors which have been batched via +:meth:`MultiModalKwargs.batch`. +""" + + +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. + """ + + @staticmethod + def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + """ + Stack the inner dimensions that have the same shape in + a nested list of tensors. + + Thus, a dimension represented by a list means that the inner + dimensions are different for each element along that dimension. + """ + if isinstance(nested_tensors, torch.Tensor): + return nested_tensors + + # TODO: Remove these once all models have been migrated + if isinstance(nested_tensors, np.ndarray): + return torch.from_numpy(nested_tensors) + if isinstance(nested_tensors, (int, float)): + return torch.tensor(nested_tensors) + + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + if not is_list_of(stacked, torch.Tensor, check="all"): + # Only tensors (not lists) can be stacked. + return stacked + + tensors_ = cast(List[torch.Tensor], stacked) + if any(t.shape != tensors_[0].shape for t in tensors_): + # The tensors have incompatible shapes and can't be stacked. + return tensors_ + + return torch.stack(tensors_) + + @staticmethod + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + """ + Batch multiple inputs together into a dictionary. + + The resulting dictionary has the same keys as the inputs. 
+ If the corresponding value from each input is a tensor and they all + share the same shape, the output value is a single batched tensor; + otherwise, the output value is a list containing the original value + from each input. + """ + if len(inputs_list) == 0: + return {} + + # We need to consider the case where each item in the batch + # contains different modalities (i.e. different keys). + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + + for inputs in inputs_list: + for k, v in inputs.items(): + item_lists[k].append(v) + + return { + k: MultiModalKwargs._try_stack(item_list) + for k, item_list in item_lists.items() + } + + @staticmethod + def as_kwargs( + batched_inputs: BatchedTensorInputs, + *, + device: torch.types.Device, + ) -> BatchedTensorInputs: + json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + + json_mapped = json_map_leaves( + lambda x: x.to(device, non_blocking=True), + json_inputs, + ) + + return cast(BatchedTensorInputs, json_mapped) + + +MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +""" +A dictionary containing placeholder ranges. +""" + + +class MultiModalInputsV2(TypedDict): + """ + Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + ready to be passed to vLLM internals. + """ + + type: Literal["multimodal"] + """The type of inputs.""" + + prompt: str + """ + The original, unprocessed prompt text. + + Note: + Since prompt text is not required by vLLM internals, we leave this + unprocessed to save CPU computation. You can still call + :code:`tokenizer.decode(prompt_token_ids)` to get the processed text. + """ + + prompt_token_ids: List[int] + """The processed token IDs which includes placeholder tokens.""" + + mm_kwargs: MultiModalKwargs + """Keyword arguments to be directly passed to the model after batching.""" + + mm_placeholders: MultiModalPlaceholderDict + """ + For each modality, information about the placeholder tokens in + :code:`prompt_token_ids`. + """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py new file mode 100644 index 0000000000000..88a924da174a6 --- /dev/null +++ b/vllm/multimodal/processing.py @@ -0,0 +1,273 @@ +from dataclasses import dataclass +from functools import lru_cache, partial +from typing import (Any, Callable, Collection, Generic, List, Mapping, + Optional, TypedDict, TypeVar, final) + +from transformers import BatchFeature +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import is_list_of + +from .inputs import (AudioItem, ImageItem, MultiModalDataDict, + MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, + VideoItem) + +_T = TypeVar("_T") + +ReplacementFunc: TypeAlias = Callable[[_T, BatchFeature, int], List[int]] +""" +Given the original data item, HF-processed data, and index of the processed +item, output the replacement token IDs to be allocated in vLLM. +""" + + +@dataclass +class ModalityProcessingMetadata(Generic[_T]): + placeholder_replacements: Mapping[str, ReplacementFunc] + """ + A dictionary where each item represents the original placeholder in the + prompt text and the corresponding replacement. 
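# A minimal illustrative sketch, not taken from this diff: combining the two
# static helpers above. Same-shaped "pixel_values" tensors collapse into one
# batched tensor, ragged shapes stay a list, and as_kwargs moves the result
# onto the target device. The key name and the CPU device are assumptions.
import torch

from vllm.multimodal.inputs import MultiModalKwargs

same = MultiModalKwargs.batch([
    MultiModalKwargs({"pixel_values": torch.zeros(3, 224, 224)}),
    MultiModalKwargs({"pixel_values": torch.ones(3, 224, 224)}),
])
ragged = MultiModalKwargs.batch([
    MultiModalKwargs({"pixel_values": torch.zeros(3, 224, 224)}),
    MultiModalKwargs({"pixel_values": torch.zeros(3, 448, 448)}),
])

kwargs = MultiModalKwargs.as_kwargs(same, device="cpu")
print(kwargs["pixel_values"].shape)  # torch.Size([2, 3, 224, 224])
print(type(ragged["pixel_values"]))  # list: shapes differ, so no stacking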
+ """ + + +class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: ModalityProcessingMetadata[ImageItem] + video: ModalityProcessingMetadata[VideoItem] + audio: ModalityProcessingMetadata[AudioItem] + + +MultiModalProcessingMetadata: TypeAlias = \ + Mapping[str, ModalityProcessingMetadata[Any]] +""" +A dictionary containing an entry for each modality type to process. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + +MultiModalMultiData: TypeAlias = List[_T] +""" +A list of data items, where the number of data items allowed +per modality is restricted by :code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalMultiDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalMultiData[ImageItem] + """The input images.""" + + video: MultiModalMultiData[VideoItem] + """The input videos.""" + + audio: MultiModalMultiData[AudioItem] + """The input audios.""" + + +MultiModalMultiDataDict: TypeAlias = Mapping[str, MultiModalMultiData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalMultiDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict: + """ + Convert a :class:`MultiModalDataDict` containing single data items + to a :class:`MultiModalMultiDataDict` containing multiple data items + per entry. + """ + multi_data: Mapping[str, MultiModalMultiData[Any]] = {} + + for k, v in data.items(): + # yapf: disable + if k == "video": + # Special case since even a single item can be a list + multi_data[k] = v if is_list_of(v, list) else [v] # type: ignore[index] + elif k in ("image", "audio"): + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + else: + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + # yapf: enable + + return multi_data + + +def encode_no_special_tokens( + tokenizer: AnyTokenizer, + text: str, +) -> List[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=False)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, bos=False, eos=False) + + return tokenizer.encode(text, add_special_tokens=False) + + +@lru_cache +def candidate_placeholders( + tokenizer: AnyTokenizer, + placeholder_text: str, +) -> Collection[List[int]]: + """Generate token ID sequences that may represent a placeholder text.""" + # When the placeholder text is not mapped to a special token ID, + # it may be tokenized differently based on whether it is at the start/end + # of the string. 
So, we go through each combination of whether the text + # is at the start and end boundaries of the string + + # Matches the placeholder when it is in the middle of the string + start_id, = encode_no_special_tokens(tokenizer, "a") + end_id, = encode_no_special_tokens(tokenizer, "b") + + candidate_basic = encode_no_special_tokens(tokenizer, placeholder_text) + + start_id_, *candidate_a = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}", + ) + assert start_id == start_id_ + + start_id_, *candidate_ab, end_id_ = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}b", + ) + assert start_id == start_id_ and end_id == end_id_ + + *candidate_b, end_id_ = encode_no_special_tokens( + tokenizer, + f"{placeholder_text}b", + ) + assert end_id == end_id_ + + # Remove duplicates (need to convert to tuple to be hashable) + unique_candidates = { + tuple(c) + for c in [candidate_basic, candidate_a, candidate_ab, candidate_b] + } + + # Convert back to list + return [list(c) for c in unique_candidates] + + +def apply_placeholders( + token_ids: List[int], + placeholder_ids: List[int], + get_replacement_ids: Callable[[], List[int]], +) -> Optional[PlaceholderRange]: + """ + Find the first occurrence of :code:`placeholder_ids`, + and replace it with the output of :code:`get_replacement_ids`. + + This function updates :code:`token_ids` in place. + """ + placeholder_length = len(placeholder_ids) + + for start_idx in range(len(token_ids) - placeholder_length + 1): + if token_ids[start_idx:placeholder_length] == placeholder_ids: + token_ids[start_idx:placeholder_length] = get_replacement_ids() + + return PlaceholderRange(offset=start_idx, + length=placeholder_length) + + return None + + +class MultiModalProcessor: + """ + Helper class to process multi-modal inputs to be used in vLLM. 
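# A minimal illustrative sketch, not taken from this diff: the token-level
# find-and-replace idea behind apply_placeholders above, written over plain
# lists. The returned (offset, length) pair plays the role of
# PlaceholderRange; token id 99 stands in for a placeholder token and 7 for
# the expanded image-feature tokens.
from typing import List, Optional, Tuple

def replace_first(token_ids: List[int], placeholder: List[int],
                  replacement: List[int]) -> Optional[Tuple[int, int]]:
    n = len(placeholder)
    for start in range(len(token_ids) - n + 1):
        if token_ids[start:start + n] == placeholder:
            token_ids[start:start + n] = replacement
            return (start, n)  # offset and length of the matched placeholder
    return None

tokens = [1, 2, 99, 3]
print(replace_first(tokens, [99], [7, 7, 7]))  # (2, 1)
print(tokens)                                  # [1, 2, 7, 7, 7, 3]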
+ """ + + def __init__( + self, + ctx: InputProcessingContext, + metadata: MultiModalProcessingMetadata, + ) -> None: + super().__init__() + + self.ctx = ctx + self.metadata = metadata + + def __call__( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + return self.apply(prompt, mm_data, mm_processor_kwargs) + + def apply( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + tokenizer = self.ctx.tokenizer + hf_processor = self.ctx.get_hf_processor() + + processed_inputs = hf_processor( + text=prompt, # type: ignore + **mm_data, + **mm_processor_kwargs, + ) + new_token_ids, = processed_inputs.pop("input_ids").tolist() + mm_kwargs = MultiModalKwargs(processed_inputs) + + mm_placeholders: Mapping[str, List[PlaceholderRange]] = {} + + for modality, orig_inputs in to_multi_format(mm_data).items(): + assert isinstance(orig_inputs, list) + + metadata = self.metadata[modality] + placeholder_replacements = metadata.placeholder_replacements + + modality_placeholders: List[PlaceholderRange] = [] + + for item_idx, orig_item in enumerate(orig_inputs): + for match_text, replace_fn in placeholder_replacements.items(): + candidates = candidate_placeholders(tokenizer, match_text) + get_replacement_ids = partial( + replace_fn, + orig_item, + processed_inputs, + item_idx, + ) + + for match_ids in candidates: + # TODO(youkaichao): Don't update new_token_ids + placeholders = apply_placeholders( + new_token_ids, + match_ids, + get_replacement_ids, + ) + + if placeholders is not None: + modality_placeholders.append(placeholders) + + # yapf: disable + mm_placeholders[modality] = modality_placeholders # type: ignore[index] + # yapf: enable + + return MultiModalInputsV2( + type="multimodal", + prompt=prompt, + prompt_token_ids=new_token_ids, + mm_kwargs=mm_kwargs, + mm_placeholders=mm_placeholders, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 5e9b8bd518de3..b992442d3b314 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,26 +1,45 @@ import functools from collections import UserDict -from typing import Any, Dict, Mapping, Optional, Sequence +from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, + Sequence, Type, TypeVar) -from vllm.config import ModelConfig +import torch.nn as nn +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, - MultiModalPlugin, MultiModalTokensCalc, NestedTensors) +from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin +from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors +from .processing import MultiModalProcessor from .video import VideoPlugin +if TYPE_CHECKING: + from vllm.config import ModelConfig + logger = init_logger(__name__) +N = TypeVar("N", bound=Type[nn.Module]) + +MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], + MultiModalProcessor] +""" +Constructs a :class:`MultiModalProcessor` instance from the context. + +The processing metadata should be derived from the context. 
+""" -class _MultiModalLimits(UserDict): + +class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message when attempting to access a model that does not exist. """ - def __getitem__(self, key: ModelConfig) -> Dict[str, int]: + def __getitem__(self, key: "ModelConfig") -> Dict[str, int]: try: return super().__getitem__(key) except KeyError as exc: @@ -43,6 +62,9 @@ def __init__( plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: self._plugins = {p.get_data_key(): p for p in plugins} + self._processor_factories: Dict[Type[nn.Module], + MultiModalProcessorFactory] = {} + # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -98,10 +120,10 @@ def register_image_input_mapper( def map_input( self, - model_config: ModelConfig, + model_config: "ModelConfig", data: MultiModalDataDict, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. @@ -137,9 +159,9 @@ def map_input( merged_dict[input_key] = input_tensor - return MultiModalInputs(merged_dict) + return MultiModalKwargs(merged_dict) - def create_input_mapper(self, model_config: ModelConfig): + def create_input_mapper(self, model_config: "ModelConfig"): """ Create an input mapper (see :meth:`map_input`) for a specific model. """ @@ -177,7 +199,7 @@ def register_max_image_tokens( """ return self.register_max_multimodal_tokens("image", max_mm_tokens) - def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: + def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: """ Get the maximum number of multi-modal tokens for profiling the memory usage of a model. @@ -195,7 +217,7 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int: def init_mm_limits_per_prompt( self, - model_config: ModelConfig, + model_config: "ModelConfig", ) -> None: """ Initialize the maximum number of multi-modal input instances for each @@ -231,7 +253,7 @@ def init_mm_limits_per_prompt( def get_mm_limits_per_prompt( self, - model_config: ModelConfig, + model_config: "ModelConfig", ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality @@ -241,3 +263,59 @@ def get_mm_limits_per_prompt( This should be called after :meth:`init_mm_limits_per_prompt`. """ return self._limits_by_model[model_config] + + def register_processor( + self, + factory: MultiModalProcessorFactory, + ): + """ + Register a multi-modal processor to a model class. + + When the model receives multi-modal data, the provided function is + invoked to transform the data into a dictionary of model inputs. + + See also: + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` + """ + + def wrapper(model_cls: N) -> N: + if model_cls in self._processor_factories: + logger.warning( + "Model class %s already has an input mapper " + "registered to %s. It is overwritten by the new one.", + model_cls, self) + + self._processor_factories[model_cls] = factory + + return model_cls + + return wrapper + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. 
+ """ + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + return model_cls in self._processor_factories + + def create_processor( + self, + model_config: "ModelConfig", + tokenizer: AnyTokenizer, + ) -> MultiModalProcessor: + """ + Create a multi-modal processor for a specific model and tokenizer. + """ + + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + processor_factory = self._processor_factories[model_cls] + + ctx = InputProcessingContext(model_config, tokenizer) + return processor_factory(ctx) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3c801464383ad..40194716bbf94 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ import base64 +import os from functools import lru_cache from io import BytesIO from typing import Any, List, Optional, Tuple, TypeVar, Union @@ -7,30 +8,72 @@ import numpy.typing as npt from PIL import Image +import vllm.envs as envs from vllm.connections import global_http_connection -from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT from vllm.logger import init_logger -from vllm.multimodal.base import MultiModalDataDict from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .inputs import MultiModalDataDict, PlaceholderRange + logger = init_logger(__name__) cached_get_tokenizer = lru_cache(get_tokenizer) -def _load_image_from_bytes(b: bytes): +def _load_image_from_bytes(b: bytes) -> Image.Image: image = Image.open(BytesIO(b)) image.load() return image -def _load_image_from_data_url(image_url: str): +def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool: + # Get the common path + common_path = os.path.commonpath([ + os.path.abspath(image_path), + os.path.abspath(allowed_local_media_path) + ]) + # Check if the common path is the same as allowed_local_media_path + return common_path == os.path.abspath(allowed_local_media_path) + + +def _load_image_from_file(image_url: str, + allowed_local_media_path: str) -> Image.Image: + if not allowed_local_media_path: + raise ValueError("Invalid 'image_url': Cannot load local files without" + "'--allowed-local-media-path'.") + if allowed_local_media_path: + if not os.path.exists(allowed_local_media_path): + raise ValueError( + "Invalid '--allowed-local-media-path': " + f"The path {allowed_local_media_path} does not exist.") + if not os.path.isdir(allowed_local_media_path): + raise ValueError( + "Invalid '--allowed-local-media-path': " + f"The path {allowed_local_media_path} must be a directory.") + + # Only split once and assume the second part is the image path + _, image_path = image_url.split("file://", 1) + if not _is_subpath(image_path, allowed_local_media_path): + raise ValueError( + f"Invalid 'image_url': The file path {image_path} must" + " be a subpath of '--allowed-local-media-path'" + f" '{allowed_local_media_path}'.") + + image = Image.open(image_path) + image.load() + return image + + +def _load_image_from_data_url(image_url: str) -> Image.Image: # Only split once and assume the second part is the base64 encoded image _, image_base64 = image_url.split(",", 1) return load_image_from_base64(image_base64) -def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image: +def fetch_image(image_url: str, + *, + image_mode: str = "RGB", + allowed_local_media_path: str = "") -> Image.Image: 
""" Load a PIL image from a HTTP or base64 data URL. @@ -38,21 +81,26 @@ def fetch_image(image_url: str, *, image_mode: str = "RGB") -> Image.Image: """ if image_url.startswith('http'): image_raw = global_http_connection.get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image_url, + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) + elif image_url.startswith('file://'): + image = _load_image_from_file(image_url, allowed_local_media_path) else: raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image' or 'http'.") + "with either 'data:image', 'file://' or 'http'.") return image.convert(image_mode) async def async_fetch_image(image_url: str, *, - image_mode: str = "RGB") -> Image.Image: + image_mode: str = "RGB", + allowed_local_media_path: str = "") -> Image.Image: """ Asynchronously load a PIL image from a HTTP or base64 data URL. @@ -60,25 +108,108 @@ async def async_fetch_image(image_url: str, """ if image_url.startswith('http'): image_raw = await global_http_connection.async_get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image_url, + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) + elif image_url.startswith('file://'): + image = _load_image_from_file(image_url, allowed_local_media_path) else: raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image' or 'http'.") + "with either 'data:image', 'file://' or 'http'.") return image.convert(image_mode) +def _load_video_frames_from_bytes(b: bytes): + frame = Image.open(BytesIO(b)) + return np.array(frame) + + +def load_video_frames_from_base64(frame: Union[bytes, str]): + """Load frame from base64 format.""" + return _load_video_frames_from_bytes(base64.b64decode(frame)) + + +def _load_video_from_bytes(b: bytes, num_frames: int = 32): + _, decord = try_import_video_packages() + + video_path = BytesIO(b) + vr = decord.VideoReader(video_path, num_threads=1) + total_frame_num = len(vr) + + if total_frame_num > num_frames: + uniform_sampled_frames = np.linspace(0, + total_frame_num - 1, + num_frames, + dtype=int) + frame_idx = uniform_sampled_frames.tolist() + else: + frame_idx = [i for i in range(0, total_frame_num)] + frames = vr.get_batch(frame_idx).asnumpy() + + return frames + + +def _load_video_from_data_url(video_url: str): + # Only split once and assume the second part is the base64 encoded image + frames_base64 = video_url.split(",")[1:] + return np.stack([ + load_video_frames_from_base64(frame_base64) + for frame_base64 in frames_base64 + ]) + + +def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: + """ + Load video from a HTTP or base64 data URL. + """ + if video_url.startswith('http') or video_url.startswith('https'): + video_raw = global_http_connection.get_bytes( + video_url, + timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + video = _load_video_from_bytes(video_raw, num_frames) + elif video_url.startswith('data:video'): + video = _load_video_from_data_url(video_url) + else: + raise ValueError("Invalid 'video_url': A valid 'video_url' must start " + "with either 'data:video' or 'http'.") + return video + + +async def async_fetch_video(video_url: str, + *, + num_frames: int = 32) -> npt.NDArray: + """ + Asynchronously load video from a HTTP or base64 data URL. 
+ + By default, the image is converted into RGB format. + """ + if video_url.startswith('http') or video_url.startswith('https'): + video_raw = await global_http_connection.async_get_bytes( + video_url, + timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + video = _load_video_from_bytes(video_raw, num_frames) + elif video_url.startswith('data:video'): + video = _load_video_from_data_url(video_url) + else: + raise ValueError("Invalid 'video_url': A valid 'video_url' must start " + "with either 'data:video' or 'http'.") + return video + + def try_import_audio_packages() -> Tuple[Any, Any]: try: import librosa import soundfile - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc return librosa, soundfile @@ -90,7 +221,9 @@ def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: if audio_url.startswith("http"): audio_bytes = global_http_connection.get_bytes( - audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) + audio_url, + timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) elif audio_url.startswith("data:audio"): _, audio_base64 = audio_url.split(",", 1) audio_bytes = base64.b64decode(audio_base64) @@ -110,7 +243,9 @@ async def async_fetch_audio( if audio_url.startswith("http"): audio_bytes = await global_http_connection.async_get_bytes( - audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) + audio_url, + timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) elif audio_url.startswith("data:audio"): _, audio_base64 = audio_url.split(",", 1) audio_bytes = base64.b64decode(audio_base64) @@ -126,21 +261,39 @@ def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: return {"audio": (audio, sr)} -def get_and_parse_image(image_url: str) -> MultiModalDataDict: - image = fetch_image(image_url) +def get_and_parse_image( + image_url: str, + *, + allowed_local_media_path: str = "") -> MultiModalDataDict: + image = fetch_image(image_url, + allowed_local_media_path=allowed_local_media_path) return {"image": image} +def get_and_parse_video(video_url: str) -> MultiModalDataDict: + video = fetch_video(video_url) + return {"video": video} + + async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: audio, sr = await async_fetch_audio(audio_url) return {"audio": (audio, sr)} -async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict: - image = await async_fetch_image(image_url) +async def async_get_and_parse_image( + image_url: str, + *, + allowed_local_media_path: str = "") -> MultiModalDataDict: + image = await async_fetch_image( + image_url, allowed_local_media_path=allowed_local_media_path) return {"image": image} +async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict: + video = await async_fetch_video(video_url) + return {"video": video} + + def encode_audio_base64( audio: np.ndarray, sampling_rate: int, @@ -191,14 +344,15 @@ def rescale_image_size(image: Image.Image, def try_import_video_packages() -> Any: try: import cv2 - except ImportError: + import decord + except ImportError as exc: raise ImportError( - "Please install vllm[video] for video support.") from None - return cv2 + "Please install vllm[video] for video support.") from exc + return cv2, decord def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray: - cv2 = try_import_video_packages() + cv2, _ = try_import_video_packages() num_frames, _, _, channels = frames.shape new_height, new_width = size @@ -229,6 +383,15 @@ def 
sample_frames_from_video(frames: npt.NDArray, return sampled_frames +def encode_video_base64(frames: npt.NDArray): + base64_frames = [] + frames_list = [frames[i] for i in range(frames.shape[0])] + for frame in frames_list: + img_base64 = encode_image_base64(Image.fromarray(frame)) + base64_frames.append(img_base64) + return ",".join(base64_frames) + + # Utilities for input processors _T = TypeVar("_T", str, int) @@ -258,7 +421,7 @@ def repeat_and_pad_placeholder_tokens( repeat_count: Union[int, List[int]], pad_token_left: Optional[int] = None, pad_token_right: Optional[int] = None, -) -> Tuple[Optional[str], List[int]]: +) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]: if isinstance(repeat_count, int): repeat_count = [repeat_count] @@ -301,6 +464,7 @@ def repeat_and_pad_placeholder_tokens( new_prompt += prompt_parts[-1] new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: @@ -310,6 +474,10 @@ def repeat_and_pad_placeholder_tokens( pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 @@ -320,4 +488,14 @@ def repeat_and_pad_placeholder_tokens( else: new_token_ids.append(token) - return new_prompt, new_token_ids + return new_prompt, new_token_ids, placeholder_ranges + + +def consecutive_placeholder_ranges(num_items: int, + item_size: int) -> List[PlaceholderRange]: + """Returns a list of consecutive PlaceholderRanges of a fixed size""" + + return [ + PlaceholderRange(offset=i * item_size, length=item_size) + for i in range(num_items) + ] diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index c3235c4acb6fd..ba9bf58a4a20c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,34 +1,26 @@ from functools import lru_cache -from typing import Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np -from vllm.config import ModelConfig from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs +from .base import MultiModalData from .image import ImagePlugin +from .inputs import MultiModalKwargs, VideoItem + +if TYPE_CHECKING: + from vllm.config import ModelConfig logger = init_logger(__name__) cached_get_video_processor = lru_cache(get_video_processor) cached_get_tokenizer = lru_cache(get_tokenizer) -VideoInput = Union[ - "np.ndarray", # single video input - List["np.ndarray"], - # TODO: support more types - # List[Image.Image], List[List[Image.Image]], - # "torch.Tensor", - # List["torch.Tensor"], - # List[List["np.ndarrray"]], - # List[List["torch.Tensor"]], -] - class VideoPlugin(ImagePlugin): """Plugin for video data.""" @@ -38,7 +30,7 @@ def get_data_key(self) -> str: def _get_hf_video_processor( self, - model_config: ModelConfig, + model_config: "ModelConfig", mm_processor_kwargs: Optional[Dict[str, Any]] = None, ): if mm_processor_kwargs is None: @@ -51,11 +43,14 @@ def _get_hf_video_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[VideoItem], **mm_processor_kwargs, - ) -> MultiModalInputs: + 
) -> MultiModalKwargs: model_config = ctx.model_config + if isinstance(data, list) and len(data) == 1: + data = data[0] # type: ignore + if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray): video_processor = self._get_hf_video_processor( model_config, @@ -74,7 +69,7 @@ def _default_input_mapper( logger.error("Failed to process video (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) raise TypeError(f"Invalid video type: {type(data)}") diff --git a/vllm/outputs.py b/vllm/outputs.py index 951976310e7ae..badf50d0602d6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -83,10 +83,11 @@ class RequestOutput: finished: Whether the whole request is finished. metrics: Metrics associated with the request. lora_request: The LoRA request that was used to generate the output. - encoder_prompt: The encoder prompt string of the request; - None if decoder-only - encoder_prompt_token_ids: The token IDs of the encoder prompt; - None if decoder-only + encoder_prompt: The encoder prompt string of the request. + None if decoder-only. + encoder_prompt_token_ids: The token IDs of the encoder prompt. + None if decoder-only. + num_cached_tokens: The number of tokens with prefix cache hit. """ def __init__( @@ -101,6 +102,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, encoder_prompt: Optional[str] = None, encoder_prompt_token_ids: Optional[List[int]] = None, + num_cached_tokens: Optional[int] = None, ) -> None: self.request_id = request_id self.prompt = prompt @@ -112,6 +114,37 @@ def __init__( self.lora_request = lora_request self.encoder_prompt = encoder_prompt self.encoder_prompt_token_ids = encoder_prompt_token_ids + self.num_cached_tokens = num_cached_tokens + + @classmethod + def new( + cls, + request_id: str, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]], + text: str, + token_ids: List[int], + finished: bool = False, + ) -> "RequestOutput": + """Initialize a new RequestOutput object.""" + + # TODO: Support `n` > 1. 
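# A minimal illustrative sketch, not taken from this diff: reading the new
# num_cached_tokens field from a RequestOutput. The model name and prompt
# are placeholders; with enable_prefix_caching, repeated prefixes that cover
# at least one full KV-cache block may report a nonzero value.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
params = SamplingParams(max_tokens=8)

for out in llm.generate(["The quick brown fox jumps over the lazy dog"] * 2,
                        params):
    print(out.request_id, out.num_cached_tokens)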
+ completion_output = CompletionOutput( + index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=None, # TODO + ) + + return RequestOutput( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=finished, + ) @classmethod def from_seq_group( @@ -162,6 +195,8 @@ def from_seq_group( outputs = [] include_prompt = True + # num_cached_tokens should be the same for all the sequences + num_cached_tokens = None for i, seq in enumerate(top_n_seqs): output_text = seq.get_output_text_to_return( text_buffer_length, delta) @@ -169,6 +204,7 @@ def from_seq_group( output_token_ids = seq.get_output_token_ids_to_return(delta) num_output_tokens = 1 if isinstance(output_token_ids, int) else len(output_token_ids) + num_cached_tokens = seq.data.get_num_cached_tokens() output_logprobs = seq.output_logprobs if include_logprobs else None @@ -242,7 +278,7 @@ def from_seq_group( init_args = (seq_group.request_id, prompt, prompt_token_ids, prompt_logprobs, outputs, finished, seq_group.metrics, seq_group.lora_request, encoder_prompt, - encoder_prompt_token_ids) + encoder_prompt_token_ids, num_cached_tokens) if use_cache: request_output = seq_group.cached_request_output @@ -263,7 +299,8 @@ def __repr__(self) -> str: f"outputs={self.outputs}, " f"finished={self.finished}, " f"metrics={self.metrics}, " - f"lora_request={self.lora_request})") + f"lora_request={self.lora_request}, " + f"num_cached_tokens={self.num_cached_tokens})") class EmbeddingRequestOutput: diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 524150920b854..9e740837381f8 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -42,6 +42,13 @@ except Exception: pass +is_hpu = False +try: + from importlib import util + is_hpu = util.find_spec('habana_frameworks') is not None +except Exception: + pass + is_xpu = False try: @@ -86,6 +93,9 @@ elif is_rocm: from .rocm import RocmPlatform current_platform = RocmPlatform() +elif is_hpu: + from .hpu import HpuPlatform + current_platform = HpuPlatform() elif is_xpu: from .xpu import XPUPlatform current_platform = XPUPlatform() diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py new file mode 100644 index 0000000000000..170cfff94f90d --- /dev/null +++ b/vllm/platforms/hpu.py @@ -0,0 +1,11 @@ +import torch + +from .interface import Platform, PlatformEnum + + +class HpuPlatform(Platform): + _enum = PlatformEnum.HPU + + @staticmethod + def inference_mode(): + return torch.no_grad() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index c3a3e7a284457..81d8bdae2383c 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -10,6 +10,7 @@ class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() + HPU = enum.auto() XPU = enum.auto() CPU = enum.auto() NEURON = enum.auto() @@ -46,6 +47,9 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU + def is_hpu(self) -> bool: + return self._enum == PlatformEnum.HPU + def is_xpu(self) -> bool: return self._enum == PlatformEnum.XPU diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py index 35dbe22abf7ff..31fe3f1fcbfe4 100644 --- a/vllm/platforms/openvino.py +++ b/vllm/platforms/openvino.py @@ -1,10 +1,12 @@ import torch import vllm.envs as envs -from vllm.utils import print_warning_once +from vllm.logger import init_logger from .interface import Platform, PlatformEnum +logger = 
init_logger(__name__) + class OpenVinoPlatform(Platform): _enum = PlatformEnum.OPENVINO @@ -27,5 +29,5 @@ def is_openvino_gpu(self) -> bool: @classmethod def is_pin_memory_available(self) -> bool: - print_warning_once("Pin memory is not supported on OpenViNO.") + logger.warning("Pin memory is not supported on OpenViNO.") return False diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 4338cbc37f6c1..8373e11cfff9f 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,8 +1,14 @@ import logging -from typing import Callable, Optional, Union +from typing import TYPE_CHECKING, Callable, Optional, Union import vllm.envs as envs -from vllm.compilation.config import CompilationConfig + +if TYPE_CHECKING: + from vllm.compilation.config import CompilationConfig + from vllm.config import VllmConfig +else: + CompilationConfig = None + VllmConfig = None logger = logging.getLogger(__name__) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 7461fb51989c6..2635c0bccd1c4 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,7 +7,7 @@ class PoolingParams( msgspec.Struct, omit_defaults=True, # type: ignore[call-arg] array_like=True): # type: ignore[call-arg] - """Pooling parameters for pooling. + """Pooling parameters for embeddings API. Attributes: additional_data: Any additional data needed for pooling. @@ -16,7 +16,7 @@ class PoolingParams( def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" - return PoolingParams(additional_data=self.additional_data, ) + return PoolingParams(additional_data=self.additional_data) def __repr__(self) -> str: return (f"PoolingParams(" diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 4cde2a0254b90..473b87c89c21d 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -37,9 +37,8 @@ def load_peft_weights(model_id: str, Additional arguments to pass to the `hf_hub_download` method when loading from the HuggingFace Hub. 
""" - path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) - if hf_hub_download_kwargs.get("subfolder", None) is not None else - model_id) + path = (os.path.join(model_id, hf_hub_download_kwargs["subfolder"]) if + hf_hub_download_kwargs.get("subfolder") is not None else model_id) if device is None: device = infer_device() @@ -51,19 +50,19 @@ def load_peft_weights(model_id: str, filename = os.path.join(path, WEIGHTS_NAME) use_safetensors = False else: - token = hf_hub_download_kwargs.get("token", None) + token = hf_hub_download_kwargs.get("token") if token is None: - token = hf_hub_download_kwargs.get("use_auth_token", None) + token = hf_hub_download_kwargs.get("use_auth_token") hub_filename = (os.path.join(hf_hub_download_kwargs["subfolder"], SAFETENSORS_WEIGHTS_NAME) - if hf_hub_download_kwargs.get("subfolder", None) - is not None else SAFETENSORS_WEIGHTS_NAME) + if hf_hub_download_kwargs.get("subfolder") is not None + else SAFETENSORS_WEIGHTS_NAME) has_remote_safetensors_file = file_exists( repo_id=model_id, filename=hub_filename, - revision=hf_hub_download_kwargs.get("revision", None), - repo_type=hf_hub_download_kwargs.get("repo_type", None), + revision=hf_hub_download_kwargs.get("revision"), + repo_type=hf_hub_download_kwargs.get("repo_type"), token=token, ) use_safetensors = has_remote_safetensors_file diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 5e191c6e715e0..5c6df5aaf5446 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -485,8 +485,8 @@ def __repr__(self) -> str: f"skip_special_tokens={self.skip_special_tokens}, " "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " - f"truncate_prompt_tokens={self.truncate_prompt_tokens}), " - f"guided_decoding={self.guided_decoding}") + f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " + f"guided_decoding={self.guided_decoding})") class BeamSearchParams( diff --git a/vllm/sequence.py b/vllm/sequence.py index ff59f333f00b4..3b41d25a2fe42 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,24 +5,21 @@ from array import array from collections import defaultdict from dataclasses import dataclass, field -from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional +from functools import reduce +from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union, cast +from typing import Set, Tuple, Union import msgspec import torch -from vllm.inputs.parse import is_encoder_decoder_inputs +from vllm.inputs import SingletonInputs, SingletonInputsAdapter from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams -if TYPE_CHECKING: - from vllm.inputs import SingletonInputs - from vllm.multimodal.base import MultiModalDataDict - VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 @@ -166,6 +163,8 @@ class SequenceData(msgspec.Struct, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 + # The number of tokens with prefix cache hit. 
+ _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) @@ -256,7 +255,8 @@ def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter - def output_token_ids(self, new_output_token_ids: List[int]) -> None: + def output_token_ids(self, + new_output_token_ids: GenericSequence[int]) -> None: self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, new_output_token_ids) self._update_cached_all_tokens() @@ -321,6 +321,14 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int): if self.get_num_uncomputed_tokens() == 0: self._stage = SequenceStage.DECODE + def get_num_cached_tokens(self) -> int: + """Return the number of tokens with prefix cache hit.""" + return self._num_cached_tokens + + def update_num_cached_tokens(self, num_cached_tokens: int): + """Update the number of tokens with prefix cache hit.""" + self._num_cached_tokens = num_cached_tokens + def reset_state_for_recompute(self) -> None: """Reset the number of computed tokens from this sequence. It is supposed to be called when a sequence needs to be started from @@ -378,14 +386,9 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - The sequence is constructed from the :code:`SingletonInputs` instance - passed in through the :code:`inputs` constructor argument. - - For encoder/decoder models, SingletonInputs encapsulates both a - decoder and encoder prompt, creating an ambiguity about which - prompt to construct the sequence from. The `from_decoder_prompt` - constructor argument signals whether to construct the Sequence - from the SingletonInputs decoder prompt, or encoder prompt. + The sequence is constructed from the :data:`DecoderOnlyInputs` + (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the :code:`inputs` constructor argument. Args: seq_id: The ID of the sequence. @@ -395,55 +398,23 @@ class Sequence: eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. lora_request: LoRA request. prompt_adapter_request: Prompt Adapter request. - from_decoder_prompt: Construct Sequence from SingletonInputs decoder - prompt (True) or encoder prompt (False.) Must be - True for decoder-only model. - """ def __init__( self, seq_id: int, - inputs: "SingletonInputs", + inputs: SingletonInputs, block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - from_decoder_prompt: bool = True, ) -> None: self.seq_id = seq_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.from_decoder_prompt = from_decoder_prompt - - # For decoder-only models, a Sequence is constructed - # from an DecoderOnlyInputs instance (the `inputs` arg.) - # - # For encoder/decoder models the same `inputs` - # instance could be utilized to construct either an - # encoder sequence or a decoder sequence, because - # `DecoderOnlyInputs` has both decoder- and encoder-oriented - # member variables (i.e. it encapsulates both an encoder - # and a decoder prompt.) The decision of which type of sequence - # to generate is determined by the `from_decoder_prompt` argument. 
- # - # When constructing a encoder sequence - # (`from_decoder_prompt` False) it matters that - # the `DecoderOnlyInputs` instance stored in `inputs` is valid - # in the sense that its encoder-related member variables are - # populated; below, an exception is raised if this is - # not the case. - # - # When constructing a decoder sequence (`from_decoder_prompt` True) - # it does not matter whether `inputs` has its encoder-related - # member variables populated. - if not (from_decoder_prompt or is_encoder_decoder_inputs(inputs)): - raise ValueError("Cannot extract encoder input prompt from " - f"invalid input {inputs}; did you forget the " - "encoder input prompt fields?") self.data = SequenceData.from_seqs(self.prompt_token_ids) self.output_logprobs: SampleLogprobs = [] @@ -466,43 +437,29 @@ def __init__( def n_blocks(self) -> int: return (self.get_len() + self.block_size - 1) // self.block_size - @cached_property + @property def prompt(self) -> Optional[str]: - # Select decoder or encoder input prompt str, as appropriate - prompt_key: str = ("prompt" - if self.from_decoder_prompt else "encoder_prompt") + return self.inputs.prompt - return cast(Optional[str], self.inputs.get(prompt_key)) - - @cached_property + @property def prompt_token_ids(self) -> List[int]: - # Select decoder or encoder input prompt token ids, as appropriate - prompt_token_ids_key: str = ("prompt_token_ids" - if self.from_decoder_prompt else - "encoder_prompt_token_ids") + return self.inputs.prompt_token_ids - # Cache computed prompt token ids - return cast(List[int], self.inputs.get(prompt_token_ids_key)) + @property + def prompt_embeds(self) -> Optional[torch.Tensor]: + return self.inputs.prompt_embeds @property def multi_modal_data(self) -> "MultiModalDataDict": - inputs = self.inputs - - if (inputs.get("multi_modal_data") - and inputs.get("encoder_multi_modal_data")): - raise ValueError( - "Multi-modal data in both encoder and decoder is not supported." - ) + return self.inputs.multi_modal_data - return cast( - "MultiModalDataDict", - (inputs.get("multi_modal_data") - or inputs.get("encoder_multi_modal_data") or {}), - ) + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.inputs.multi_modal_placeholders @property def mm_processor_kwargs(self) -> Dict[str, Any]: - return self.inputs.get("mm_processor_kwargs") or {} + return self.inputs.mm_processor_kwargs @property def lora_int_id(self) -> int: @@ -728,9 +685,13 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]: if self.encoder_seq is not None else None) @property - def multi_modal_data(self) -> "MultiModalDataDict": + def multi_modal_data(self) -> MultiModalDataDict: return self.first_seq.multi_modal_data + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.first_seq.multi_modal_placeholders + @property def mm_processor_kwargs(self) -> Dict[str, Any]: return self.first_seq.mm_processor_kwargs @@ -921,7 +882,7 @@ class SequenceGroupMetadata( multi_modal_data: Multi modal data. mm_processor_kwargs: Multimodal input processor / mapper overrides. encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None + (SequenceGroup.encoder_seq). Should be None unless you are working with an encoder/decoder model. cross_block_table: Optional cross-attention block table associated @@ -946,6 +907,7 @@ class SequenceGroupMetadata( # "MultiModalDataDict" types. We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. 
multi_modal_data: Optional[Any] = None + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[List[int]] = None @@ -1164,7 +1126,7 @@ def get_all_seq_ids_and_request_ids( sequence ids. """ seq_ids: List[int] = [] - request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set) + request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set) for sg in seq_group_metadata_list: for seq_id in sg.seq_data: seq_ids.append(seq_id) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 59e71cc8deb48..6a7929d9d8f9c 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -90,7 +90,7 @@ def score_proposals( else: # Batch has a mix of spec decode enabled and disabled seq groups contracted = self._contract_batch( - contracted_bs=len(execute_model_req.seq_group_metadata_list), + execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, num_scoring_tokens=num_scoring_tokens, @@ -126,7 +126,7 @@ def _expand_batch( split_batch_by_proposal_len( seq_group_metadata_list, proposal_lens_list) - target_seq_group_metadata_list = self._create_scoring_model_input( + spec_expanded_seqs = self._create_scoring_model_input( seq_group_metadata_list=spec_seqs, proposal_token_ids=proposal_token_ids_list, # NOTE: We determine the seq ids in the expanded batch using the @@ -135,16 +135,19 @@ def _expand_batch( seq_ids=get_all_seq_ids(seq_group_metadata_list)), ) - num_scoring_tokens = len(target_seq_group_metadata_list) - target_seq_group_metadata_list.extend(non_spec_seqs) + num_scoring_tokens = len(spec_expanded_seqs) + # Batch speculative and non-speculative (e.g. chunked prefill) requests + # but make sure order is prefill|decode due to backend requirement. + target_seq_group_metadata_list = non_spec_seqs + spec_expanded_seqs return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) def _contract_batch( - self, contracted_bs: int, target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: List[int], spec_indices: List[int], k: int + self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], k: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Contract the expanded batch back into its original size. @@ -154,6 +157,7 @@ def _contract_batch( contracted_bs is the original batch size, and the batch size that the target_sampler_output will be contracted to. """ + contracted_bs = len(contracted_seq_group_metadata_list) (target_token_ids, target_probs, target_logprobs, target_hidden_states, non_spec_target_token_ids, non_spec_target_probs, non_spec_target_logprobs, @@ -166,8 +170,8 @@ def _contract_batch( # The number of tokens in the expanded batch used for speculation is # equal to the total expanded batch size minus the number of samples for - # non-speculative sequences. 
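# A minimal illustrative sketch, not taken from this diff: the batch
# arithmetic behind the contraction above, using the prefill|decode ordering
# established in _expand_batch. One non-speculative prefill contributes a
# single sample, while each of the two speculative decodes contributes
# k + 1 scored tokens that are reshaped back per sequence.
import torch

k, spec_bs, non_spec_bs = 3, 2, 1
num_scoring_tokens = spec_bs * (k + 1)
flat_token_ids = torch.arange(non_spec_bs + num_scoring_tokens)

split_sizes = (flat_token_ids.numel() - num_scoring_tokens, num_scoring_tokens)
non_spec_tokens, spec_tokens = flat_token_ids.split(split_sizes)

print(non_spec_tokens.tolist())             # [0]
print(spec_tokens.reshape(spec_bs, k + 1))  # two rows of k + 1 scored tokens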
- non_spec_expanded_bs = len(non_spec_target_token_ids) + # non-speculative sequences, prefill chunks with no out tokens included + non_spec_expanded_bs = len(non_spec_indices) spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs target_token_ids = target_token_ids.reshape(spec_expanded_bs, k + 1) @@ -191,7 +195,12 @@ def _contract_batch( else: all_hidden_states = None - if non_spec_indices: + # Rule out prefills that produce no tokens. + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + if len(non_spec_indices): all_tokens[non_spec_indices, :1] = \ non_spec_target_token_ids.unsqueeze(1) all_probs[non_spec_indices, :1, :] = \ @@ -290,9 +299,6 @@ def _create_target_seq_group_metadata( This function creates K+1 target SequenceGroupMetadata to take advantage of the bonus token. """ - assert not input_seq_group_metadata.is_prompt, ( - "Speculating on " - "prompts not yet supported") assert len(input_seq_group_metadata.seq_data) == 1, ( "Beam search " "not supported in speculative decoding") @@ -390,27 +396,22 @@ def _split_scoring_output( # and non spec sequences) and should be removed in the future. It can be # done by supporting per-sequence proposal lens. # - # First samples are from speculative scoring, latter samples are non- - # speculative samples. - split_sizes = (num_scoring_tokens, - sampler_output.sampled_token_ids.numel() - - num_scoring_tokens) - (spec_probs, non_spec_probs - ) = sampler_output.sampled_token_probs.split(split_sizes) - (spec_sampled_tokens, non_spec_sampled_tokens + # First samples are non-speculative, latter samples are from speculative + # scoring (prefill|decode order). + split_sizes = (sampler_output.sampled_token_ids.numel() - + num_scoring_tokens, num_scoring_tokens) + (non_spec_probs, + spec_probs) = sampler_output.sampled_token_probs.split(split_sizes) + (non_spec_sampled_tokens, spec_sampled_tokens ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) - ( - spec_logprobs, - non_spec_logprobs, - ) = sampler_output.logprobs.split(split_sizes) + (non_spec_logprobs, + spec_logprobs) = sampler_output.logprobs.split(split_sizes) if sampler_output.hidden_states is not None: - ( - spec_hidden_states, - non_spec_hidden_states, - ) = sampler_output.hidden_states.split(split_sizes) + (non_spec_hidden_states, spec_hidden_states + ) = sampler_output.hidden_states.split(split_sizes) else: - spec_hidden_states, non_spec_hidden_states = None, None + non_spec_hidden_states, spec_hidden_states = None, None return (spec_sampled_tokens, spec_probs, spec_logprobs, spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 3aa999fcb9ebb..cd4d7eb0e6e4e 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -17,11 +17,8 @@ "Draft model speculative decoding currently only supports" "CUDA and ROCm flash attention backend.") from err -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) from vllm.logger import init_logger -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -49,40 +46,15 @@ class TP1DraftModelRunner(ModelRunner): any broadcasting 
inside execute_model). """ - def __init__( - self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, - ): - if return_hidden_states: + def __init__(self, *args, **kwargs): + if kwargs.get("return_hidden_states"): raise ValueError( "return_hidden_states is not supported for TP1DraftModelRunner." ) - super().__init__( - model_config=model_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - cache_config=cache_config, - load_config=load_config, - lora_config=lora_config, - kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - return_hidden_states=return_hidden_states, - observability_config=observability_config, - ) + super().__init__(*args, **kwargs) + + self.indices_of_seq_with_bonus_tokens = None def _update_sampling_metadata(self, sampling_metadata, num_seqs, num_queries): @@ -189,6 +161,10 @@ def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): # TODO: Add soft-tuning prompt adapter support return not self.prompt_adapter_config + def set_indices_of_seq_with_bonus_tokens(self, + indices_of_seq_with_bonus_tokens): + self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens + @torch.inference_mode() def execute_model( self, @@ -304,7 +280,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **kwargs, ) @@ -314,11 +290,30 @@ def execute_model( model_input.sampling_metadata) # Sample the next token. - outputs.append( - self.model.sample( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - )) + output = self.model.sample( + logits=logits, + sampling_metadata=model_input.sampling_metadata, + ) + outputs.append(output) + + if model_input.attn_metadata.num_prefills == 0 \ + and self.indices_of_seq_with_bonus_tokens is not None: + assert output.sampled_token_ids is not None + # output.sampled_token_ids should be of shape (num_seqs, 1) + nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape + assert num_tokens_per_seq == 1 + count = 0 + for i in range(nums_seqs): + bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ + count] + if i != bonus_seq_idx: + # The following might cause a cpu->gpu sync + # However, the performance impact is negligible as we + # benchmarked on H100. + output.sampled_token_ids[ + i, :] = model_input.input_tokens[bonus_seq_idx] + else: + count += 1 # Prepare inputs for the next step if step != num_steps - 1: diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index f35a8a0ab8be3..cbf793e2043e3 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -21,6 +21,11 @@ def score_proposals( all_proposal_lengths = proposals.proposal_lens.tolist() for i, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): + if all_proposal_lengths[i] == 0: + # Keep prompt seqs untouched (keep computed_tokens for chunks). 
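A self-contained restatement of the index walk in `execute_model` above, using plain Python lists and invented values instead of the runner's tensors: rows of the expanded batch that are not listed in `indices_of_seq_with_bonus_tokens` copy their token from the next listed (bonus) row, while listed rows keep their own sampled token.

sampled = [5, 6, 7, 8]            # one sampled token per expanded row
input_tokens = [50, 60, 70, 80]   # last input token per row
bonus_indices = [1, 3]            # rows that carry the bonus token
count = 0
for i in range(len(sampled)):
    bonus_idx = bonus_indices[count]
    if i != bonus_idx:
        # Mirror the non-bonus row's token from its bonus counterpart.
        sampled[i] = input_tokens[bonus_idx]
    else:
        count += 1
assert sampled == [60, 6, 80, 8]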
+ target_seq_group_metadata_list.append(seq_group_metadata) + continue + seq_data_dict = seq_group_metadata.seq_data assert len(seq_data_dict) == 1 seq_id = next(iter(seq_data_dict.keys())) @@ -40,8 +45,7 @@ def score_proposals( new_seq_data.update_num_computed_tokens( len(prompt_token_ids) + len(output_token_ids) - 1) - # Ensure that the new sequence has at least one token - # because we only use mqa scorer in the decoding stage. + # Ensure that the new decode sequence has at least one token. assert len(output_token_ids) >= 1 new_seq_data_dict = {target_seq_id: new_seq_data} @@ -54,7 +58,6 @@ def score_proposals( target_seq_id: seq_group_metadata.block_tables[seq_id], }, lora_request=None, - token_chunk_size=1, ) target_seq_group_metadata_list.append(new_seq_group_metadata) @@ -77,6 +80,7 @@ def score_proposals( all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) else: + # We either have decodes with different lens or prefill+decodes. all_tokens = target_token_ids.new_full(size=(bs, k + 1), fill_value=-1) all_probs = target_probs.new_zeros(*all_tokens.shape, @@ -85,15 +89,18 @@ def score_proposals( fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() start_loc = 0 - for i, proposed_len in enumerate(all_proposal_lengths): - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + for i, (proposed_len, seq_meta) in enumerate( + zip(all_proposal_lengths, target_seq_group_metadata_list)): + # Skip chunks with no output tokens. + if seq_meta.do_sample: + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc hidden_states = None if target_sampler_output.hidden_states is not None: diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py index 4b53fbe056c47..f49b98f5c9528 100644 --- a/vllm/spec_decode/multi_step_worker.py +++ b/vllm/spec_decode/multi_step_worker.py @@ -81,6 +81,8 @@ def sampler_output( # Here we run the draft_model_runner with multi-step prepare # on the GPU directly expanded_request.num_steps = sample_len + self.model_runner.set_indices_of_seq_with_bonus_tokens( + indices_of_seq_with_bonus_tokens) model_outputs = self.execute_model( execute_model_req=expanded_request) else: @@ -97,7 +99,8 @@ def sampler_output( model_output = model_output[0] self._append_new_tokens( - model_output, expanded_request.seq_group_metadata_list) + model_output, expanded_request.seq_group_metadata_list, + indices_of_seq_with_bonus_tokens) model_outputs.append(model_output) filtered_model_outputs = self._filter_model_output( @@ -221,13 +224,15 @@ def get_spec_proposals( @staticmethod def _append_new_tokens( model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: + seq_group_metadata_list: List[SequenceGroupMetadata], + indices_of_seq_with_bonus_tokens: List[int]) -> None: """Given model output from a single run, append the tokens to the sequences. This is normally done outside of the worker, but it is required if the worker is to perform multiple forward passes. 
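The else-branch above scatters a flat tensor of scored tokens into a dense `(batch, k + 1)` tensor, padding unused slots with -1 because sequences can have different proposal lengths. A small sketch of that layout with invented values (one speculative decode and one plain decode):

import torch

k = 3
proposal_lens = [3, 0]                             # spec decode, plain decode
flat_tokens = torch.tensor([11, 12, 13, 14, 21])   # (3 + 1) + (0 + 1) scored tokens
all_tokens = flat_tokens.new_full((len(proposal_lens), k + 1), -1)
start = 0
for i, plen in enumerate(proposal_lens):
    out_len = plen + 1
    all_tokens[i, :out_len] = flat_tokens[start:start + out_len]
    start += out_len
assert all_tokens.tolist() == [[11, 12, 13, 14], [21, -1, -1, -1]]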
""" - for seq_group_metadata, sequence_group_outputs in zip( - seq_group_metadata_list, model_output): + count = 0 + for index, (seq_group_metadata, sequence_group_outputs) in enumerate( + zip(seq_group_metadata_list, model_output)): seq_group_metadata.is_prompt = False for seq_output in sequence_group_outputs.samples: @@ -237,6 +242,16 @@ def _append_new_tokens( token_id = seq_output.output_token token_logprob = seq_output.logprobs[token_id] + # Determine the actual token ID to be generated, + # considering bonus tokens + if index != indices_of_seq_with_bonus_tokens[count]: + bonus_seq_metadata = seq_group_metadata_list[ + indices_of_seq_with_bonus_tokens[count]] + _, bonus_token_seq_data = next( + iter(bonus_seq_metadata.seq_data.items())) + token_id = bonus_token_seq_data.output_token_ids[-1] + else: + count += 1 seq.append_token_id(token_id, token_logprob.logprob) seq.update_num_computed_tokens(1) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index a777e5c3f22a7..debb3b2d5ec30 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -21,7 +21,7 @@ class NGramWorker(NonLLMProposerWorkerBase): def __init__(self, *args, **kwargs): # Get local_rank/vocab_size from kwargs attribute self.local_rank = kwargs["local_rank"] - self.vocab_size = kwargs["model_config"].get_vocab_size() + self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size() # Lazy initialization list. self._proposer: Top1Proposer diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 9f7ef2f8d851c..b57742c2ebfdd 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1,10 +1,11 @@ +import copy from collections import defaultdict from functools import cached_property from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch -from vllm.config import ParallelConfig, SpeculativeConfig +from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict from vllm.logger import init_logger from vllm.model_executor.layers.rejection_sampler import RejectionSampler @@ -45,8 +46,8 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": """Helper method that is the entrypoint for Executors which use WorkerWrapper. It constructs a SpecDecodeWorker from the speculative config. """ - assert "speculative_config" in kwargs - speculative_config: SpeculativeConfig = kwargs.get("speculative_config") + vllm_config: VllmConfig = kwargs.get("vllm_config") + speculative_config: SpeculativeConfig = vllm_config.speculative_config assert speculative_config is not None draft_worker_kwargs = kwargs.copy() @@ -58,14 +59,20 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": target_worker.model_runner.disable_logprobs =\ speculative_config.disable_logprobs + draft_worker_config = copy.deepcopy(vllm_config) + draft_worker_config.model_config = speculative_config.draft_model_config + draft_worker_config.quant_config = VllmConfig._get_quantization_config( + draft_worker_config.model_config, + vllm_config.load_config, + ) + draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa + # TODO allow draft-model specific load config. + # Override draft-model specific worker args. 
draft_worker_kwargs.update( - model_config=speculative_config.draft_model_config, - parallel_config=speculative_config.draft_parallel_config, + vllm_config=draft_worker_config, ngram_prompt_lookup_max=speculative_config.ngram_prompt_lookup_max, ngram_prompt_lookup_min=speculative_config.ngram_prompt_lookup_min, - # TODO allow draft-model specific load config. - #load_config=load_config, ) spec_decode_worker = SpecDecodeWorker.create_worker( @@ -134,29 +141,27 @@ def create_worker( draft_worker_kwargs.pop("ngram_prompt_lookup_max")) ngram_prompt_lookup_min = ( draft_worker_kwargs.pop("ngram_prompt_lookup_min")) + draft_model_config = draft_worker_kwargs["vllm_config"].model_config + draft_parallel_config: ParallelConfig = draft_worker_kwargs[ + 'vllm_config'].parallel_config if ngram_prompt_lookup_max > 0: proposer_worker = NGramWorker(**draft_worker_kwargs) proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, ngram_prompt_lookup_max) else: - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'parallel_config'] draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - if draft_worker_kwargs[ - "model_config"].hf_config.model_type == "mlp_speculator": + if draft_model_config.hf_config.model_type == "mlp_speculator": proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_worker_kwargs[ - "model_config"].hf_config.model_type == "medusa": + elif draft_model_config.hf_config.model_type == "medusa": proposer_worker = MedusaWorker(**draft_worker_kwargs) else: if draft_tp == 1: draft_worker_kwargs[ "model_runner_cls"] = TP1DraftModelRunner else: - if draft_worker_kwargs[ - "model_config"].hf_config.model_type == "eagle": + if draft_model_config.hf_config.model_type == "eagle": raise NotImplementedError( "EAGLE does not support TP > 1 yet") @@ -190,8 +195,8 @@ def create_worker( "[Speculative Decoding] Disabling MQA scorer as the " "MQA is only available with flash attn backend.") - if "model_config" in draft_worker_kwargs and \ - draft_worker_kwargs["model_config"].max_model_len < \ + if draft_model_config and \ + draft_model_config.max_model_len < \ scorer_worker.model_config.max_model_len: disable_mqa_scorer = True logger.info( @@ -413,7 +418,10 @@ def execute_model( # none of the requests in the batch have spec decoding enabled. # In any of these cases, the proposer and scorer workers # are called normally. - no_spec = num_lookahead_slots == 0 or disable_all_speculation or all( + # We expect `num_speculative_tokens` to be None for prefills. + no_spec = all( + sgm.is_prompt for sgm in execute_model_req.seq_group_metadata_list + ) or num_lookahead_slots == 0 or disable_all_speculation or all( sgm.num_speculative_tokens == 0 for sgm in execute_model_req.seq_group_metadata_list) @@ -479,7 +487,7 @@ def _maybe_disable_speculative_tokens( def _serialize_sampler_output_no_logprobs( self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> SamplerOutput: + sampler_output: SamplerOutput) -> List[SamplerOutput]: """ Creates and returns a `SamplerOutput` with only the token IDs being serialized to CPU and populated in `CompletionSequenceGroupOutput`. 
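The widened `no_spec` condition above skips speculation entirely for pure-prefill batches. A compact, hedged restatement with hypothetical stand-in objects (not vLLM's `SequenceGroupMetadata`):

from dataclasses import dataclass
from typing import Optional

@dataclass
class SeqMeta:                       # hypothetical stand-in type
    is_prompt: bool
    num_speculative_tokens: Optional[int]

def should_skip_spec(seqs, num_lookahead_slots, disable_all_speculation):
    return (all(s.is_prompt for s in seqs)
            or num_lookahead_slots == 0
            or disable_all_speculation
            or all(s.num_speculative_tokens == 0 for s in seqs))

assert should_skip_spec([SeqMeta(True, None)], 4, False)    # prefill-only batch
assert not should_skip_spec([SeqMeta(False, 2)], 4, False)  # decode with k=2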
@@ -509,41 +517,56 @@ def _serialize_sampler_output_no_logprobs( if any(seq_output_prompt_logprobs) else \ sampler_output.sampled_token_ids).tolist() - seq_data_entries = ( + seq_data_entries = [ (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - ) + if sg.do_sample # ignore empty token sequences + ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] - for index, ((seq_id, seq_data), needs_prompt_logprobs) in \ - enumerate(zip(seq_data_entries, seq_output_prompt_logprobs)): - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, + output_index = 0 + # Make sure the non-terminal prefill chunks are still aligned with + # their own empty output. + for seq_group_meta in execute_model_req.seq_group_metadata_list: + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. + if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput(samples=[], + prompt_logprobs=None)) + else: + # Sequence with output. + seq_id, seq_data = seq_data_entries[output_index] + needs_prompt_logprobs = seq_output_prompt_logprobs[ + output_index] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) + # no prompt logprobs for the first token + for p_token_id in prompt_token_ids[1:] + ] + else: + prompt_logprobs = None + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], token_id_logprob_rank=-1, token_id_logprob=0.0, + seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[index][0], - token_id_logprob_rank=-1, - token_id_logprob=0.0, - seq_id=seq_id, - topk_token_ids=[], - topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - return SamplerOutput(outputs=completion_seq_group_output_list) + prompt_logprobs=prompt_logprobs)) + output_index += 1 + + return [SamplerOutput(outputs=completion_seq_group_output_list)] @nvtx_range("spec_decode_worker._run_no_spec") def _run_no_spec(self, execute_model_req: ExecuteModelRequest, @@ -563,6 +586,9 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, hidden_states = sampler_output.hidden_states if hidden_states is not None: # remove hidden_states for prompt tokens + # TODO Enable `return_hidden_states`: prefill chunks hidden states + # are pruned by the logits processor. Also, they should be arranged + # back into full-prefill latent. Address it to enable MLPSpeculator. if any(seq.is_prompt for seq in execute_model_req.seq_group_metadata_list): hidden_states = hidden_states[ @@ -588,14 +614,14 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( execute_model_req=execute_model_req, sampler_output=sampler_output) if self._disable_logprobs else - sampler_output) + [sampler_output]) # Clear device tensors from sampler output. 
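The alignment loop above guarantees one `CompletionSequenceGroupOutput` per sequence group even when non-terminal prefill chunks produce no sampled token. A simplified sketch of the bookkeeping with made-up token ids, where `None` stands in for the empty output:

do_sample_flags = [False, True, True]   # chunked prefill, then two decodes
sampled_tokens = [101, 202]             # one entry per sampled group only
outputs, out_idx = [], 0
for do_sample in do_sample_flags:
    if not do_sample:
        outputs.append(None)            # placeholder for a non-terminal chunk
    else:
        outputs.append(sampled_tokens[out_idx])
        out_idx += 1
assert outputs == [None, 101, 202]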
This reduces communication # overhead when the engine runs in a different process than the workers. sampler_output.sampled_token_probs = None sampler_output.sampled_token_ids = None sampler_output.logprobs = None - return [sampler_output_to_return] + return sampler_output_to_return def _run_non_driver_rank(self) -> bool: """Run proposer and verifier model in non-driver workers. This is used @@ -639,9 +665,15 @@ def _run_speculative_decoding_step( This invokes the proposer worker to get k speculative tokens for each sequence, then scores each speculative token using the scoring worker. + When `enable_chunked_prefill` is set, scorer will batch decodes and + prefills, while proposer will sync its KV-cache by running an extra + forward on prefills. + Returns a list of SamplerOutput, each containing a single token per sequence. """ + # With prefill chunking, expect requests to have prompts first + # so that backend gets prefill|decode. assert num_lookahead_slots == execute_model_req.num_lookahead_slots # Pass last hidden states from target model to proposer @@ -666,6 +698,25 @@ def _run_speculative_decoding_step( proposals, ) + _, (non_spec_seqs, non_spec_indices) = split_batch_by_proposal_len( + execute_model_req.seq_group_metadata_list, proposals.proposal_lens) + # With prefill chunking enabled, `non_spec_seqs` contains prefills too: + # discard decodes that have already been processed by proposer. + non_spec_indices = [ + idx for idx in non_spec_indices + if execute_model_req.seq_group_metadata_list[idx].is_prompt + ] + if len(non_spec_indices): + all_hidden_states = proposal_scores.hidden_states + # TODO fix `return_hidden_states`, same as in `_run_no_spec` + if all_hidden_states is not None: + prefill_hidden_states = all_hidden_states[non_spec_indices] + execute_model_req.previous_hidden_states = \ + prepare_prefill_hidden_states(prefill_hidden_states) + # Sync proposer KV cache for prefills. + prefill_req = execute_model_req.clone(non_spec_seqs) + self.proposer_worker.execute_model(prefill_req) + with Timer() as verification_timer: accepted_token_ids, target_logprobs = self._verify_tokens( execute_model_req.seq_group_metadata_list, proposal_scores, @@ -764,7 +815,6 @@ def _verify_tokens( self.previous_hidden_states = HiddenStates( hidden_states, seq_group_metadata_list, second_last_token_hidden_states) - return accepted_token_ids, logprobs def _create_output_sampler_list( @@ -814,6 +864,8 @@ def _create_output_sampler_list( accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() # Construct the output on a per-step, per-sequence basis. + # Non-terminal prefill chunks will end up here as rows with just -1s + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 @@ -856,7 +908,6 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. 
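The mixed-batch comment above shows rows of accepted token ids in which -1 marks slots that emit no token (rejected speculative positions or non-terminal prefill chunks). A tiny sketch of how all-empty rows can be recognized, reusing the example from that comment:

accepted = [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]]
empty_rows = [i for i, row in enumerate(accepted) if all(t == -1 for t in row)]
assert empty_rows == [2, 3]        # e.g. prefill chunks with no output yet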
self._maybe_log_stage_times(*stage_times) - return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index 2bb7af7d7c600..e61cde5b17f20 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,8 +1,6 @@ from typing import List, Optional -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -20,35 +18,21 @@ class TargetModelRunner(ModelRunner): requested or not. """ - def __init__(self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - kv_cache_dtype: Optional[str] = "auto", - is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + kv_cache_dtype: Optional[str] = "auto", + is_driver_worker: bool = False, + return_hidden_states: bool = False, + ): # An internal boolean member variable to indicate if token log # probabilities are needed or not. self.disable_logprobs = True super().__init__( - model_config=model_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - cache_config=cache_config, - load_config=load_config, - lora_config=lora_config, + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, return_hidden_states=return_hidden_states, - observability_config=observability_config, ) def prepare_model_input( diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index f6a52a516075d..5a7999a258b2d 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -109,7 +109,6 @@ def get_spec_proposals( proposal_probs=proposal_probs, proposal_lens=proposal_lens, no_proposals=maybe_sampler_output is None) - return proposals def _split_by_proposal_len( @@ -127,9 +126,10 @@ def _split_by_proposal_len( nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] nonzero_proposal_len_indices: List[int] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): - # The speculative decoding for this request has been disabled - # (e.g. due to high traffic). - if seq_group_metadata.num_speculative_tokens == 0: + # The speculative decoding for this request has either been disabled + # (e.g. due to high traffic) or this is a prompt request. 
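With the change above, `_split_by_proposal_len` assigns a zero proposal length both to throttled requests and to prompt (prefill) requests. A hedged restatement with a hypothetical stand-in type, showing the nonzero case simply as `k`:

from dataclasses import dataclass

@dataclass
class SeqGroup:                          # hypothetical stand-in
    is_prompt: bool
    num_speculative_tokens: int

def proposal_len_for(seq: SeqGroup, k: int) -> int:
    # Prompts and throttled requests propose nothing; others keep k here.
    if seq.is_prompt or seq.num_speculative_tokens == 0:
        return 0
    return k

assert proposal_len_for(SeqGroup(True, 4), k=4) == 0    # prefill
assert proposal_len_for(SeqGroup(False, 0), k=4) == 0   # spec disabled
assert proposal_len_for(SeqGroup(False, 4), k=4) == 4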
+ if (seq_group_metadata.is_prompt + or seq_group_metadata.num_speculative_tokens == 0): proposal_lens.append(0) continue diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 9bd2531d7a15c..054845584c2ef 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,6 +6,9 @@ import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, try_to_load_from_cache) +from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) @@ -19,6 +22,7 @@ # yapf: disable from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, EAGLEConfig, ExaoneConfig, + H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +56,7 @@ "medusa": MedusaConfig, "eagle": EAGLEConfig, "exaone": ExaoneConfig, + "h2ovl_chat": H2OVLChatConfig, "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, "NVLM_D": NVLM_D_Config, @@ -127,14 +132,22 @@ def uses_mrope(config: PretrainedConfig) -> bool: return "mrope_section" in rope_scaling +def is_encoder_decoder(config: PretrainedConfig) -> bool: + """Detect if the model with this config is used as an encoder/decoder.""" + text_config = getattr(config, "text_config", None) + if text_config is not None: + return is_encoder_decoder(text_config) + + return getattr(config, "is_encoder_decoder", False) + + def get_config( model: Union[str, Path], trust_remote_code: bool, revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, - rope_theta: Optional[float] = None, config_format: ConfigFormat = ConfigFormat.AUTO, + token: Optional[str] = None, **kwargs, ) -> PretrainedConfig: # Separate model folder from file path for GGUF models @@ -145,39 +158,43 @@ def get_config( model = Path(model).parent if config_format == ConfigFormat.AUTO: - if is_gguf or file_or_path_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=kwargs.get("token")): + if is_gguf or file_or_path_exists( + model, HF_CONFIG_NAME, revision=revision, token=token): config_format = ConfigFormat.HF elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision, - token=kwargs.get("token")): + token=token): config_format = ConfigFormat.MISTRAL else: # If we're in offline mode and found no valid config format, then # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). 
- file_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=kwargs.get("token")) + file_exists(model, HF_CONFIG_NAME, revision=revision, token=token) raise ValueError(f"No supported config format found in {model}") if config_format == ConfigFormat.HF: config_dict, _ = PretrainedConfig.get_config_dict( - model, revision=revision, code_revision=code_revision, **kwargs) + model, + revision=revision, + code_revision=code_revision, + token=token, + **kwargs, + ) # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] - config = config_class.from_pretrained(model, - revision=revision, - code_revision=code_revision) + config = config_class.from_pretrained( + model, + revision=revision, + code_revision=code_revision, + token=token, + **kwargs, + ) else: try: config = AutoConfig.from_pretrained( @@ -185,6 +202,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, + token=token, **kwargs, ) except ValueError as e: @@ -202,7 +220,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision) + config = load_params_config(model, revision, token=token, **kwargs) else: raise ValueError(f"Unsupported config format: {config_format}") @@ -214,51 +232,199 @@ def get_config( model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) - for key, value in [ - ("rope_scaling", rope_scaling), - ("rope_theta", rope_theta), - ]: - if value is not None: - logger.info( - "Updating %s from %r to %r", - key, - getattr(config, key, None), - value, - ) - config.update({key: value}) - patch_rope_scaling(config) + if trust_remote_code: + maybe_register_config_serialize_by_value() + return config -def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None: +def get_hf_file_to_dict(file_name: str, + model: Union[str, Path], + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + Downloads a file from the Hugging Face Hub and returns + its contents as a dictionary. + + Parameters: + - file_name (str): The name of the file to download. + - model (str): The name of the model on the Hugging Face Hub. + - revision (str): The specific version of the model. + - token (str): The Hugging Face authentication token. + + Returns: + - config_dict (dict): A dictionary containing + the contents of the downloaded file. + """ + file_path = Path(model) / file_name + + if file_or_path_exists(model=model, + config_name=file_name, + revision=revision, + token=token): + + if not file_path.is_file(): + try: + hf_hub_file = hf_hub_download(model, + file_name, + revision=revision) + except (RepositoryNotFoundError, RevisionNotFoundError, + EntryNotFoundError, LocalEntryNotFoundError) as e: + logger.debug("File or repository not found in hf_hub_download", + e) + return None + file_path = Path(hf_hub_file) + + with open(file_path) as file: + return json.load(file) + return None + + +def get_pooling_config(model: str, + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + This function gets the pooling and normalize + config from the model - only applies to + sentence-transformers models. + + Args: + model (str): The name of the Hugging Face model. + revision (str, optional): The specific version + of the model to use. Defaults to 'main'. 
+ + Returns: + dict: A dictionary containing the pooling + type and whether normalization is used. + """ + + modules_file_name = "modules.json" + modules_dict = get_hf_file_to_dict(modules_file_name, model, revision, + token) + + if modules_dict is None: + return None + + pooling = next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Pooling"), + None) + normalize = bool( + next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Normalize"), + False)) + + if pooling: + + pooling_file_name = "{}/config.json".format(pooling["path"]) + pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision, + token) + pooling_type_name = next( + (item for item, val in pooling_dict.items() if val is True), None) + + if pooling_type_name is not None: + pooling_type_name = get_pooling_config_name(pooling_type_name) + + return {"pooling_type": pooling_type_name, "normalize": normalize} + + return None + + +def get_pooling_config_name(pooling_name: str) -> Union[str, None]: + if "pooling_mode_" in pooling_name: + pooling_name = pooling_name.replace("pooling_mode_", "") + + if "_" in pooling_name: + pooling_name = pooling_name.split("_")[0] + + if "lasttoken" in pooling_name: + pooling_name = "last" + + supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN'] + pooling_type_name = pooling_name.upper() + + try: + if pooling_type_name in supported_pooling_types: + return pooling_type_name + except NotImplementedError as e: + logger.debug("Pooling type not supported", e) + return None + return None + + +def get_sentence_transformer_tokenizer_config(model: str, + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + Returns the tokenization configuration dictionary for a + given Sentence Transformer BERT model. + + Parameters: + - model (str): The name of the Sentence Transformer + BERT model. + - revision (str, optional): The revision of the m + odel to use. Defaults to 'main'. + - token (str): A Hugging Face access token. + + Returns: + - dict: A dictionary containing the configuration parameters + for the Sentence Transformer BERT model. + """ + for config_name in [ + "sentence_bert_config.json", + "sentence_roberta_config.json", + "sentence_distilbert_config.json", + "sentence_camembert_config.json", + "sentence_albert_config.json", + "sentence_xlm-roberta_config.json", + "sentence_xlnet_config.json", + ]: + encoder_dict = get_hf_file_to_dict(config_name, model, revision, token) + if encoder_dict: + break + + if not encoder_dict: + return None + + if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")): + return encoder_dict + return None + + +def maybe_register_config_serialize_by_value() -> None: """Try to register HF model configuration class to serialize by value - With trust_remote_code, the config class is typically an instance of a - custom class imported from the HF modules cache. The class will not be - importable in spawned workers by default (and won't exist at all on - other nodes), which breaks serialization of the config. + If trust_remote_code is set, and the model's config file specifies an + `AutoConfig` class, then the config class is typically an instance of + a custom class imported from the HF modules cache. 
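The helpers above derive a pooling type and a normalize flag from sentence-transformers metadata. A minimal sketch of that derivation on an inlined, invented modules.json-style structure (no Hub download), mirroring the string normalization in `get_pooling_config_name`:

modules = [
    {"type": "sentence_transformers.models.Transformer", "path": ""},
    {"type": "sentence_transformers.models.Pooling", "path": "1_Pooling"},
    {"type": "sentence_transformers.models.Normalize", "path": "2_Normalize"},
]
pooling = next((m for m in modules
                if m["type"] == "sentence_transformers.models.Pooling"), None)
normalize = any(m["type"] == "sentence_transformers.models.Normalize"
                for m in modules)
pooling_cfg = {"pooling_mode_mean_tokens": True, "pooling_mode_cls_token": False}
mode_key = next(k for k, v in pooling_cfg.items() if v is True)
# "pooling_mode_mean_tokens" -> "MEAN"
pooling_type = mode_key.replace("pooling_mode_", "").split("_")[0].upper()
assert pooling is not None and normalize and pooling_type == "MEAN"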
+ + Examples: + + >>> from transformers import AutoConfig + >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True) + >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig + >>> import transformers_modules # error, not initialized + >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True) + >>> import transformers_modules # success, initialized + >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config + + In the DeepSeek example, the config class is an instance of a custom + class that is not serializable by default. This class will not be + importable in spawned workers, and won't exist at all on + other nodes, which breaks serialization of the config. In this function we tell the cloudpickle serialization library to pass instances of these generated classes by value instead of by reference, i.e. the class definition is serialized along with its data so that the - class module does not need to be importable on the receiving end. This - registration only works if the modules cache has already been - initialized. - + class module does not need to be importable on the receiving end. See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs - """ - if not trust_remote_code: - return - + """ # noqa try: import transformers_modules except ImportError: - logger.debug("Could not import transformers_modules used for remote" - " code. If remote code is not needed remove" - " `--trust-remote-code`.") + # the config does not need trust_remote_code return try: @@ -271,19 +437,19 @@ class module does not need to be importable on the receiving end. 
This ray.cloudpickle.register_pickle_by_value(transformers_modules) # multiprocessing uses pickle to serialize arguments when using spawn - # Here we get pickle to use cloudpickle to serialize ModelConfig objects + # Here we get pickle to use cloudpickle to serialize config objects # that contain instances of the custom config class to avoid # serialization problems if the generated module (and model) has a `.` # in its name import multiprocessing import pickle - from vllm.config import ModelConfig + from vllm.config import VllmConfig - def _reduce_modelconfig(mc: ModelConfig): - return (pickle.loads, (cloudpickle.dumps(mc), )) + def _reduce_config(config: VllmConfig): + return (pickle.loads, (cloudpickle.dumps(config), )) - multiprocessing.reducer.register(ModelConfig, _reduce_modelconfig) + multiprocessing.reducer.register(VllmConfig, _reduce_config) except Exception as e: logger.warning( @@ -294,20 +460,17 @@ def _reduce_modelconfig(mc: ModelConfig): exc_info=e) -def load_params_config(model, revision) -> PretrainedConfig: +def load_params_config(model: Union[str, Path], + revision: Optional[str], + token: Optional[str] = None, + **kwargs) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format config_file_name = "params.json" - config_path = Path(model) / config_file_name - - if not config_path.is_file(): - config_path = Path( - hf_hub_download(model, config_file_name, revision=revision)) - - with open(config_path, "r") as file: - config_dict = json.load(file) + config_dict = get_hf_file_to_dict(config_file_name, model, revision, token) + assert isinstance(config_dict, dict) config_mapping = { "dim": "hidden_size", @@ -351,6 +514,8 @@ def recurse_elems(elem: Any): config_dict["architectures"] = ["PixtralForConditionalGeneration"] config_dict["model_type"] = "pixtral" + config_dict.update(kwargs) + config = recurse_elems(config_dict) return config diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index f0d79197a82c5..d1e19c9a33c24 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -6,6 +6,7 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
from vllm.transformers_utils.configs.falcon import RWConfig +from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig @@ -22,6 +23,7 @@ "DbrxConfig", "MPTConfig", "RWConfig", + "H2OVLChatConfig", "InternVLChatConfig", "JAISConfig", "MedusaConfig", @@ -33,4 +35,4 @@ "NVLM_D_Config", "SolarConfig", "UltravoxConfig", -] +] \ No newline at end of file diff --git a/vllm/transformers_utils/configs/chatglm.py b/vllm/transformers_utils/configs/chatglm.py index 49d2b8d8e21b1..e563bf6268d72 100644 --- a/vllm/transformers_utils/configs/chatglm.py +++ b/vllm/transformers_utils/configs/chatglm.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Adapted from # https://github.com/THUDM/ChatGLM2-6B from transformers import PretrainedConfig diff --git a/vllm/transformers_utils/configs/exaone.py b/vllm/transformers_utils/configs/exaone.py index 805b8ad930039..f60a59f554133 100644 --- a/vllm/transformers_utils/configs/exaone.py +++ b/vllm/transformers_utils/configs/exaone.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/blob/main/configuration_exaone.py # Copyright 2021 The LG AI Research EXAONE Lab. All rights reserved. diff --git a/vllm/transformers_utils/configs/h2ovl.py b/vllm/transformers_utils/configs/h2ovl.py new file mode 100644 index 0000000000000..b94c5b77e4b7f --- /dev/null +++ b/vllm/transformers_utils/configs/h2ovl.py @@ -0,0 +1,13 @@ +# Adapted from +# https://huggingface.co/h2oai/h2ovl-mississippi-2b/blob/main/configuration_h2ovl_chat.py +# -------------------------------------------------------- +# H2OVL-Mississippi +# Copyright (c) 2024 H2O.AI +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- + +from .internvl import InternVLChatConfig + + +class H2OVLChatConfig(InternVLChatConfig): + model_type = "h2ovl_chat" diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py index b06a946f34a47..82f129eb2018e 100644 --- a/vllm/transformers_utils/configs/jais.py +++ b/vllm/transformers_utils/configs/jais.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright 2023 Cerebras Systems. 
diff --git a/vllm/transformers_utils/configs/mpt.py b/vllm/transformers_utils/configs/mpt.py index 497db0ae48c96..0f047c8b0361c 100644 --- a/vllm/transformers_utils/configs/mpt.py +++ b/vllm/transformers_utils/configs/mpt.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copied from # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py """A HuggingFace-style model configuration.""" @@ -117,10 +116,10 @@ def _validate_config(self) -> None: init_config_defaults) if self.d_model % self.n_heads != 0: raise ValueError('d_model must be divisible by n_heads') - if any(( + if any( prob < 0 or prob > 1 for prob in - [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop] - )): + [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop + ]): raise ValueError( "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are " "probabilities and must be between 0 and 1") diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 139e6b3cdacbe..93fec667d1cf3 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 HuggingFace Inc. team. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # @@ -144,7 +143,7 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - head_dim = head_dim or kwargs.get("kv_channels", None) + head_dim = head_dim or kwargs.get("kv_channels") self.head_dim = head_dim if head_dim is not None else ( hidden_size // num_attention_heads) @@ -160,8 +159,8 @@ def __init__( self.rope_theta = rope_theta self.rope_scaling = rope_scaling # for backward compatibility - partial_rotary_factor = kwargs.get("rope_percent", None) or kwargs.get( - "rope_percentage", None) or partial_rotary_factor + partial_rotary_factor = kwargs.get("rope_percent") or kwargs.get( + "rope_percentage") or partial_rotary_factor self.partial_rotary_factor = partial_rotary_factor self._rope_scaling_validation() self.attention_bias = attention_bias diff --git a/vllm/transformers_utils/configs/solar.py b/vllm/transformers_utils/configs/solar.py index d5113bf01695a..0c1c048f670ee 100644 --- a/vllm/transformers_utils/configs/solar.py +++ b/vllm/transformers_utils/configs/solar.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 
# # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 9a4149251d747..6a114b513f382 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -25,6 +25,11 @@ def init_tokenizer_from_configs(model_config: ModelConfig, trust_remote_code=model_config.trust_remote_code, revision=model_config.tokenizer_revision) + if (model_config.encoder_config is not None + and "do_lower_case" in model_config.encoder_config): + init_kwargs["do_lower_case"] = model_config.encoder_config[ + "do_lower_case"] + return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 80e21c2d32ecc..1b273c6b120ea 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -16,9 +16,13 @@ from mistral_common.tokens.tokenizers.tekken import (SpecialTokenPolicy, Tekkenizer) +from vllm.logger import init_logger + if TYPE_CHECKING: from vllm.entrypoints.chat_utils import ChatCompletionMessageParam +logger = init_logger(__name__) + @dataclass class Encoding: @@ -68,24 +72,26 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: self.instruct = tokenizer.instruct_tokenizer tokenizer_ = tokenizer.instruct_tokenizer.tokenizer - if isinstance(tokenizer_, Tekkenizer): + self.is_tekken = isinstance(tokenizer_, Tekkenizer) + self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer) + if self.is_tekken: # Make sure special tokens will not raise tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE - - self._vocab = { - token: idx - for idx, token in enumerate(tokenizer_.vocab()) - } - elif isinstance(tokenizer_, SentencePieceTokenizer): - self._vocab = { - token: idx - for idx, token in enumerate(tokenizer_.vocab()) - } + elif self.is_spm: + pass else: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") + self._vocab = tokenizer_.vocab() + # Convert to a Dict[str, int] to match protocol, but this is a lossy + # conversion. 
There may be multiple token ids that decode to the same + # string due to partial UTF-8 byte sequences being converted to � + self._vocab_dict = { + token: idx + for idx, token in enumerate(self._vocab) + } self.tokenizer = tokenizer_ - self._max_token_id = max(self._vocab.values()) + self._max_token_id = self.vocab_size - 1 @classmethod def from_pretrained(cls, @@ -182,7 +188,9 @@ def __call__( return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: - return self._vocab + # NB: the dictionary form of the vocabulary collapses token ids that map + # to the same string but have different bytes + return self._vocab_dict def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary @@ -211,7 +219,7 @@ def apply_chat_template(self, return encoded.tokens def convert_tokens_to_string(self, tokens: List[str]) -> str: - if isinstance(self.tokenizer, Tekkenizer): + if self.is_tekken: tokens = [ t for t in tokens if t not in self.tokenizer._all_special_tokens @@ -220,14 +228,20 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: if any(isinstance(t, bytes) for t in tokens): # we need to encode and decode all tokens again shift = self.tokenizer.num_special_tokens - byte_tokens = [ - t.encode("utf-8") if not isinstance(t, bytes) else t - for t in tokens - ] - ids = [ - self.tokenizer._tekken_token2id_nospecial[t] + shift - for t in byte_tokens - ] + + def _token_to_id(t: str): + t_bytes = t.encode("utf-8") \ + if not isinstance(t, bytes) else t + try: + return shift + \ + self.tokenizer._tekken_token2id_nospecial[t_bytes] + except KeyError: + logger.warning( + "Failed to convert token %s to id," + " replacing with ", t_bytes) + return self.tokenizer.unk_id + + ids = [_token_to_id(t) for t in tokens] decoded = self.tokenizer.decode(ids) else: decoded = "".join(tokens) @@ -236,7 +250,13 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: return decoded - def decode(self, ids: Union[List[int], int]) -> str: + def decode(self, + ids: Union[List[int], int], + skip_special_tokens: bool = True) -> str: + assert ( + skip_special_tokens + ), "skip_special_tokens=False is not supported for Mistral tokenizers." + if isinstance(ids, int): ids = [ids] return self.tokenizer.decode(ids) @@ -249,18 +269,22 @@ def convert_ids_to_tokens( # TODO(Patrick) - potentially allow special tokens to not be skipped assert ( skip_special_tokens - ), "Skipping special tokens is not supported for Mistral tokenizers." + ), "skip_special_tokens=False is not supported for Mistral tokenizers." 
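The comments above note that flattening the Mistral vocabulary into a `Dict[str, int]` is lossy. A tiny illustration with an invented vocabulary: distinct ids whose pieces render as the same string (for example the replacement character produced by partial UTF-8 bytes) collapse to one dictionary key.

vocab_list = ["a", "�", "�", "b"]       # two partial-UTF-8 pieces both render as �
vocab_dict = {tok: idx for idx, tok in enumerate(vocab_list)}
assert len(vocab_list) == 4 and len(vocab_dict) == 3
assert vocab_dict["�"] == 2             # the later id silently wins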
+ + assert self.is_tekken or self.is_spm, type(self.tokenizer) - assert isinstance(self.tokenizer, - (Tekkenizer, SentencePieceTokenizer)), type( - self.tokenizer) + if self.is_tekken: + # skip special tokens + ids = [i for i in ids if i > self.tokenizer.num_special_tokens] tokens = [self.tokenizer.id_to_piece(id) for id in ids] - if any(t.strip() == "�" for t in tokens): - # if any stripped decoded token is undefined - # because it's invalid unicode then pass bytes + if any("�" in t for t in tokens) and self.is_tekken: + # if a decoded token contains the replacement character, then the + # token has an incomplete UTF-8 character so we must use bytes # See: https://github.com/vllm-project/vllm/pull/8640 + # https://github.com/vllm-project/vllm/pull/9625 + # if underlying tokenizeir is sentencepiece, we just add "�" tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids] return tokens diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 3f57c22e1f2e4..568185383aa5c 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -6,6 +6,5 @@ from vllm.triton_utils.custom_cache_manager import ( maybe_set_triton_cache_manager) - from vllm.triton_utils.libentry import libentry - __all__ += ["maybe_set_triton_cache_manager", "libentry"] + __all__ += ["maybe_set_triton_cache_manager"] diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py deleted file mode 100644 index 4335c7adfc13b..0000000000000 --- a/vllm/triton_utils/libentry.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copied From https://github.com/FlagOpen/FlagGems - -import inspect - -import triton - - -class LibEntry(triton.KernelInterface): - - def __init__( - self, - fn, - ): - self.fn = fn - self.arg_names = fn.arg_names - self.divisibility = 16 - self.kernel_cache = dict() - fn = self.fn - while not isinstance(fn, triton.runtime.JITFunction): - fn = fn.fn - self.jit_function: triton.runtime.JITFunction = fn - self.specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and not p.do_not_specialize - ] - self.do_not_specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and p.do_not_specialize - ] - - def key(self, spec_args, dns_args, const_args): - spec_key = [(arg.dtype, arg.data_ptr() % - self.divisibility == 0) if hasattr(arg, "data_ptr") else - (type(arg), arg) for arg in spec_args] - dns_key = [ - arg.dtype if hasattr( - arg, "data_ptr") else type(arg) if not isinstance(arg, int) - else "i32" if arg >= -(2**31) and arg <= 2**31 - - 1 else "u64" if arg >= 2**63 and arg <= 2**64 - 1 else "i64" - for arg in dns_args - ] - # const args passed by position - return tuple(spec_key + dns_key + const_args) - - def run(self, *args, **kwargs): - grid = kwargs["grid"] - # collect all the arguments - spec_args = [] # specialize arguments - dns_args = [] # do not specialize arguments - const_args = [] # constexpr arguments - k_args = [] # kernel arguments - for i, arg in enumerate(args): - if i in self.specialize_indices: - k_args.append(arg) - spec_args.append(arg) - elif i in self.do_not_specialize_indices: - k_args.append(arg) - dns_args.append(arg) - else: - const_args.append(arg) - for p in self.jit_function.params[len(args):]: - if p.name in kwargs: - val = kwargs[p.name] - elif p.default is inspect._empty: - continue - else: - val = p.default - - if p.is_constexpr: - const_args.append(val) - elif p.do_not_specialize: - dns_args.append(val) - k_args.append(val) - else: - spec_args.append(val) - 
k_args.append(val) - - entry_key = self.key(spec_args, dns_args, const_args) - - if entry_key not in self.kernel_cache: - # compile the kernel also completes the related computations - kernel = self.fn.run(*args, **kwargs) - fn = self.fn - # collect constexpr arguments for grid computation - constexprs = {} - while not isinstance(fn, triton.runtime.JITFunction): - if isinstance(fn, triton.runtime.Autotuner): - config = fn.best_config - constexprs["num_warps"] = config.num_warps - constexprs["num_stages"] = config.num_stages - constexprs["num_ctas"] = config.num_ctas - constexprs = {**constexprs, **config.kwargs} - elif isinstance(fn, triton.runtime.Heuristics): - for v, heur in fn.values.items(): - constexprs[v] = heur({ - **dict(zip(fn.arg_names, args)), - **kwargs, - **constexprs, - }) - else: - raise RuntimeError("Invalid Runtime Function") - fn = fn.fn - # In vLLM, certain kernels like fused_moe_kernel get the - # best_config(as kwargs) from a configuration json file, rather - # than using Autotuner & Heuristics. Therefore, all their constexprs - # (tl.constexpr) are assigned values through the following loop. - for p in self.jit_function.params: - if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default #default=inspect._empty - self.kernel_cache[entry_key] = (kernel, constexprs) - else: - # load kernel from cache directly - kernel, constexprs = self.kernel_cache[entry_key] - - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - # 4. We must filter out captured args with default value firstly - constexprs = { - k: v - for k, v in constexprs.items() if v is not inspect._empty - } - meta = { - **dict(zip(self.arg_names, args)), - **kwargs, - **constexprs, - } - grid = grid(meta) - if isinstance(grid, tuple): - grid = grid + (1, 1) - elif isinstance(grid, list): - grid = grid + [1, 1] - kernel[grid[0:3]](*k_args) - # maintaining the same return type as the JITFunction.run - return kernel - - -def libentry(): - """ - Decorator for triton library entries. - Motivation: - The runtime overhead of Triton kernels is the reason for the lower - performance of small kernels, particularly evident with smaller models. - Using this decorator can reduce Triton runtime overhead. - How: - The `run` function of JITFunction needs to accomplish: - - Parameter binding using inspect - - KernelArg type wrapping - - Cache key calculation - When dealing with small size, these steps can become bottlenecks in - Triton runtime. Libentry simplifies these steps to reduce runtime - overhead, thereby improving the runtime expenses of small kernels. 
- NOTE: - When Triton is upgraded to version 3.0.0, libentry can be removed, - see: https://github.com/vllm-project/vllm/pull/5036#issuecomment-2243396245 - - - """ - - def decorator(fn): - return LibEntry(fn) - - return decorator diff --git a/vllm/utils.py b/vllm/utils.py index 03cdbe6a0dc7b..1b02cbff79f78 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -4,6 +4,7 @@ import datetime import enum import gc +import getpass import inspect import ipaddress import os @@ -32,6 +33,7 @@ import torch.types import yaml from packaging.version import Version +from torch.library import Library from typing_extensions import ParamSpec, TypeIs, assert_never import vllm.envs as envs @@ -79,17 +81,14 @@ "currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers is the only backend " - "currently supported with encoder/" +STR_NOT_IMPL_ENC_DEC_BACKEND = ("XFormers and Flash-Attention are the only " + "backends currently supported with encoder/" "decoder models.") STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER = ("Prompt adapters are not " "currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_CPU = ("CPU is not currently supported with " - "encoder/decoder models.") - # Efficiently import all enc/dec error strings # rather than having to import all of the above STR_NOT_IMPL_ENC_DEC_ERR_STRS = { @@ -104,7 +103,6 @@ "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, - "STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU } # Constants related to forcing the attention backend selection @@ -727,6 +725,9 @@ def is_pin_memory_available() -> bool: elif current_platform.is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False + elif current_platform.is_hpu(): + print_warning_once("Pin memory is not supported on HPU.") + return False elif current_platform.is_cpu() or current_platform.is_openvino(): return False return True @@ -967,6 +968,8 @@ def enable_trace_function_call_for_thread() -> None: if envs.VLLM_TRACE_FUNCTION: tmp_dir = tempfile.gettempdir() + # add username to tmp_dir to avoid permission issues + tmp_dir = os.path.join(tmp_dir, getpass.getuser()) filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" f"_thread_{threading.get_ident()}_" f"at_{datetime.datetime.now()}.log").replace(" ", "_") @@ -1147,9 +1150,23 @@ def __call__(self, parser, namespace, values, option_string=None): "Expected 'true' or 'false'.") +class SortedHelpFormatter(argparse.HelpFormatter): + """SortedHelpFormatter that sorts arguments by their option strings.""" + + def add_arguments(self, actions): + actions = sorted(actions, key=lambda x: x.option_strings) + super().add_arguments(actions) + + class FlexibleArgumentParser(argparse.ArgumentParser): """ArgumentParser that allows both underscore and dash in names.""" + def __init__(self, *args, **kwargs): + # Set the default 'formatter_class' to SortedHelpFormatter + if 'formatter_class' not in kwargs: + kwargs['formatter_class'] = SortedHelpFormatter + super().__init__(*args, **kwargs) + def parse_args(self, args=None, namespace=None): if args is None: args = sys.argv[1:] @@ -1264,7 +1281,7 @@ def _load_config_file(self, file_path: str) -> List[str]: config: Dict[str, Union[int, str]] = {} try: - with open(file_path, 'r') as config_file: + with open(file_path) as config_file: config = yaml.safe_load(config_file) except Exception as ex: logger.error( @@ 
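The `SortedHelpFormatter` added above alphabetizes options in `--help` output. A hedged, standalone example of the same pattern with throwaway flags:

import argparse

class SortedHelpFormatter(argparse.HelpFormatter):
    # Same idea as the formatter above: sort actions by their option strings.
    def add_arguments(self, actions):
        super().add_arguments(sorted(actions, key=lambda a: a.option_strings))

parser = argparse.ArgumentParser(formatter_class=SortedHelpFormatter)
parser.add_argument("--zeta")
parser.add_argument("--alpha")
print(parser.format_help())   # lists --alpha before --zeta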
-1512,3 +1529,54 @@ def weak_ref_tensors( if isinstance(tensors, tuple): return tuple(weak_ref_tensor(t) for t in tensors) raise ValueError("Invalid type for tensors") + + +def is_in_doc_build() -> bool: + try: + from sphinx.ext.autodoc.mock import _MockModule + return isinstance(torch, _MockModule) + except ModuleNotFoundError: + return False + + +# create a library to hold the custom op +vllm_lib = Library("vllm", "FRAGMENT") # noqa + + +def direct_register_custom_op( + op_name: str, + op_func: Callable, + mutates_args: List[str], + fake_impl: Optional[Callable] = None, + target_lib: Optional[Library] = None, +): + """ + `torch.library.custom_op` can have significant overhead because it + needs to consider complicated dispatching logic. This function + directly registers a custom op and dispatches it to the CUDA backend. + See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5 + for more details. + + By default, the custom op is registered to the vLLM library. If you + want to register it to a different library, you can pass the library + object to the `target_lib` argument. + + IMPORTANT: the lifetime of the operator is tied to the lifetime of the + library object. If you want to bind the operator to a different library, + make sure the library object is alive when the operator is used. + """ + if is_in_doc_build(): + return + import torch.library + if hasattr(torch.library, "infer_schema"): + schema_str = torch.library.infer_schema(op_func, + mutates_args=mutates_args) + else: + # for pytorch 2.4 + import torch._custom_op.impl + schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args) + my_lib = target_lib or vllm_lib + my_lib.define(op_name + schema_str) + my_lib.impl(op_name, op_func, "CUDA") + if fake_impl is not None: + my_lib._register_fake(op_name, fake_impl) diff --git a/vllm/v1/__init__.py b/vllm/v1/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ec07464e6a12a..e73a1e60b2730 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -7,6 +7,7 @@ from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) from vllm.forward_context import get_forward_context +from vllm.utils import direct_register_custom_op from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -50,6 +51,7 @@ class FlashAttentionMetadata: # |-------------------- seq_len ---------------------| # |-- query_len ---| + num_actual_tokens: int # Number of tokens excluding padding. 
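A hedged usage sketch for `direct_register_custom_op` above, with a made-up element-wise op; a real kernel would do device work, and (as in the helper) the op is only registered for the CUDA backend.

import torch
from vllm.utils import direct_register_custom_op

def my_scale(out: torch.Tensor, x: torch.Tensor, factor: float) -> None:
    # Write the result into the caller-provided buffer (out-parameter style).
    out.copy_(x * factor)

def my_scale_fake(out: torch.Tensor, x: torch.Tensor, factor: float) -> None:
    # Shape/dtype inference only; no computation during tracing.
    return

direct_register_custom_op(
    op_name="my_scale",
    op_func=my_scale,
    mutates_args=["out"],
    fake_impl=my_scale_fake,
)
# On CUDA tensors, torch.ops.vllm.my_scale(out, x, 2.0) now calls my_scale.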
max_query_len: int query_start_loc: torch.Tensor max_seq_len: int @@ -133,7 +135,9 @@ def forward( assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") - output = torch.ops.vllm.unified_flash_attention( + output = torch.empty_like(query) + torch.ops.vllm.unified_v1_flash_attention( + output, query, key, value, @@ -152,9 +156,8 @@ def forward( return output -@torch.library.custom_op("vllm::unified_flash_attention", - mutates_args=["kv_cache"]) -def unified_flash_attention( +def unified_v1_flash_attention( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -169,17 +172,17 @@ def unified_flash_attention( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: +) -> None: current_metadata = get_forward_context() if current_metadata is None: # Profiling run. - return torch.empty_like(query) + return assert current_metadata is not None assert isinstance(current_metadata, FlashAttentionMetadata) attn_metadata: FlashAttentionMetadata = current_metadata + num_actual_tokens = attn_metadata.num_actual_tokens - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, num_heads, head_size) key = key.view(-1, num_kv_heads, head_size) @@ -189,18 +192,18 @@ def unified_flash_attention( key_cache = kv_cache[0] value_cache = kv_cache[1] torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - kv_cache[0], - kv_cache[1], + key[:num_actual_tokens], + value[:num_actual_tokens], + key_cache, + value_cache, attn_metadata.slot_mapping, kv_cache_dtype, k_scale, v_scale, ) - output = flash_attn_varlen_func( - q=query, + attn_output = flash_attn_varlen_func( + q=query[:num_actual_tokens], k=key_cache, v=value_cache, cu_seqlens_q=attn_metadata.query_start_loc, @@ -214,11 +217,13 @@ def unified_flash_attention( block_table=attn_metadata.block_table, softcap=logits_soft_cap, ) - return output.view(num_tokens, hidden_size) + attn_output = attn_output.view(num_actual_tokens, -1) + # TODO(woosuk): Optimize this. 
+ output[:num_actual_tokens].copy_(attn_output) -@unified_flash_attention.register_fake -def _( +def unified_v1_flash_attention_fake( + output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, @@ -233,5 +238,13 @@ def _( window_size: Optional[List[int]] = None, alibi_slopes: Optional[torch.Tensor] = None, logits_soft_cap: Optional[float] = None, -) -> torch.Tensor: - return torch.empty_like(query) +) -> None: + return + + +direct_register_custom_op( + op_name="unified_v1_flash_attention", + op_func=unified_v1_flash_attention, + mutates_args=["kv_cache", "output"], + fake_impl=unified_v1_flash_attention_fake, +) diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py new file mode 100644 index 0000000000000..845bd5ea05e3c --- /dev/null +++ b/vllm/v1/core/encoder_cache_manager.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Set, Tuple + +from vllm.v1.request import Request + + +class EncoderCacheManager: + + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + # req_id -> cached input ids + self.cached: Dict[str, Set[int]] = {} + # List of [req_id, input_id] + self.freed: List[Tuple[str, int]] = [] + + def has_cache(self, request: Request, input_id: int) -> bool: + req_id = request.request_id + return req_id in self.cached and input_id in self.cached[req_id] + + def can_allocate(self, request: Request, input_id: int) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + self.cached[req_id] = set() + self.cached[req_id].add(input_id) + self.num_free_slots -= request.get_num_encoder_tokens(input_id) + + def get_cached_input_ids(self, request: Request) -> Set[int]: + return self.cached.get(request.request_id, set()) + + def free(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + return + + self.cached[req_id].discard(input_id) + if len(self.cached[req_id]) == 0: + del self.cached[req_id] + self.num_free_slots += request.get_num_encoder_tokens(input_id) + self.freed.append((req_id, input_id)) + + def get_freed_ids(self) -> List[Tuple[str, int]]: + freed = self.freed + self.freed = [] + return freed diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 9b735a8be10d7..38f1c03a4d3ac 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,9 +1,11 @@ +from collections import defaultdict from typing import Dict, List, Optional -import numpy as np - from vllm.logger import init_logger from vllm.utils import cdiv +from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, + KVCacheBlock, hash_block_tokens, + hash_request_tokens) from vllm.v1.request import Request logger = init_logger(__name__) @@ -36,73 +38,360 @@ def __init__( self.num_preallocate_tokens = num_preallocate_tokens self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size) - self.free_block_ids = list(range(num_gpu_blocks)) - self.req_to_block_ids: Dict[str, List[int]] = {} - self.ref_cnts = np.zeros(num_gpu_blocks, dtype=np.int32) + # A Block pool of all kv-cache blocks. 
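For illustration only, not part of the patch: exercising the EncoderCacheManager added above with a minimal stand-in for vllm.v1.request.Request. The request id, token counts, and cache size are made up for the example.

from vllm.v1.core.encoder_cache_manager import EncoderCacheManager


class _FakeRequest:
    # Only the attributes EncoderCacheManager touches: request_id and
    # get_num_encoder_tokens().
    def __init__(self, request_id, encoder_token_counts):
        self.request_id = request_id
        self._counts = encoder_token_counts

    def get_num_encoder_tokens(self, input_id):
        return self._counts[input_id]


manager = EncoderCacheManager(cache_size=2048)
req = _FakeRequest("req-0", encoder_token_counts=[576, 576])  # e.g. two images

if manager.can_allocate(req, 0):
    manager.allocate(req, 0)            # reserves 576 of the 2048 slots
assert manager.has_cache(req, 0)

manager.free(req, 0)                    # slots returned; (req_id, input_id) queued
assert manager.get_freed_ids() == [("req-0", 0)]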
+ self.block_pool: List[KVCacheBlock] = [ + KVCacheBlock(idx) for idx in range(num_gpu_blocks) + ] + # Free block queue that constructs and manipulates a doubly linked + # list of free blocks (including eviction candidates when caching is + # enabled). + self.free_block_queue = FreeKVCacheBlockQueue(self.block_pool) + + # {block_hash: {block ID: block}}. A cached block is + # a full block with a block hash that can be used for prefix caching. + # The cached block may be used by running requests or in the + # free_block_queue that could potentially be evicted. + # NOTE: We currently don't de-duplicate the blocks in the cache, + # meaning that if a block becomes full and is cached, we don't check + # if there is already an identical block in the cache. This is because + # we want to make sure the allocated block IDs won't change so that + # block tables are append-only. + self.cached_block_hash_to_block: Dict[BlockHashType, Dict[ + int, KVCacheBlock]] = defaultdict(dict) + + # Mapping from request ID to blocks to track the blocks allocated + # for each request, so that we can free the blocks when the request + # is finished. + self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} + + def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: + """Get the computed (cached) blocks for the request. + Note that the computed blocks must be full. - def get_computed_blocks(self, request: Request) -> List[int]: + Args: + request: The request to get the computed blocks. + + Returns: + A list of blocks that are computed for the request. + """ if not self.enable_caching: - # No prefix caching. + # Prefix caching is disabled. return [] - # TODO(woosuk): Implement hash-based caching. - return [] + + computed_blocks = [] + block_hashes = hash_request_tokens(self.block_size, + request.all_token_ids) + + for block_hash in block_hashes: + # block_hashes is a chain of block hashes. If a block hash is not + # in the cached_block_hash_to_id, the following block hashes are + # not computed yet for sure. + if cached_block := self._get_cached_block(block_hash): + computed_blocks.append(cached_block) + else: + break + + return computed_blocks def append_slots( self, request: Request, num_tokens: int, - ) -> Optional[List[int]]: + ) -> Optional[List[KVCacheBlock]]: + """Append slots to the block table of the request. + We first append slots to already allocated blocks. If the allocated + blocks are not enough, we allocate new blocks. + + Args: + request: The request to append slots. + num_tokens: The number of tokens to append. + + Returns: + A list of new blocks if new blocks are allocated, or None + if new blocks are required but cannot be allocated. + """ num_required_blocks = cdiv(request.num_computed_tokens + num_tokens, self.block_size) - req_block_ids = self.req_to_block_ids[request.request_id] - if num_required_blocks <= len(req_block_ids): - # No new block is needed. - return [] + req_blocks = self.req_to_blocks[request.request_id] - num_new_blocks = num_required_blocks - len(req_block_ids) - num_free_blocks = len(self.free_block_ids) - if num_new_blocks > num_free_blocks: - # Cannot allocate new blocks. + num_new_blocks = num_required_blocks - len(req_blocks) + if num_new_blocks > self.free_block_queue.num_free_blocks: + # Need to allocate new blocks due to insufficient pre-allocated + # slots, but we cannot allocate new blocks due to the limit. return None - # Allocate new blocks. + # When caching is enabled, assign token IDs to already allocated blocks. 
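For illustration only, not part of the patch: the lookup behavior of get_computed_blocks above, modeled with plain dicts. Block hashes form a chain (each hash covers its whole prefix), so the walk stops at the first miss. Everything below is a toy stand-in, not vLLM's real data structures.

BLOCK_SIZE = 4
cached = {}  # block_hash -> block_id, a stand-in for cached_block_hash_to_block


def chain_hashes(token_ids):
    hashes, parent = [], None
    num_full = len(token_ids) - len(token_ids) % BLOCK_SIZE
    for start in range(0, num_full, BLOCK_SIZE):
        block = tuple(token_ids[start:start + BLOCK_SIZE])
        parent = hash((parent, *block))
        hashes.append(parent)
    return hashes


# A finished request leaves its two full blocks in the "cache".
for block_id, h in enumerate(chain_hashes([1, 2, 3, 4, 5, 6, 7, 8])):
    cached[h] = block_id

# A new request sharing only the first block gets exactly one cache hit.
computed = []
for h in chain_hashes([1, 2, 3, 4, 9, 10, 11, 12]):
    if h in cached:
        computed.append(cached[h])
    else:
        break  # later blocks cannot be cached either, so stop here
print(computed)  # [0]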
+ new_token_ids = None + parent_block = None + if self.enable_caching: + # Figure out the token IDs to add to the blocks. + new_token_ids = request.all_token_ids[ + request.num_computed_tokens:request.num_computed_tokens + + num_tokens] + + # Find the last full block index. + # TODO: This may be optimized by calculating the computed tokens. + last_full_block_idx = len(req_blocks) - 1 + while (last_full_block_idx >= 0 + and req_blocks[last_full_block_idx].block_hash is None): + last_full_block_idx -= 1 + + parent_block = (req_blocks[last_full_block_idx] + if last_full_block_idx >= 0 else None) + token_id_idx = self._add_token_ids_to_blocks( + blocks=req_blocks[last_full_block_idx + 1:], + token_ids=new_token_ids, + parent_block=parent_block) + + new_token_ids = new_token_ids[token_id_idx:] + parent_block = req_blocks[-1] + + # No new block is needed. When caching is enabled, we make sure + # token_id_idx is equal to len(new_token_ids), meaning that all tokens + # are added to allocated blocks. + if num_required_blocks <= len(req_blocks): + assert not self.enable_caching or token_id_idx == num_tokens, \ + f"{token_id_idx=} != {num_tokens=}" + return [] + + # Allocate new blocks considering preallocated blocks, and + # add token IDs to them if caching is enabled. num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks, - num_free_blocks) - new_block_ids = self._get_new_blocks(num_new_blocks) - req_block_ids.extend(new_block_ids) - self.ref_cnts[new_block_ids] += 1 - return new_block_ids + self.free_block_queue.num_free_blocks) + new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, + parent_block) + req_blocks.extend(new_blocks) + return new_blocks def allocate_slots( self, request: Request, num_tokens: int, - computed_block_ids: List[int], - ) -> Optional[List[int]]: + computed_blocks: List[KVCacheBlock], + ) -> Optional[List[KVCacheBlock]]: + """Allocate slots for a new request. + + Args: + request: The request to allocate slots. + num_tokens: The number of tokens to allocate. Note that this does + not include the tokens that have already been computed. + computed_blocks: The blocks that have already been computed. + + Returns: + A list of new allocated blocks. + """ + if num_tokens == 0: + raise ValueError( + f"num_tokens must be greater than 0, got {num_tokens}") + + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = len( + [blk for blk in computed_blocks if blk.ref_cnt == 0]) + num_required_blocks = cdiv(num_tokens, self.block_size) - num_free_blocks = len(self.free_block_ids) - if num_required_blocks > num_free_blocks: + if (num_required_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): # Cannot allocate new blocks. return None - num_new_blocks = min(num_required_blocks + self.num_preallocate_blocks, - num_free_blocks) - new_block_ids = self._get_new_blocks(num_new_blocks) - block_ids = computed_block_ids + new_block_ids - self.req_to_block_ids[request.request_id] = block_ids - self.ref_cnts[block_ids] += 1 - return new_block_ids + # Determine the number of new blocks to allocate considering + # preallocated blocks. 
+ num_new_blocks = min( + num_required_blocks + self.num_preallocate_blocks, + self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks) + + num_computed_tokens = len(computed_blocks) * self.block_size + + # When caching is enabled, get the new token IDs and the parent block + # ID to generate cache keys. + new_token_ids = None + parent_block = None + if self.enable_caching: + # Touch the computed blocks to make sure they won't be evicted. + self._touch(computed_blocks) + + # Get the token IDs for the blocks being allocated for hashing. + new_token_ids = request.all_token_ids[ + num_computed_tokens:num_computed_tokens + num_tokens] + if not new_token_ids: + raise RuntimeError( + "Failed to infer the token IDs for allocation. " + f"#all_tokens={len(request.all_token_ids)} < " + f"#computed_tokens={num_computed_tokens}") + + # Get the parent block ID to construct the block chain. + parent_block = computed_blocks[-1] if computed_blocks else None + + new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, + parent_block) + + # Concatenate the computed block IDs and the new block IDs. + self.req_to_blocks[request.request_id] = computed_blocks + new_blocks + return new_blocks def free(self, request: Request) -> None: - block_ids = self.req_to_block_ids.pop(request.request_id) - self.ref_cnts[block_ids] -= 1 - for block_id in block_ids: - ref_cnt = self.ref_cnts[block_id] - if ref_cnt == 0: - self.free_block_ids.append(block_id) - - def _get_new_blocks(self, num_blocks: int) -> List[int]: - assert num_blocks <= len(self.free_block_ids) - new_block_ids = self.free_block_ids[-num_blocks:] - self.free_block_ids = self.free_block_ids[:-num_blocks] - return new_block_ids + """Free the blocks allocated for the request. + When caching is enabled, we free the blocks in reverse order so that + the tail blocks are evicted first. + + Args: + request: The request to free the blocks. + """ + # Default to [] in case a request is freed (aborted) before alloc. + blocks = self.req_to_blocks.pop(request.request_id, []) + if self.enable_caching: + # Free blocks in reverse order so that the tail blocks are + # freed first. + blocks = reversed(blocks) + + for block in blocks: + block.ref_cnt -= 1 + if block.ref_cnt == 0: + self.free_block_queue.append(block) + + def _get_new_blocks( + self, + num_blocks: int, + token_ids: Optional[List[int]] = None, + parent_block: Optional[int] = None) -> List[KVCacheBlock]: + """Get new blocks from the free block pool, and add token IDs to + allocated blocks if caching is enabled. + Note that we do not check block cache in this function. + + Args: + num_blocks: The number of blocks to allocate. + token_ids: The token IDs in the blocks. None if caching is disabled. + parent_block: The parent block. Used to include block chain + in the block hash. + + Returns: + A list of new block. + """ + if num_blocks > self.free_block_queue.num_free_blocks: + raise ValueError( + f"Cannot get {num_blocks} free blocks from the pool") + + # First allocate blocks. + ret: List[KVCacheBlock] = [] + idx = 0 + while idx < num_blocks: + curr_block = self.free_block_queue.popleft() + assert curr_block.ref_cnt == 0 + + # Evict blocks from the cache. 
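For illustration only, not part of the patch: a worked numeric example of the sizing logic in allocate_slots above. All numbers are made up.

def cdiv(a, b):  # same ceiling division as vllm.utils.cdiv
    return -(-a // b)

block_size = 16
num_tokens = 100                     # tokens still to be computed for the request
computed_blocks_ref_cnt = [0, 2, 2]  # 3 prefix-cache hits; one sits only in the free queue
num_free_blocks = 10
num_preallocate_blocks = 4

num_evictable_computed = sum(1 for r in computed_blocks_ref_cnt if r == 0)   # 1
num_required = cdiv(num_tokens, block_size)                                  # 7
assert num_required <= num_free_blocks - num_evictable_computed              # 7 <= 9

# Allocate the required blocks plus preallocated ones, capped by what is truly free.
num_new_blocks = min(num_required + num_preallocate_blocks,
                     num_free_blocks - num_evictable_computed)               # min(11, 9) = 9
print(num_new_blocks)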
+ if self.enable_caching: + block_hash = curr_block.block_hash + if (block_hash is not None + and block_hash in self.cached_block_hash_to_block): + if len(self.cached_block_hash_to_block[block_hash]) == 1: + del self.cached_block_hash_to_block[block_hash] + else: + del self.cached_block_hash_to_block[block_hash][ + curr_block.block_id] + curr_block.reset() + + curr_block.ref_cnt = 1 + ret.append(curr_block) + idx += 1 + + # Then assign token IDs to the allocated blocks. + if self.enable_caching: + assert token_ids is not None + token_id_idx = self._add_token_ids_to_blocks( + blocks=ret, token_ids=token_ids, parent_block=parent_block) + assert token_id_idx == len(token_ids) + + return ret + + def _cache_full_block(self, + block: KVCacheBlock, + parent_block: Optional[KVCacheBlock] = None) -> None: + """Cache a full block for prefix caching. + + Args: + block: The block to cache. + parent_block: The parent block. None if this is the first block. + """ + parent_block_hash = (parent_block.block_hash + if parent_block is not None else None) + assert len(block.token_ids) == self.block_size + block.token_ids = tuple(block.token_ids) + block_hash = hash_block_tokens(parent_block_hash, block.token_ids) + block.block_hash = block_hash + block.num_hashed_tokens = self.block_size + ( + parent_block.num_hashed_tokens if parent_block is not None else 0) + self.cached_block_hash_to_block[block_hash][block.block_id] = block + + def _get_cached_block(self, + block_hash: BlockHashType) -> Optional[KVCacheBlock]: + """Get a cached block by the block hash, or None if cache miss. + If there are duplicated blocks, we return the first block in the cache. + + Args: + block_hash: The hash value of the block. + + Returns: + The cached block if it exists, or None. + """ + if block_hash in self.cached_block_hash_to_block: + first_block_id = list( + self.cached_block_hash_to_block[block_hash].keys())[0] + return self.cached_block_hash_to_block[block_hash][first_block_id] + return None + + def _touch(self, blocks: List[KVCacheBlock]) -> None: + """Touch a block increases its reference count by 1, and may remove + the block from the free queue. This is used when a block is hit by + another request with the same prefix. + + Args: + blocks: A list of blocks to touch. + """ + for block in blocks: + # ref_cnt=0 means this block is in the free list (i.e. eviction + # candidate), so remove it. + if block.ref_cnt == 0: + self.free_block_queue.remove(block) + block.ref_cnt += 1 + + def _add_token_ids_to_blocks( + self, + blocks: List[KVCacheBlock], + token_ids: List[int], + parent_block: Optional[KVCacheBlock] = None) -> int: + """Add token IDs to a list of allocated blocks. + If a block becomes full after adding token IDs, cache it. + Return the token ID index that has not been added to the blocks + if the blocks are not enough to hold all the token IDs. + + Args: + blocks: A list of blocks to add token IDs. + token_ids: A list of token IDs to add. + parent_block: The parent block. None if this is the + first block. + + Returns: + The starting token ID index that has not been added to the blocks + due to insufficient given blocks. + """ + token_id_start = 0 + for curr_block in blocks: + # If all token IDs are added, then the rest of the blocks are + # preallocated blocks, so we only need to update the + # parent_block_id. FIXME + if token_id_start == len(token_ids): + continue + + # Add token IDs to the empty slots in the block. 
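For illustration only, not part of the patch: the typical lifecycle of a request through the KVCacheManager above, using a minimal stand-in for vllm.v1.request.Request. The constructor kwargs mirror the Scheduler's call further below; the preallocation default is whatever the class ships with.

from vllm.v1.core.kv_cache_manager import KVCacheManager


class _Req:
    # Only the attributes KVCacheManager touches.
    def __init__(self, request_id, token_ids):
        self.request_id = request_id
        self.all_token_ids = list(token_ids)
        self.num_computed_tokens = 0


manager = KVCacheManager(block_size=16, num_gpu_blocks=64,
                         sliding_window=None, enable_caching=True)
req = _Req("req-0", list(range(40)))          # 40-token prompt: 2 full blocks + 1 partial

computed = manager.get_computed_blocks(req)   # [] on a cold cache
new_blocks = manager.allocate_slots(req, num_tokens=40, computed_blocks=computed)
req.num_computed_tokens = 40                  # prompt has been prefilled

req.all_token_ids.append(123)                 # first sampled token
manager.append_slots(req, num_tokens=1)       # returns [] when preallocated slots suffice

manager.free(req)                             # blocks rejoin the free queue, tail first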
+ empty_slots = self.block_size - len(curr_block.token_ids) + token_id_end = min(token_id_start + empty_slots, len(token_ids)) + curr_block.token_ids.extend(token_ids[token_id_start:token_id_end]) + # Cache the block if it becomes full. + if len(curr_block.token_ids) == self.block_size: + self._cache_full_block(curr_block, parent_block) + parent_block = curr_block + token_id_start = token_id_end + return token_id_start diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py new file mode 100644 index 0000000000000..33dbfb7377bfd --- /dev/null +++ b/vllm/v1/core/kv_cache_utils.py @@ -0,0 +1,194 @@ +"""KV-Cache Utilities.""" +from dataclasses import dataclass, field +from typing import List, Optional, Tuple, Union + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +BlockHashType = Tuple[int, Tuple[int]] + + +@dataclass +class KVCacheBlock: + """KV-cache block metadata.""" + # Block ID, ranging from 0 to num_gpu_blocks - 1. + block_id: int + # Reference count. + ref_cnt: int = 0 + # Token IDs in the block. When the block is full, the type of token_ids + # should be Tuple[int] for fast matching. + token_ids: Union[List[int], Tuple[int]] = field(default_factory=list) + # The hash of the block composed of (block hash, tuple of token IDs). + # It is only available when the block is full. + block_hash: Optional[BlockHashType] = None + # The number of hashed tokens. More hashed tokens means the block + # is closer to the end of a prompt and more likely to be evicted. + num_hashed_tokens: int = 0 + + # Used to construct a doubly linked list for free blocks. + # These two attributes should only be manipulated by FreeKVCacheBlockQueue. + prev_free_block: Optional["KVCacheBlock"] = None + next_free_block: Optional["KVCacheBlock"] = None + + def reset(self): + """Reset the block metadata.""" + self.ref_cnt = 0 + self.token_ids = [] + self.block_hash = None + self.num_hashed_tokens = 0 + + +class FreeKVCacheBlockQueue: + """This class organizes a list of KVCacheBlock objects to a doubly linked + list of free blocks. We implement this class instead of using Python + builtin deque to support removing a block in the middle of the queue + in O(1) time. To close the performance gap to the builtin deque which is + implemented in C++, this class does not allocate any Python objects when + manipulating the linked list. Instead, this class manipulates the + prev_free_block and next_free_block attributes of the given blocks. + + The queue is ordered by block ID in the beginning. When a block is allocated + and then freed, it will be appended back with the eviction order: + 1. The least recent used block is at the front (LRU). + 2. If two blocks have the same last accessed time (allocated by the + same sequence), the one with more hash tokens (the tail of a block + chain) is at the front. + Note that we maintain this order by reversing the block order when free + blocks of a request. This operation is outside of this class. + + Args: + blocks: A list of KVCacheBlock objects. + """ + + def __init__(self, blocks: List[KVCacheBlock]) -> None: + self.num_free_blocks = len(blocks) + + # Initialize the doubly linked list of free blocks. + self.free_list_head = blocks[0] + self.free_list_tail = blocks[-1] + for i in range(self.num_free_blocks): + if i > 0: + blocks[i].prev_free_block = blocks[i - 1] + if i < self.num_free_blocks - 1: + blocks[i].next_free_block = blocks[i + 1] + + def popleft(self) -> KVCacheBlock: + """Pop the first free block and reduce num_free_blocks by 1. 
+ + Returns: + The first free block. + """ + if not self.free_list_head: + raise ValueError("No free blocks available") + + block = self.free_list_head + self.remove(block) + return block + + def remove(self, block: KVCacheBlock) -> None: + """Remove a block in the free list and reduce num_free_blocks by 1. + + Args: + block: The block to remove. + """ + if block.prev_free_block is not None: + # Link the previous block to the next block. + block.prev_free_block.next_free_block = block.next_free_block + if block.next_free_block is not None: + # Link the next block to the previous block. + block.next_free_block.prev_free_block = block.prev_free_block + + if block == self.free_list_head: + # Update the head if the block is the head. + self.free_list_head = block.next_free_block + if block == self.free_list_tail: + # Update the tail if the block is the tail. + self.free_list_tail = block.prev_free_block + + # Remove the block from the linked list. + block.prev_free_block = block.next_free_block = None + self.num_free_blocks -= 1 + + def append(self, block: KVCacheBlock) -> None: + """Put a block back into the free list and increase + num_free_blocks by 1. + + Args: + block: The block to append. + """ + if self.free_list_tail is not None: + # Link the last block to the new block. + self.free_list_tail.next_free_block = block + block.prev_free_block = self.free_list_tail + self.free_list_tail = block + else: + # The free list is empty. + assert self.free_list_head is None + self.free_list_head = self.free_list_tail = block + + block.next_free_block = None + self.num_free_blocks += 1 + + def get_all_free_blocks(self) -> List[KVCacheBlock]: + """Get all free blocks in the free list. Mainly used for testing. + + Returns: + A list of free blocks. + """ + ret = [] + curr_block = self.free_list_head + while curr_block is not None: + ret.append(curr_block) + curr_block = curr_block.next_free_block + return ret + + +def hash_block_tokens(parent_block_hash: Optional[int], + curr_block_token_ids: Tuple[int]) -> BlockHashType: + """Computes a hash value corresponding to the contents of a block and + the contents of the preceding block(s). The hash value is used for + prefix caching. We use LRU cache for this function to avoid recomputing + hash values for the same block contents. + + TODO: Support arbitrary metadata so that we could support more + features such as LoRA adapter. + + Args: + parent_block_hash: The hash of the parent block. None + if this is the first block. + curr_block_token_ids: A tuple of token ids in the current + block. The current block is assumed to be full. + + Returns: + The hash value of the block and the token ids in the block. + The entire tuple is used as the hash key of the block. + """ + return (hash( + (parent_block_hash, *curr_block_token_ids)), curr_block_token_ids) + + +def hash_request_tokens(block_size: int, + token_ids: List[int]) -> List[BlockHashType]: + """Computes hash values of a chain of blocks given a sequence of + token IDs. The hash value is used for prefix caching. + + Args: + block_size: The size of each block. + token_ids: A sequence of token ids in the request. + + Returns: + The list of computed hash values. + """ + ret = [] + parent_block_hash = None + for start in range(0, len(token_ids), block_size): + end = start + block_size + block_token_ids = tuple(token_ids[start:end]) + # Do not hash the block if it is not full. 
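For illustration only, not part of the patch: basic operations on the FreeKVCacheBlockQueue above. The block count and indices are arbitrary.

from vllm.v1.core.kv_cache_utils import FreeKVCacheBlockQueue, KVCacheBlock

blocks = [KVCacheBlock(idx) for idx in range(4)]
queue = FreeKVCacheBlockQueue(blocks)

first = queue.popleft()                   # block 0, following the initial block-ID order
queue.remove(blocks[2])                   # O(1) removal from the middle (e.g. a prefix-cache hit)
queue.append(first)                       # freed again -> goes to the tail

ids = [b.block_id for b in queue.get_all_free_blocks()]
print(ids)                                # [1, 3, 0]
assert queue.num_free_blocks == 3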
+ if len(block_token_ids) < block_size: + break + block_hash = hash_block_tokens(parent_block_hash, block_token_ids) + ret.append(block_hash) + parent_block_hash = block_hash + return ret diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 41659ff62747d..ba50a9786d805 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,15 +1,21 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, + Tuple, Union) from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import SamplingParams +from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus +if TYPE_CHECKING: + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.base import PlaceholderRange + logger = init_logger(__name__) @@ -34,7 +40,7 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=num_gpu_blocks, sliding_window=self.cache_config.sliding_window, - enable_caching=True) + enable_caching=self.cache_config.enable_prefix_caching) self.block_size = self.cache_config.block_size # Scheduling constraints. @@ -60,12 +66,20 @@ def __init__( # Request id -> RunningRequestData self.running_reqs_data: Dict[str, RunningRequestData] = {} - def schedule(self) -> "SchedulerOutput": - scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + # Encoder-related. + # NOTE(woosuk): Here, "encoder" includes the vision encoder (and + # projector if needed). Currently, we assume that the encoder also + # has the Transformer architecture (e.g., ViT). + # FIXME(woosuk): Below are placeholder values. We need to calculate the + # actual values from the configurations. + self.max_num_encoder_input_tokens = 2048 + # NOTE(woosuk): For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized and used, regardless of + # the cache size. This is because the memory space for the encoder cache + # is preallocated in the profiling run. + self.encoder_cache_manager = EncoderCacheManager(cache_size=2048) + def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. # Each request just has the num_computed_tokens and num_tokens, @@ -73,27 +87,49 @@ def schedule(self) -> "SchedulerOutput": # At each step, the scheduler tries to assign tokens to the requests # so that each request's num_computed_tokens can catch up its # num_tokens. This is general enough to cover chunked prefills, - # prefix caching, and the "jump forward" optimization in the future. + # prefix caching, and the "jump decoding" optimization in the future. + + scheduled_new_reqs: List[Request] = [] + scheduled_resumed_reqs: List[Request] = [] + scheduled_running_reqs: List[Request] = [] + preempted_reqs: List[Request] = [] req_to_new_block_ids: Dict[str, List[int]] = {} num_scheduled_tokens: Dict[str, int] = {} token_budget = self.max_num_scheduled_tokens + # Encoder-related. 
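For illustration only, not part of the patch: the chaining behavior of hash_block_tokens and hash_request_tokens above. Token IDs and block size are arbitrary.

from vllm.v1.core.kv_cache_utils import hash_block_tokens, hash_request_tokens

block_size = 4
token_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9]   # 2 full blocks + 1 partial (the partial is not hashed)

hashes = hash_request_tokens(block_size, token_ids)
assert len(hashes) == 2

# The second block's key is derived from the first block's key (prefix chaining),
# and each entry also carries the block's token IDs as part of the hash key.
h0 = hash_block_tokens(None, (1, 2, 3, 4))
h1 = hash_block_tokens(h0, (5, 6, 7, 8))
assert hashes == [h0, h1]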
+ scheduled_encoder_inputs: Dict[str, List[int]] = {} + encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. + # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be + # in the "partial" state, where the request has some tokens computed + # but not all. The constraint is due to the persistent batch in the + # V1 model runner. + # TODO(woosuk): Remove this constraint after refactoring model runner. + has_partial_request = False req_index = 0 while req_index < len(self.running): - if token_budget == 0: - break - + # Only the last request in the RUNNING queue can be "partial". + assert not has_partial_request + assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + # Schedule encoder inputs. + encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = ( + self._try_schedule_encoder_inputs(request, + request.num_computed_tokens, + num_new_tokens, + encoder_budget)) + assert num_new_tokens > 0 + while True: - new_block_ids = self.kv_cache_manager.append_slots( + new_blocks = self.kv_cache_manager.append_slots( request, num_new_tokens) - if new_block_ids is None: + if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. preempted_req = self.running.pop() @@ -105,20 +141,40 @@ def schedule(self) -> "SchedulerOutput": preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. + can_schedule = False break else: # The request can be scheduled. - scheduled_running_reqs.append(request) - - req_to_new_block_ids[request.request_id] = new_block_ids - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 + can_schedule = True break + if not can_schedule: + break + + # Schedule the request. + scheduled_running_reqs.append(request) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: + if has_partial_request: + break if len(self.running) == self.max_num_running_reqs: break if token_budget == 0: @@ -126,25 +182,41 @@ def schedule(self) -> "SchedulerOutput": request = self.waiting[0] # Get already-cached tokens. - computed_block_ids = self.kv_cache_manager.get_computed_blocks( + computed_blocks = self.kv_cache_manager.get_computed_blocks( request) # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of # `block_size`. - num_computed_tokens = len(computed_block_ids) * self.block_size + num_computed_tokens = len(computed_blocks) * self.block_size # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, # which have output tokens. 
num_new_tokens = request.num_tokens - num_computed_tokens + if num_new_tokens == 0: + # The happens when prompt length is divisible by the block + # size and all blocks are cached. Now we force to recompute + # the last token. + num_computed_tokens -= 1 + num_new_tokens = 1 + computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 - new_block_ids = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, computed_block_ids) - if new_block_ids is None: + + # Schedule encoder inputs. + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled. + break + + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens, computed_blocks) + if new_blocks is None: # The request cannot be scheduled. break - request.num_computed_tokens = num_computed_tokens self.waiting.popleft() self.running.append(request) @@ -156,11 +228,24 @@ def schedule(self) -> "SchedulerOutput": raise RuntimeError( f"Invalid request status: {request.status}") - req_to_new_block_ids[request.request_id] = ( - computed_block_ids + new_block_ids) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in computed_blocks + new_blocks + ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + has_partial_request = (num_computed_tokens + num_new_tokens < + request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) @@ -194,12 +279,14 @@ def schedule(self) -> "SchedulerOutput": scheduled_running_reqs=running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_encoder_inputs=scheduled_encoder_inputs, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between # the previous and the current steps. finished_req_ids=self.finished_req_ids, + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), ) self.finished_req_ids = set() @@ -223,17 +310,82 @@ def _make_running_request_data( self.running_reqs_data[request.request_id] = req_data return req_data + def _try_schedule_encoder_inputs( + self, + request: Request, + num_computed_tokens: int, + num_new_tokens: int, + encoder_budget: int, + ) -> Tuple[List[int], int, int]: + """ + Determine which encoder inputs need to be scheduled in the current step, + and update `num_new_tokens` and encoder token budget accordingly. + + An encoder input will be scheduled if: + - Its output tokens overlap with the range of tokens being computed + in this step, i.e., + [num_computed_tokens, num_computed_tokens + num_new_tokens). + - It is not already computed and stored in the encoder cache. + - There is sufficient encoder token budget to process it. + - The encoder cache has space to store it. 
+ + If an encoder input cannot be scheduled due to cache or budget + limitations, the method adjusts `num_new_tokens` to schedule only the + decoder tokens up to just before the unschedulable encoder input. + """ + if not request.has_encoder_inputs(): + return [], num_new_tokens, encoder_budget + + encoder_inputs_to_schedule: List[int] = [] + mm_positions = request.mm_positions + assert mm_positions is not None + assert len(mm_positions) > 0 + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, num_computed_tokens + num_new_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_new_tokens: + # The encoder input is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder input is already computed and stored + # in the decoder's KV cache. + continue + + if self.encoder_cache_manager.has_cache(request, i): + # The encoder input is already computed and cached. + continue + if not self.encoder_cache_manager.can_allocate(request, i): + # The encoder cache is full. We can only schedule the decoder + # tokens just before the encoder input. + num_new_tokens = start_pos - num_computed_tokens + break + if num_encoder_tokens > encoder_budget: + # The encoder budget is exhausted. We can only schedule the + # decoder tokens up until the encoder input. + # NOTE(woosuk): We assume that the encoder tokens should be + # processed altogether, as the encoder usually uses + # bidirectional attention. + num_new_tokens = start_pos - num_computed_tokens + break + + encoder_budget -= num_encoder_tokens + encoder_inputs_to_schedule.append(i) + return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + def update_from_output( self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> List[Tuple[Request, int]]: + ) -> List[EngineCoreOutput]: # NOTE(woosuk): This method doesn't consider speculative decoding. sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] - # (request, num_sampled_tokens) - sampled: List[Tuple[Request, int]] = [] + engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -241,23 +393,46 @@ def update_from_output( # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + self.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + self.encoder_cache_manager.free(request, input_id) + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. 
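For illustration only, not part of the patch: the overlap test used by _try_schedule_encoder_inputs above, reduced to plain Python. Positions, lengths, and the budget are made up, and the encoder cache checks are ignored here.

def pick_encoder_inputs(mm_positions, num_computed_tokens, num_new_tokens,
                        encoder_budget):
    scheduled = []
    for i, pos in enumerate(mm_positions):
        start, length = pos["offset"], pos["length"]
        if start >= num_computed_tokens + num_new_tokens:
            break                       # not reached in this step
        if start + length <= num_computed_tokens:
            continue                    # output already folded into the KV cache
        if length > encoder_budget:
            # Not enough encoder budget: only run the decoder up to this input.
            num_new_tokens = start - num_computed_tokens
            break
        encoder_budget -= length
        scheduled.append(i)
    return scheduled, num_new_tokens, encoder_budget


# Two image placeholders at offsets 10 and 300, each 200 encoder tokens long.
mm_positions = [{"offset": 10, "length": 200}, {"offset": 300, "length": 200}]
print(pick_encoder_inputs(mm_positions, num_computed_tokens=0,
                          num_new_tokens=512, encoder_budget=256))
# -> ([0], 300, 56): the second image exceeds the remaining budget, so this step
#    schedules encoder input 0 and only the decoder tokens in [0, 300).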
token_id = sampled_token_ids[req_index] - request.output_token_ids.append(token_id) - sampled.append((request, 1)) + request.append_output_token_ids(token_id) + num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. - # Check if the request is finished. + # Check for stop and update request state. + # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) + + # Add EngineCoreOutput for this Request. + output = EngineCoreOutput( + request_id=req_id, + new_token_ids=request.output_token_ids[-num_new_tokens:], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason) + engine_core_outputs.append(output) + + # Breakout of the loop. if stopped: continue new_running.append(request) self.running = new_running - return sampled + return engine_core_outputs def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len @@ -333,7 +508,8 @@ class NewRequestData: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional[MultiModalDataDict] + mm_inputs: List["MultiModalKwargs"] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams block_ids: List[int] num_computed_tokens: int @@ -347,9 +523,10 @@ def from_request( ) -> "NewRequestData": return cls( req_id=request.request_id, - prompt_token_ids=request.inputs["prompt_token_ids"], - prompt=request.inputs.get("prompt"), - multi_modal_data=request.inputs.get("multi_modal_data"), + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_positions=request.mm_positions, sampling_params=request.sampling_params, block_ids=block_ids, num_computed_tokens=num_computed_tokens, @@ -407,6 +584,8 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int + scheduled_encoder_inputs: Dict[str, List[int]] preempted_req_ids: Set[str] finished_req_ids: Set[str] + free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index e69de29bb2d1d..edfb8bd7c2fc1 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -0,0 +1,77 @@ +import enum +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import msgspec + +from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict +from vllm.sampling_params import RequestOutputKind, SamplingParams + + +@dataclass +class DetokenizerRequest: + + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + stop: List[str] + include_stop_str_in_output: bool + + +@dataclass +class EngineCoreRequest: + + # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, + # but this object is currently not playing well with msgspec + # due to circular imports and typing we have in data.py + + request_id: str + #NOTE(Nick): I don't think we need to pass prompt here since it should + # always be tokenized? 
+ prompt: Optional[str] + prompt_token_ids: List[int] + mm_data: Optional[MultiModalDataDict] + mm_placeholders: Optional[MultiModalPlaceholderDict] + mm_processor_kwargs: Optional[Dict[str, Any]] + sampling_params: SamplingParams + eos_token_id: Optional[int] + arrival_time: float + lora_request: Optional[LoRARequest] + + +class EngineCoreOutput(msgspec.Struct, + array_like=True, + omit_defaults=True, + gc=False): + + request_id: str + new_token_ids: List[int] + finished: bool + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + + +class EngineCoreOutputs(msgspec.Struct, + array_like=True, + omit_defaults=True, + gc=False): + + #NOTE(Nick): We could consider ways to make this more compact, + # e.g. columnwise layout and using an int enum for finish/stop reason + + # [num_reqs] + outputs: List[EngineCoreOutput] + + +class EngineCoreRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. + """ + ADD = b'\x00' + ABORT = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py new file mode 100644 index 0000000000000..09bff9655a882 --- /dev/null +++ b/vllm/v1/engine/async_llm.py @@ -0,0 +1,372 @@ +import asyncio +from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union + +from vllm.config import ModelConfig, VllmConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.metrics_types import StatLoggerBase +from vllm.engine.protocol import EngineClient +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs.preprocess import InputPreprocessor +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine.async_stream import AsyncStream +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.processor import Processor +from vllm.v1.executor.gpu_executor import GPUExecutor + +logger = init_logger(__name__) + + +class AsyncLLM(EngineClient): + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + log_stats: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + log_requests: bool = True, + start_engine_loop: bool = True, + ) -> None: + assert start_engine_loop + + self.log_requests = log_requests + self.log_stats = log_stats + self.stat_loggers = stat_loggers + self.model_config = vllm_config.model_config + + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + enable_lora=bool(vllm_config.lora_config)) + self.tokenizer.ping() + + # Request streams (map of request_id -> AsyncStream). + self.request_streams: Dict[str, AsyncStream] = {} + # List of cancelled request ids to be aborted. 
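For illustration only, not part of the patch: round-tripping the msgspec structs above, roughly what the engine-core process and its client exchange over ZMQ. The request id and token IDs are made up.

import msgspec

from vllm.v1.engine import EngineCoreOutput, EngineCoreOutputs

outputs = EngineCoreOutputs(outputs=[
    EngineCoreOutput(request_id="req-0", new_token_ids=[42], finished=False),
])

payload = msgspec.msgpack.encode(outputs)            # compact array-like encoding
decoded = msgspec.msgpack.decode(payload, type=EngineCoreOutputs)
assert decoded.outputs[0].new_token_ids == [42]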
+ self.client_aborted_requests: List[str] = [] + + # Processor (converts Inputs --> EngineCoreRequests). + self.processor = Processor(vllm_config.model_config, + vllm_config.lora_config, self.tokenizer, + input_registry) + + # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + + # EngineCore (starts the engine in background process). + self.engine_core = EngineCoreClient.make_client( + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + multiprocess_mode=True, + asyncio_mode=True, + ) + + self.output_handler = None + + def __del__(self): + self.shutdown() + + @classmethod + def from_engine_args( + cls, + engine_args: AsyncEngineArgs, + engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + ) -> "AsyncLLMEngine": + """Create an AsyncLLM from the EngineArgs.""" + + # Create the engine configs. + if engine_config is None: + vllm_config = engine_args.create_engine_config() + else: + vllm_config = engine_config + + executor_class = cls._get_executor_cls(vllm_config) + + # Create the AsyncLLM. + return cls( + vllm_config=vllm_config, + executor_class=executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + start_engine_loop=start_engine_loop, + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + + def shutdown(self): + """Shutdown, cleaning up the background proc and IPC.""" + + self.engine_core.shutdown() + + if handler := getattr(self, "output_handler", None): + handler.cancel() + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor + + async def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + """Add new request to the AsyncLLM.""" + + if self.detokenizer.is_request_active(request_id): + raise KeyError(f"Request {request_id} already exists.") + + # 1) Create a new AsyncStream for the request. + stream = self._add_request_to_streams(request_id) + + # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) + + # 3) Add the request to Detokenizer (this process). + self.detokenizer.add_request(detokenizer_req) + + # 4) Add the EngineCoreRequest to EngineCore (separate process). + await self.engine_core.add_request_async(engine_core_req) + + # 5) Return the generator. + return stream.generator() + + # TODO: we should support multiple prompts in one call, as you + # can do with LLM.generate. So that for multi-prompt completion + # requests we don't need to send multiple messages to core proc, + # and so we don't need multiple streams which then get + # re-multiplexed in the API server anyhow. 
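For illustration only, not part of the patch: driving the AsyncLLM client above from an asyncio program, via the generate() coroutine defined just below. The model name and sampling settings are placeholders, and a working GPU setup is assumed.

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncLLM


async def main():
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(max_tokens=16)
    async for output in engine.generate("Hello, my name is",
                                        params,
                                        request_id="req-0"):
        print(output.outputs[0].text)
    engine.shutdown()


asyncio.run(main())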
+ async def generate( + self, + prompt: PromptType, + sampling_params: SamplingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[RequestOutput, None]: + """ + Main function called by the API server to kick off a request + * 1) Making an AsyncStream corresponding to the Request. + # 2) Processing the Input. + * 3) Adding the Request to the Detokenizer. + * 4) Adding the Request to the EngineCore (separate process). + + A separate output_handler loop runs in a background AsyncIO task, + pulling outputs from EngineCore and putting them into the + per-request AsyncStream. + + The caller of generate() iterates the returned AsyncGenerator, + returning the RequestOutput back to the caller. + """ + + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + + async for output in await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ): + yield output + + def _finish_stream(self, request_id: str): + stream = self.request_streams.pop(request_id, None) + if stream is not None: + stream.finish() + + def _add_request_to_streams( + self, + request_id: str, + ) -> AsyncStream: + + if request_id in self.request_streams: + raise ValueError(f"Request id {request_id} already running.") + + # Avoid streams having circular ref to parent AsyncLLM object. + aborted_reqs = self.client_aborted_requests + stream = AsyncStream(request_id, aborted_reqs.append) + self.request_streams[request_id] = stream + + if self.log_requests: + logger.info("Added request %s.", request_id) + + return stream + + async def _process_cancellations(self) -> None: + """ + Process requests cancelled from user disconnecting. + + When a client disconnects, AsyncStream._cancel() is called. + We passed a callback to AsyncStream(), which appends to + self.client_aborted_requests. + + As a result, if any requests are canceled from the user side + the request_id will show up in self.client_aborted_requests. + """ + + # Avoid streams having circular ref to parent AsyncLLM object. + if not self.client_aborted_requests: + return + reqs_to_abort = self.client_aborted_requests.copy() + self.client_aborted_requests.clear() + + # Remove from Detokenizer. + self.detokenizer.abort_requests(reqs_to_abort) + + # Remove from RequestStreams. + for request_id in reqs_to_abort: + if self.log_requests: + logger.info("User-cancelled request %s.", request_id) + self._finish_stream(request_id) + + # Remove from EngineCore. + await self.engine_core.abort_requests_async(reqs_to_abort) + + def _process_request_outputs(self, request_outputs: List[RequestOutput]): + """Process outputs by putting them into per-request AsyncStreams.""" + + for request_output in request_outputs: + request_id = request_output.request_id + assert request_id in self.request_streams + + # Each request in the API server pulls from the per-request stream. + stream = self.request_streams.get(request_id) + if stream is not None: + stream.put(request_output) + + # If finished, remove from the tracker. 
+ if request_output.finished: + if self.log_requests: + logger.info("Finished request %s.", request_id) + self._finish_stream(request_id) + + async def _run_output_handler(self): + """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + + try: + while True: + # 1) Pull EngineCoreOutput from the EngineCore. + outputs = await self.engine_core.get_output_async() + + # 2) Detokenize based on the output. + request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + + # 3) Put the RequestOutputs into the per-request AsyncStreams. + self._process_request_outputs(request_outputs) + + # 4) Abort any requests that finished due to stop strings. + await self.engine_core.abort_requests_async(reqs_to_abort) + + # 5) Abort any requests due to client cancellations. + await self._process_cancellations() + + except BaseException as e: + logger.error(e) + raise e + + # TODO: can we eliminate these? + + async def abort(self, request_id: str) -> None: + # Note: Who Calls this? I dont think this is actually used. + raise ValueError("Not Supported on V1 yet.") + + def encode( + self, + prompt: PromptType, + pooling_params: PoolingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + priority: int = 0, + ): + raise ValueError("Not Supported on V1 yet.") + + async def get_model_config(self) -> ModelConfig: + return self.model_config + + async def get_decoding_config(self): + raise ValueError("Not Supported on V1 yet.") + + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.processor.input_preprocessor + + async def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> AnyTokenizer: + assert lora_request is None + return self.detokenizer.tokenizer + + async def is_tracing_enabled(self) -> bool: + return False + + async def do_log_stats( + self, + scheduler_outputs=None, + model_output=None, + ) -> None: + logger.debug("Called do_log_stats.") + + async def check_health(self) -> None: + logger.debug("Called check_health.") + + async def start_profile(self) -> None: + raise ValueError("Not supported on V1 yet.") + + async def stop_profile(self) -> None: + raise ValueError("Not supported on V1 yet.") + + @property + def is_running(self) -> bool: + return True + + @property + def is_stopped(self) -> bool: + return False + + @property + def errored(self) -> bool: + return False + + @property + def dead_error(self) -> BaseException: + return Exception + + +# Retain V0 name for backwards compatibility. 
+AsyncLLMEngine = AsyncLLM diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py new file mode 100644 index 0000000000000..3e6c759ad5ebd --- /dev/null +++ b/vllm/v1/engine/async_stream.py @@ -0,0 +1,55 @@ +import asyncio +from typing import Any, AsyncGenerator, Callable, Optional, Type, Union + +from vllm.outputs import EmbeddingRequestOutput, RequestOutput + + +class AsyncStream: + """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + that can be iterated over asynchronously via an async generator.""" + + STOP_ITERATION = Exception() # Sentinel + + def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: + self.request_id = request_id + self._cancel = cancel + self._queue: asyncio.Queue = asyncio.Queue() + self._finished = False + + def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + Exception]) -> None: + if not self._finished: + self._queue.put_nowait(item) + + def finish( + self, + exception: Optional[Union[BaseException, Type[BaseException]]] = None, + ) -> None: + if not self._finished: + self._finished = True + self._queue.put_nowait(exception if self._is_raisable(exception) + else AsyncStream.STOP_ITERATION) + + async def generator( + self + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + finished = False + try: + while True: + result = await self._queue.get() + if self._is_raisable(result): + finished = True + if result == AsyncStream.STOP_ITERATION: + return + raise result + yield result + finally: + self._finished = True + if not finished: + self._cancel(self.request_id) + + @staticmethod + def _is_raisable(value: Any): + return isinstance(value, BaseException) or \ + (isinstance(value, type) and \ + issubclass(value, BaseException)) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py new file mode 100644 index 0000000000000..35ed131d50de9 --- /dev/null +++ b/vllm/v1/engine/core.py @@ -0,0 +1,363 @@ +import multiprocessing +import queue +import threading +import time +from contextlib import contextmanager +from multiprocessing.process import BaseProcess +from multiprocessing.sharedctypes import Synchronized +from typing import Any, Iterator, List, Tuple, Type, Union + +import zmq +import zmq.asyncio +from msgspec import msgpack + +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.mm_input_mapper import MMInputMapper +from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.request import Request, RequestStatus +from vllm.v1.serial_utils import PickleEncoder +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + +POLLING_TIMEOUT_MS = 5000 +POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +LOGGING_TIME_S = 5000 + + +class EngineCore: + """Inner loop of vLLM's Engine.""" + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + usage_context: UsageContext, + ): + # Override the configs for V1. 
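For illustration only, not part of the patch: how the AsyncStream class above is consumed. The producer puts RequestOutput-like items and then finishes; the cancel callback records the request id only if the consumer goes away early.

import asyncio

from vllm.v1.engine.async_stream import AsyncStream


async def main():
    cancelled = []
    stream = AsyncStream("req-0", cancelled.append)

    stream.put("first chunk")         # normally a RequestOutput
    stream.put("second chunk")
    stream.finish()                   # enqueues the STOP_ITERATION sentinel

    async for item in stream.generator():
        print(item)
    assert not cancelled              # finished normally, so no cancellation


asyncio.run(main())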
+ # FIXME + if usage_context == UsageContext.LLM_CLASS: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 2048 + + # TODO (ywang96): Enable APC by default when VLM supports it. + if not vllm_config.model_config.is_multimodal_model: + vllm_config.cache_config.enable_prefix_caching = True + + assert vllm_config.model_config.task != "embedding" + + logger.info("Initializing an LLM engine (v%s) with config: %s", + VLLM_VERSION, vllm_config) + + # Setup Model. + self.model_executor = executor_class(vllm_config) + + # Setup KV Caches and update CacheConfig after profiling. + num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( + vllm_config.cache_config) + vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks + vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + + # Set up multimodal input mapper (e.g., convert PIL images to tensors). + self.mm_input_mapper = MMInputMapper(vllm_config.model_config) + + # Setup scheduler. + self.scheduler = Scheduler(vllm_config.scheduler_config, + vllm_config.cache_config, + vllm_config.lora_config) + + self._last_logging_time = time.time() + + def _initialize_kv_caches(self, + cache_config: CacheConfig) -> Tuple[int, int]: + num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + ) + + if cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) + num_gpu_blocks = num_gpu_blocks_override + + num_cpu_blocks = 0 + self.model_executor.initialize_cache(num_gpu_blocks) + return num_gpu_blocks, num_cpu_blocks + + def add_request(self, request: EngineCoreRequest): + """Add request to the scheduler.""" + + req = Request.from_engine_core_request(request) + # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may + # take 10-50 ms, which can cause a spike in the latency. We should + # consider moving this to a separate thread. + if req.mm_data: + req.mm_inputs = self.mm_input_mapper.process_inputs( + req.mm_data, req.mm_processor_kwargs) + self.scheduler.add_request(req) + + def abort_requests(self, request_ids: List[str]): + """Abort requests from the scheduler.""" + + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that + # (i.e. client-aborted vs stop criteria met). 
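As an aside on _initialize_kv_caches above: the override branch is reached when the user pins the KV-cache size through the engine arguments. A hedged example (the model name is a placeholder, and the assertion assumes create_engine_config() wires the value into cache_config as in this patch):

from vllm.engine.arg_utils import EngineArgs

# Pin the number of GPU KV-cache blocks instead of using the profiled value.
engine_args = EngineArgs(model="facebook/opt-125m",
                         num_gpu_blocks_override=2048)
vllm_config = engine_args.create_engine_config()
assert vllm_config.cache_config.num_gpu_blocks_override == 2048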
+        self.scheduler.finish_requests(request_ids,
+                                       RequestStatus.FINISHED_ABORTED)
+
+    def step(self) -> List[EngineCoreOutput]:
+        """Schedule, execute, and make output."""
+
+        if not self.scheduler.has_unfinished_requests():
+            return []
+
+        scheduler_output = self.scheduler.schedule()
+        output = self.model_executor.execute_model(scheduler_output)
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, output)
+        return engine_core_outputs
+
+
+class EngineCoreProc(EngineCore):
+    """ZMQ-wrapper for running EngineCore in background process."""
+
+    READY_STR = "READY"
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: Type[GPUExecutor],
+        usage_context: UsageContext,
+        input_path: str,
+        output_path: str,
+        ready_path: str,
+        should_shutdown: Synchronized,
+    ):
+        super().__init__(vllm_config, executor_class, usage_context)
+
+        # Signal from main process to shutdown (multiprocessing.Value).
+        self.should_shutdown = should_shutdown
+
+        # Background Threads and Queues for IO. These enable us to
+        # overlap ZMQ socket IO with GPU since they release the GIL,
+        # and to overlap some serialization/deserialization with the
+        # model forward pass.
+        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+        self.input_queue = queue.Queue()
+        self.output_queue = queue.Queue()
+        threading.Thread(target=self.process_input_socket,
+                         args=(input_path, ),
+                         daemon=True).start()
+        threading.Thread(target=self.process_output_socket,
+                         args=(output_path, ),
+                         daemon=True).start()
+
+        # Send Readiness signal to EngineClient.
+        with self.make_socket(ready_path, zmq.constants.PUSH) as ready_socket:
+            ready_socket.send_string(EngineCoreProc.READY_STR)
+
+    @contextmanager
+    def make_socket(self, path: str, type: Any) -> Iterator[zmq.Socket]:
+        """Context manager that creates and cleans up a ZMQ socket."""
+
+        ctx = zmq.Context()
+        try:
+            socket = ctx.socket(type)
+
+            if type == zmq.constants.PULL:
+                socket.connect(path)
+            elif type == zmq.constants.PUSH:
+                socket.bind(path)
+            else:
+                raise ValueError(f"Unknown Socket Type: {type}")
+
+            yield socket
+
+        except KeyboardInterrupt:
+            logger.debug("EngineCore had Keyboard Interrupt.")
+
+        finally:
+            ctx.destroy(linger=0)
+
+    @staticmethod
+    def wait_for_startup(
+        proc: BaseProcess,
+        ready_path: str,
+    ) -> None:
+        """Wait until the EngineCore is ready."""
+
+        try:
+            sync_ctx = zmq.Context()  # type: ignore[attr-defined]
+            socket = sync_ctx.socket(zmq.constants.PULL)
+            socket.connect(ready_path)
+
+            # Wait for EngineCore to send EngineCoreProc.READY_STR.
+            while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
+                logger.debug("Waiting for EngineCoreProc to startup.")
+
+                if not proc.is_alive():
+                    raise RuntimeError("EngineCoreProc failed to start.")
+
+            message = socket.recv_string()
+            assert message == EngineCoreProc.READY_STR
+
+        except BaseException as e:
+            logger.exception(e)
+            raise e
+
+        finally:
+            sync_ctx.destroy(linger=0)
+
+    @staticmethod
+    def make_engine_core_process(
+        vllm_config: VllmConfig,
+        executor_class: Type[GPUExecutor],
+        usage_context: UsageContext,
+        input_path: str,
+        output_path: str,
+        ready_path: str,
+        should_shutdown: Synchronized,
+    ) -> BaseProcess:
+        # The current process might have CUDA context,
+        # so we need to spawn a new process.
+        # NOTE(rob): this is a problem for using EngineCoreProc w/
+        # LLM, since we need an if __name__ == "__main__" guard.
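The NOTE above is worth underlining: because the EngineCore child is started with the "spawn" method, user scripts that construct the engine in multiprocess mode need a main guard. A rough example (model, prompt, and sampling values are placeholders):

from vllm import LLM, SamplingParams


def main() -> None:
    # Constructing the engine inside main() (not at module import time)
    # keeps the spawned EngineCore child from re-running this code when it
    # re-imports the script.
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    main()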
+ context = multiprocessing.get_context("spawn") + + process_kwargs = { + "input_path": input_path, + "output_path": output_path, + "ready_path": ready_path, + "vllm_config": vllm_config, + "executor_class": executor_class, + "usage_context": usage_context, + "should_shutdown": should_shutdown + } + # Run EngineCore busy loop in background process. + proc = context.Process(target=EngineCoreProc.run_engine_core, + kwargs=process_kwargs) + proc.start() + + # Wait for startup + EngineCoreProc.wait_for_startup(proc, ready_path) + return proc + + @staticmethod + def run_engine_core(*args, **kwargs): + """Launch EngineCore busy loop in background process.""" + + try: + engine_core = EngineCoreProc(*args, **kwargs) + engine_core.run_busy_loop() + + except KeyboardInterrupt: + logger.debug("EngineCore interrupted.") + + except BaseException as e: + logger.exception(e) + raise e + + def run_busy_loop(self): + """Core busy loop of the EngineCore.""" + + # Loop until we get a shutdown signal. + while not self.should_shutdown: + # 1) Poll the input queue until there is work to do. + if not self.scheduler.has_unfinished_requests(): + while True: + try: + req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) + self._handle_client_request(req) + break + except queue.Empty: + self._log_stats() + logger.debug("EngineCore busy loop waiting.") + if self.should_shutdown: + return + + # 2) Handle any new client requests (Abort or Add). + while not self.input_queue.empty(): + req = self.input_queue.get_nowait() + self._handle_client_request(req) + + # 3) Step the engine core. + outputs = self.step() + + # 4) Put EngineCoreOutputs into the output queue. + self.output_queue.put_nowait(outputs) + + self._log_stats() + + def _log_stats(self): + """Log basic stats every LOGGING_TIME_S""" + + now = time.time() + + if now - self._last_logging_time > LOGGING_TIME_S: + logger.info( + "RUNNING: %s | WAITING: %s", + len(self.scheduler.running), + len(self.scheduler.waiting), + ) + + self._last_logging_time = now + + def _handle_client_request( + self, request: Union[EngineCoreRequest, List[str]]) -> None: + """Handle EngineCoreRequest or EngineCoreABORT from Client.""" + + if isinstance(request, EngineCoreRequest): + self.add_request(request) + else: + # TODO: make an EngineCoreAbort wrapper + assert isinstance(request, list) + self.abort_requests(request) + + def process_input_socket(self, input_path: str): + """Input socket IO thread.""" + + # Msgpack serialization decoding. + decoder_add_req = PickleEncoder() + decoder_abort_req = PickleEncoder() + + with self.make_socket(input_path, zmq.constants.PULL) as socket: + while True: + # (RequestType, RequestData) + type_frame, data_frame = socket.recv_multipart(copy=False) + request_type = type_frame.buffer + request_data = data_frame.buffer + + # Deserialize the request data. + if request_type == EngineCoreRequestType.ADD.value: + request = decoder_add_req.decode(request_data) + elif request_type == EngineCoreRequestType.ABORT.value: + request = decoder_abort_req.decode(request_data) + else: + raise ValueError(f"Unknown RequestType: {request_type}") + + # Push to input queue for core busy loop. + self.input_queue.put_nowait(request) + + def process_output_socket(self, output_path: str): + """Output socket IO thread.""" + + # Msgpack serialization encoding. + encoder = msgpack.Encoder() + # Reuse send buffer. 
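The "# Reuse send buffer" comment above relies on msgspec's encode_into, which serializes into a caller-owned bytearray instead of allocating a new bytes object per message. A standalone sketch (the Output struct here is invented; the patch encodes EngineCoreOutputs):

from typing import List

import msgspec


class Output(msgspec.Struct):
    request_id: str
    new_token_ids: List[int]


encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Output)

# encode_into() grows the bytearray as needed and reuses its storage on
# subsequent calls, avoiding a fresh allocation per message.
buffer = bytearray()
encoder.encode_into(Output(request_id="req-0", new_token_ids=[1, 2, 3]),
                    buffer)
assert decoder.decode(buffer).request_id == "req-0"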
+ buffer = bytearray() + + with self.make_socket(output_path, zmq.constants.PUSH) as socket: + while True: + engine_core_outputs = self.output_queue.get() + outputs = EngineCoreOutputs(outputs=engine_core_outputs) + encoder.encode_into(outputs, buffer) + socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py new file mode 100644 index 0000000000000..09801e20e16ca --- /dev/null +++ b/vllm/v1/engine/core_client.py @@ -0,0 +1,219 @@ +import multiprocessing +import time +from typing import List, Union + +import msgspec +import zmq +import zmq.asyncio + +from vllm.logger import init_logger +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.serial_utils import PickleEncoder + +logger = init_logger(__name__) + + +class EngineCoreClient: + """ + EngineCoreClient: subclasses handle different methods for pushing + and pulling from the EngineCore for asyncio / multiprocessing. + + Subclasses: + * InprocClient: In process EngineCore (for V0-style LLMEngine use) + * SyncMPClient: ZMQ + background proc EngineCore (for LLM) + * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) + """ + + @staticmethod + def make_client( + *args, + multiprocess_mode: bool, + asyncio_mode: bool, + **kwargs, + ) -> "EngineCoreClient": + + # TODO: support this for debugging purposes. + if asyncio_mode and not multiprocess_mode: + raise NotImplementedError( + "Running EngineCore in asyncio without multiprocessing " + "is not currently supported.") + + if multiprocess_mode and asyncio_mode: + return AsyncMPClient(*args, **kwargs) + + if multiprocess_mode and not asyncio_mode: + return SyncMPClient(*args, **kwargs) + + return InprocClient(*args, **kwargs) + + def shutdown(self): + pass + + def get_output(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + def add_request(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + def abort_requests(self, request_ids: List[str]) -> None: + raise NotImplementedError + + async def get_output_async(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + async def add_request_async(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + async def abort_requests_async(self, request_ids: List[str]) -> None: + raise NotImplementedError + + +class InprocClient(EngineCoreClient): + """ + InprocClient: client for in-process EngineCore. Intended + for use in LLMEngine for V0-style add_request() and step() + EngineCore setup in this process (no busy loop). + + * pushes EngineCoreRequest directly into the EngineCore + * pulls EngineCoreOutputs by stepping the EngineCore + + TODO: support asyncio-mode for debugging. + """ + + def __init__(self, *args, **kwargs): + self.engine_core = EngineCore(*args, **kwargs) + + def get_output(self) -> List[EngineCoreOutput]: + return self.engine_core.step() + + def add_request(self, request: EngineCoreRequest) -> None: + self.engine_core.add_request(request) + + def abort_requests(self, request_ids: List[str]) -> None: + self.engine_core.abort_requests(request_ids) + + +class MPClient(EngineCoreClient): + """ + MPClient: base client for multi-proc EngineCore. 
+ EngineCore runs in a background process busy loop, getting + new EngineCoreRequests and returning EngineCoreOutputs + + * pushes EngineCoreRequests via input_socket + * pulls EngineCoreOutputs via output_socket + + * AsyncMPClient subclass for AsyncLLM usage + * SyncMPClient subclass for LLM usage + """ + + def __init__( + self, + *args, + asyncio_mode: bool, + **kwargs, + ): + # Serialization setup. + self.encoder = PickleEncoder() + self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) + + # ZMQ setup. + self.ctx = (zmq.asyncio.Context() if asyncio_mode else zmq.Context()) + + # Path for IPC. + ready_path = get_open_zmq_ipc_path() + output_path = get_open_zmq_ipc_path() + input_path = get_open_zmq_ipc_path() + + # Get output (EngineCoreOutput) from EngineCore. + self.output_socket = self.ctx.socket(zmq.constants.PULL) + self.output_socket.connect(output_path) + + # Send input (EngineCoreRequest) to EngineCore. + self.input_socket = self.ctx.socket(zmq.constants.PUSH) + self.input_socket.bind(input_path) + + # Start EngineCore in background process. + self.should_shutdown = multiprocessing.Value('b', False, lock=False) + self.proc = EngineCoreProc.make_engine_core_process( + *args, + input_path=input_path, + output_path=output_path, + ready_path=ready_path, + should_shutdown=self.should_shutdown, + **kwargs, + ) + + def shutdown(self): + # Send shutdown signal to background process. + self.should_shutdown = True + + # Shut down the zmq context. + self.ctx.destroy(linger=0) + + # Shutdown the process if needed. + if hasattr(self, "proc") and self.proc.is_alive(): + self.proc.terminate() + + time.sleep(5) + if self.proc.is_alive(): + self.proc.kill() + + def __del__(self): + self.shutdown() + + +class SyncMPClient(MPClient): + """Synchronous client for multi-proc EngineCore.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, asyncio_mode=False, **kwargs) + + def get_output(self) -> List[EngineCoreOutput]: + + (frame, ) = self.output_socket.recv_multipart(copy=False) + engine_core_outputs = self.decoder.decode(frame.buffer).outputs + return engine_core_outputs + + def _send_input(self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, List[str]]) -> None: + + # (RequestType, SerializedRequest) + msg = (request_type.value, self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) + + def add_request(self, request: EngineCoreRequest) -> None: + self._send_input(EngineCoreRequestType.ADD, request) + + def abort_requests(self, request_ids: List[str]) -> None: + self._send_input(EngineCoreRequestType.ABORT, request_ids) + + +class AsyncMPClient(MPClient): + """Asyncio-compatible client for multi-proc EngineCore.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, asyncio_mode=True, **kwargs) + + async def get_output_async(self) -> List[EngineCoreOutput]: + + frames = await self.output_socket.recv_multipart(copy=False) + engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs + + return engine_core_outputs + + async def _send_input( + self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, List[str]]) -> None: + + msg = (request_type.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + + async def add_request_async(self, request: EngineCoreRequest) -> None: + await self._send_input(EngineCoreRequestType.ADD, request) + + async def abort_requests_async(self, request_ids: List[str]) -> None: + if len(request_ids) > 0: + await 
self._send_input(EngineCoreRequestType.ABORT, request_ids) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py new file mode 100644 index 0000000000000..6249d60199a62 --- /dev/null +++ b/vllm/v1/engine/detokenizer.py @@ -0,0 +1,272 @@ +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput + +logger = init_logger(__name__) + + +@dataclass +class IncrementalDetokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # TODO: Probably decouple these + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + assert len(self.token_ids) >= len(self.prompt_token_ids) + return self.token_ids[len(self.prompt_token_ids):] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: DetokenizerRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.skip_special_tokens, + ) + + stops = request.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.skip_special_tokens, + spaces_between_special_tokens=request. + spaces_between_special_tokens, + output_kind=request.output_kind, + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def add_tokens( + self, + new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[str], + ) -> Optional[RequestOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. 
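For the stop_buffer_length computed in from_new_request above, here is the arithmetic in isolation (stop strings chosen arbitrarily): when stop strings are excluded from the stream, the last max(len(s)) - 1 characters are withheld because they could still be the prefix of a match that straddles two detokenization steps.

stop = ["</s>", "Observation:"]
include_stop_str_in_output = False

# Mirrors IncrementalDetokenizer.from_new_request: hold back enough
# characters that a partially generated stop string is never streamed.
if stop and not include_stop_str_in_output:
    stop_buffer_length = max(len(s) for s in stop) - 1
else:
    stop_buffer_length = 0

assert stop_buffer_length == len("Observation:") - 1 == 11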
+ decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + request_output = RequestOutput.new( + self.request_id, + self.prompt, + self.prompt_token_ids, + output_text, + token_ids, + finished, + ) + + if finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = finish_reason + completion_output.stop_reason = stop_reason + + return request_output + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + +class Detokenizer: + + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): + # TODO: once we support LoRA, we should should pass the tokenizer + # here. We currently have two copies (this + in the LLMEngine). 
+        self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                                       tokenizer_mode=tokenizer_mode,
+                                       trust_remote_code=trust_remote_code,
+                                       revision=revision)
+
+        # Request id -> IncrementalDetokenizer
+        self.request_states: Dict[str, IncrementalDetokenizer] = {}
+
+    def is_request_active(self, request_id: str):
+        return request_id in self.request_states
+
+    def get_num_unfinished_requests(self):
+        return len(self.request_states)
+
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+
+    def abort_requests(
+        self,
+        request_ids: Iterable[str],
+    ) -> None:
+        """Remove the request_ids from the Detokenizer."""
+
+        for request_id in request_ids:
+            self.request_states.pop(request_id, None)
+
+    def add_request(
+        self,
+        request: DetokenizerRequest,
+    ):
+        """Add new request to the Detokenizer."""
+
+        assert (request.request_id not in self.request_states)
+
+        request_state = IncrementalDetokenizer.from_new_request(
+            self.tokenizer, request)
+        self.request_states[request.request_id] = request_state
+
+    def step(
+        self, engine_core_outputs: List[EngineCoreOutput]
+    ) -> Tuple[List[RequestOutput], List[str]]:
+        """Update state and return the RequestOutputs to the LLMEngine."""
+
+        request_outputs: List[RequestOutput] = []
+        requests_to_abort: List[str] = []
+        for engine_core_output in engine_core_outputs:
+            request_id = engine_core_output.request_id
+            detokenizer = self.request_states.get(request_id)
+            if detokenizer is None:
+                # Ignore output for already-aborted request.
+                continue
+
+            # Detokenize and update state.
+            request_output = detokenizer.add_tokens(
+                new_token_ids=engine_core_output.new_token_ids,
+                finish_reason=engine_core_output.finish_reason,
+                stop_reason=engine_core_output.stop_reason,
+            )
+
+            if request_output is not None:
+                # Add to RequestOutputs list.
+                request_outputs.append(request_output)
+
+                # Free completed requests.
+                if request_output.finished:
+                    self.request_states.pop(request_id)
+                    if not engine_core_output.finished:
+                        requests_to_abort.append(request_id)
+
+        # Return to EngineClient.
+ return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 072e52bcd686a..4ebfff9584267 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,266 +1,125 @@ -import time -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, - Union) - -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from typing import Dict, List, Mapping, Optional, Type, Union + +from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderLLMInputs, InputRegistry, PromptType) -from vllm.inputs.preprocess import InputPreprocessor +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import CompletionOutput, RequestOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.processor import Processor from vllm.v1.executor.gpu_executor import GPUExecutor -from vllm.v1.request import Request, RequestStatus -from vllm.v1.tokenizer.detokenizer import Detokenizer, DetokenizerInputs -from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) class LLMEngine: + """Legacy LLMEngine for backwards compatibility.""" def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - decoding_config: Optional[DecodingConfig], - observability_config: Optional[ObservabilityConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], + vllm_config: VllmConfig, executor_class: Type[GPUExecutor], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, + multiprocess_mode: bool = False, ) -> None: - # Override the configs for V1. 
- # FIXME - if usage_context == UsageContext.LLM_CLASS: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 2048 - - logger.info( - "Initializing an LLM engine (v%s) with config: " - "model=%r, speculative_config=%r, tokenizer=%r, " - "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, " - "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " - "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " - "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " - "pipeline_parallel_size=%d, " - "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " - "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, observability_config=%r, " - "seed=%d, served_model_name=%s, " - "num_scheduler_steps=%d, enable_prefix_caching=%s, " - "use_async_output_proc=%s, mm_processor_kwargs=%s)", - VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - load_config.load_format, - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, - scheduler_config.num_scheduler_steps, - cache_config.enable_prefix_caching, - model_config.use_async_output_proc, - model_config.mm_processor_kwargs, - ) - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.load_config = load_config - self.decoding_config = decoding_config or DecodingConfig() - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config or ObservabilityConfig( - ) - self.log_stats = log_stats - - assert not self.model_config.skip_tokenizer_init - self.tokenizer = self._init_tokenizer() - if self.tokenizer: - # Ping the tokenizer to ensure liveness if it runs in a - # different process. - self.tokenizer.ping() - self.detokenizer = Detokenizer(self.model_config.tokenizer) - - self.generation_config_fields = _load_generation_config_dict( - model_config) - self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) - self.input_registry = input_registry - self.input_processor = input_registry.create_input_processor( - model_config) - - # Request id -> Request - self.requests: Dict[str, Request] = {} - # NOTE(woosuk): Now that the detokenizer works asynchronously, we need - # to keep track of how many steps each request has been lagged behind - # in terms of detokenization. - # Request id -> how many detokenizer steps the request should wait for. 
- self.num_lagged_steps: Dict[str, int] = {} - # OPTIMIZATION: Cache the request output and update it incrementally. - # This is used to avoid creating a new RequestOutput object every step. - # Request id -> RequestOutput - self.request_outputs: Dict[str, RequestOutput] = {} - - self.model_executor = executor_class( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - speculative_config=speculative_config, - load_config=load_config, - prompt_adapter_config=prompt_adapter_config, - observability_config=self.observability_config, + # TODO: Can we avoid this? + self.model_config = vllm_config.model_config + + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + enable_lora=bool(vllm_config.lora_config)) + self.tokenizer.ping() + + # Processor (convert Inputs --> EngineCoreRequests) + self.processor = Processor(vllm_config.model_config, + vllm_config.lora_config, self.tokenizer, + input_registry, mm_registry) + + # Detokenizer (converts EngineCoreOutputs --> RequestOutput) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, ) - assert self.model_config.task != "embedding" - self._initialize_kv_caches() - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. - self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - - def _initialize_kv_caches(self) -> None: - num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) + self.engine_core = EngineCoreClient.make_client( + vllm_config, + executor_class, + usage_context, + multiprocess_mode=multiprocess_mode, + asyncio_mode=False, ) - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = 0 - self.model_executor.initialize_cache(num_gpu_blocks) - @classmethod def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + enable_multiprocessing: bool = False, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. - engine_config = engine_args.create_engine_config() - executor_class = cls._get_executor_cls(engine_config) - # Create the LLM engine. 
- engine = cls( - **engine_config.to_dict(), - executor_class=executor_class, - log_stats=not engine_args.disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - return engine - - def _init_tokenizer(self) -> BaseTokenizerGroup: - return init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderLLMInputs], - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], - trace_headers: Optional[Mapping[str, str]] = None, - ) -> None: - assert prompt_adapter_request is None - assert trace_headers is None - self._validate_model_inputs(processed_inputs) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - # TODO(woosuk): Support embedding mode. - assert isinstance(params, SamplingParams) - sampling_params = params.clone() - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - - # TODO(woosuk): Check max_logprobs - # TODO(woosuk): Support encoder-decoder models. - req = Request(request_id, processed_inputs, params, eos_token_id, - arrival_time) - self.requests[request_id] = req - self.num_lagged_steps[request_id] = 0 - self.scheduler.add_request(req) + vllm_config = engine_args.create_engine_config() + executor_class = cls._get_executor_cls(vllm_config) + + if VLLM_ENABLE_V1_MULTIPROCESSING: + logger.debug("Enabling multiprocessing for LLMEngine.") + enable_multiprocessing = True + + # Create the LLMEngine. 
+ return cls(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats, + usage_context=usage_context, + stat_loggers=stat_loggers, + multiprocess_mode=enable_multiprocessing) + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor def stop_remote_worker_execution_loop(self) -> None: raise NotImplementedError("TP not implemented yet.") + def get_num_unfinished_requests(self) -> int: + return self.detokenizer.get_num_unfinished_requests() + + def has_unfinished_requests(self) -> bool: + return self.detokenizer.has_unfinished_requests() + + @classmethod + def validate_outputs(cls, outputs, output_type): + return outputs + + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" + + self.engine_core.abort_requests(request_ids) + self.detokenizer.abort_requests(request_ids) + def add_request( self, request_id: str, @@ -272,261 +131,46 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if arrival_time is None: - arrival_time = time.time() - assert priority == 0, "vLLM V1 does not support priority at the moment." - - preprocessed_inputs = self.input_preprocessor.preprocess( - prompt, - request_id=request_id, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - processed_inputs = self.input_processor(preprocessed_inputs) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - trace_headers=trace_headers, - ) - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.scheduler.finish_requests(request_id, - RequestStatus.FINISHED_ABORTED) - self._free_request(request_id) + # 1) Process raw inputs into the request. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return len(self.requests) + # 2) Add the request to Detokenizer. + self.detokenizer.add_request(detokenizer_req) - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return len(self.requests) > 0 + # 3) Add the request to EngineCore. + self.engine_core.add_request(engine_core_req) def step(self) -> List[RequestOutput]: - # NOTE(woosuk): This method may return an empty list when the - # detokenizer is still processing the outputs. This should not be - # considered as the end of the generation process. - # FIXME(woosuk): Currently, the step method is inefficient because it - # creates RequestOutput objects for all running requests, while they - # may not be needed unless the output is streamed to the client. 
- if self.scheduler.has_unfinished_requests(): - scheduler_output = self.scheduler.schedule() - output = self.model_executor.execute_model(scheduler_output) - sampled = self.scheduler.update_from_output( - scheduler_output, output) - self.send_to_detokenizer(sampled) - req_outputs = self.recv_from_detokenizer() - return req_outputs - - def send_to_detokenizer(self, sampled: List[Tuple[Request, int]]) -> None: - inputs = DetokenizerInputs( - req_ids=[], - prompt_token_ids=[], - new_token_ids=[], - skip_special_tokens=[], - spaces_between_special_tokens=[], - free_req_ids=[], # TODO(woosuk): Implement freeing. - ) - for req, num_tokens in sampled: - inputs.req_ids.append(req.request_id) - if len(req.output_token_ids) == num_tokens: - # The request is first detokenized. - inputs.prompt_token_ids.append(req.prompt_token_ids) - else: - # The prompt token ids are already cached in the detokenizer. - inputs.prompt_token_ids.append([]) - inputs.new_token_ids.append(req.output_token_ids[-num_tokens:]) - inputs.skip_special_tokens.append( - req.sampling_params.skip_special_tokens) - inputs.spaces_between_special_tokens.append( - req.sampling_params.spaces_between_special_tokens) - - # Update the number of lagged steps. - self.num_lagged_steps[req.request_id] += 1 - self.detokenizer.send(inputs) - - def recv_from_detokenizer(self) -> List[RequestOutput]: - detokenizer_output = self.detokenizer.recv() - if detokenizer_output is None: - return [] - - req_outputs: List[RequestOutput] = [] - num_reqs = len(detokenizer_output.req_ids) - for i in range(num_reqs): - req_id = detokenizer_output.req_ids[i] - if req_id not in self.requests: - # The request has been aborted while the detokenizer was - # processing the outputs. - continue - - req = self.requests[req_id] - req.output_text += detokenizer_output.detokenized_texts[i] - - self.num_lagged_steps[req_id] -= 1 - finished = (self.num_lagged_steps[req_id] == 0 - and req.is_finished()) - req_output = self._make_request_output( - req, detokenizer_output.num_output_token_ids[i], - detokenizer_output.detokenized_texts[i], finished) - req_outputs.append(req_output) - - if finished: - self._free_request(req_id) - return req_outputs - - def terminate_detokenizer(self) -> None: - self.detokenizer.terminate() - - def _make_request_output( - self, - request: Request, - num_output_tokens: int, - new_output_text: str, - finished: bool, - ) -> RequestOutput: - req_output = self.request_outputs.get(request.request_id) - if req_output is None: - # TODO: Support `n` > 1. 
- completion_output = CompletionOutput( - index=0, - text="", - token_ids=[], - cumulative_logprob=None, - logprobs=None, # TODO - finish_reason=None, - stop_reason=None, - lora_request=None, - ) - req_output = RequestOutput( - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - prompt_logprobs=None, # TODO - outputs=[completion_output], - finished=False, - metrics=None, - lora_request=None, - encoder_prompt=None, - encoder_prompt_token_ids=None, - ) - self.request_outputs[request.request_id] = req_output - - completion_output = req_output.outputs[0] - if request.sampling_params.output_kind == RequestOutputKind.CUMULATIVE: - completion_output.text += new_output_text - completion_output.token_ids = ( - request.output_token_ids[:num_output_tokens]) - elif request.sampling_params.output_kind == RequestOutputKind.DELTA: - completion_output.text = new_output_text - num_prev_tokens = len(completion_output.token_ids) - completion_output.token_ids = request.output_token_ids[ - num_prev_tokens:num_output_tokens] - elif (request.sampling_params.output_kind == - RequestOutputKind.FINAL_ONLY): - if finished: - completion_output.text = request.output_text - completion_output.token_ids = request.output_token_ids - else: - completion_output.text = "" - completion_output.token_ids = [] - - if finished: - completion_output.finish_reason = request.get_finished_reason() - completion_output.stop_reason = request.stop_reason - req_output.finished = finished - return req_output - - def _free_request(self, request_id: str) -> None: - self.requests.pop(request_id, None) - self.num_lagged_steps.pop(request_id, None) - self.request_outputs.pop(request_id, None) - - def check_health(self) -> None: - if self.tokenizer: - self.tokenizer.check_health() - self.model_executor.check_health() - - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderLLMInputs]): - prompt_ids = inputs.get("prompt_token_ids") - if prompt_ids is None or len(prompt_ids) == 0: - raise ValueError("Prompt cannot be empty") - - if self.model_config.is_multimodal_model: - max_prompt_len = self.model_config.max_model_len - - if len(prompt_ids) > max_prompt_len: - raise ValueError( - f"The prompt (total length {len(prompt_ids)}) is too long " - f"to fit into the model (context length {max_prompt_len}). " - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens plus multimodal tokens. For image " - "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") - - @classmethod - def validate_outputs(cls, outputs, output_type): - return outputs - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - def get_parallel_config(self) -> ParallelConfig: - """Gets the parallel configuration.""" - return self.parallel_config + # 1) Get EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.get_output() - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config + # 2) Detokenizer the EngineCoreOutput. + request_outputs, requests_to_abort = self.detokenizer.step( + engine_core_outputs) - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config + # 3) Abort requests that finished due to stopping criteria. 
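For reference, the V0-style driving loop that this re-implemented LLMEngine keeps supporting looks roughly like this (model, prompt, and sampling values are placeholders; most users would go through the LLM class instead):

from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.llm_engine import LLMEngine


def main() -> None:
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    engine.add_request(request_id="req-0",
                       prompt="Hello, my name is",
                       params=SamplingParams(max_tokens=16))

    # step() returns RequestOutputs as they become available; requests that
    # hit a stop string are aborted in EngineCore via abort_request().
    while engine.has_unfinished_requests():
        for output in engine.step():
            if output.finished:
                print(output.outputs[0].text)


if __name__ == "__main__":
    main()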
+ if requests_to_abort: + self.abort_request(requests_to_abort) - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - @classmethod - def _get_executor_cls(cls, engine_config: EngineConfig): - return GPUExecutor + return request_outputs - def is_tracing_enabled(self) -> bool: - return False + # TODO(rob): Can we get rid of these? - def do_log_stats(self, *args, **kwargs) -> None: + def get_model_config(self): pass - def is_encoder_decoder_model(self) -> bool: - return False - - def start_profile(self) -> None: + def is_encoder_decoder_model(self): pass - def stop_profile(self) -> None: + def start_profile(self): pass - def get_tokenizer_group(self, *args, **kwargs): - return self.tokenizer - - -def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: - config = try_get_generation_config( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.revision, - ) - - if config is None: - return {} + def stop_profile(self): + pass - return config.to_diff_dict() + def get_tokenizer_group(self, group_type): + pass diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py new file mode 100644 index 0000000000000..594c973678235 --- /dev/null +++ b/vllm/v1/engine/mm_input_mapper.py @@ -0,0 +1,39 @@ +from typing import Any, Dict, List, Optional + +from vllm.config import ModelConfig +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalKwargs, MultiModalRegistry) + + +class MMInputMapper: + + def __init__( + self, + model_config: ModelConfig, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + self.mm_registry = mm_registry + self.multi_modal_input_mapper = mm_registry.create_input_mapper( + model_config) + self.mm_registry.init_mm_limits_per_prompt(model_config) + + def process_inputs( + self, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> List[MultiModalKwargs]: + image_inputs = mm_data["image"] + if not isinstance(image_inputs, list): + image_inputs = [image_inputs] + + # Process each image input separately so that later we can schedule + # them in a fine-grained manner. 
+ mm_inputs: List[MultiModalKwargs] = [] + num_images = len(image_inputs) + for i in range(num_images): + mm_input = self.multi_modal_input_mapper( + {"image": [image_inputs[i]]}, + mm_processor_kwargs=mm_processor_kwargs, + ) + mm_inputs.append(mm_input) + return mm_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py new file mode 100644 index 0000000000000..5c1577190c75a --- /dev/null +++ b/vllm/v1/engine/processor.py @@ -0,0 +1,168 @@ +import time +from typing import Any, Dict, Mapping, Optional, Tuple, Union + +from vllm.config import LoRAConfig, ModelConfig +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs +from vllm.inputs.preprocess import InputPreprocessor +from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest + + +class Processor: + + def __init__( + self, + model_config: ModelConfig, + lora_config: Optional[LoRAConfig], + tokenizer: BaseTokenizerGroup, + input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + + self.model_config = model_config + self.lora_config = lora_config + self.tokenizer = tokenizer + + self.generation_config_fields = _load_generation_config_dict( + model_config) + self.input_preprocessor = InputPreprocessor(model_config, + self.tokenizer, + mm_registry) + self.input_processor = input_registry.create_input_processor( + model_config) + + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. + # This ideally should releases the GIL, so we should not block the + # asyncio loop while this is running. + def process_inputs( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: float, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + + # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Check max_logprobs + # TODO(woosuk): Support encoder-decoder models. + + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.time() + assert priority == 0, "vLLM V1 does not support priority at the moment." + assert trace_headers is None, "vLLM V1 does not support tracing yet." + + # Process inputs. 
+ preprocessed_inputs = self.input_preprocessor.preprocess( + prompt, + request_id=request_id, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + processed_inputs = self.input_processor(preprocessed_inputs) + self._validate_model_inputs(processed_inputs) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = SingletonInputsAdapter( + processed_inputs["decoder"]) + encoder_inputs = SingletonInputsAdapter( + processed_inputs["encoder"]) + else: + decoder_inputs = SingletonInputsAdapter(processed_inputs) + encoder_inputs = None + + # TODO: Impl encoder-decoder + if encoder_inputs is not None: + raise NotImplementedError + + assert isinstance(params, SamplingParams) + # TODO: can we avoid cloning here in multiproc case + sampling_params = params.clone() + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + + # Make Request for Detokenizer. + detokenizer_request = DetokenizerRequest( + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + sampling_params.skip_special_tokens, + sampling_params.spaces_between_special_tokens, + sampling_params.output_kind, + sampling_params.stop, + sampling_params.include_stop_str_in_output, + ) + + # Make Request for EngineCore. + engine_core_request = EngineCoreRequest( + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + decoder_inputs.multi_modal_data, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.mm_processor_kwargs, + sampling_params, + eos_token_id, + arrival_time, + lora_request, + ) + + return detokenizer_request, engine_core_request + + def _validate_model_inputs(self, inputs: ProcessorInputs): + if is_encoder_decoder_inputs(inputs): + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] + else: + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + + if prompt_ids is None or len(prompt_ids) == 0: + raise ValueError("Prompt cannot be empty") + + if self.model_config.is_multimodal_model: + max_prompt_len = self.model_config.max_model_len + + if len(prompt_ids) > max_prompt_len: + raise ValueError( + f"The prompt (total length {len(prompt_ids)}) is too long " + f"to fit into the model (context length {max_prompt_len}). " + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens plus multimodal tokens. 
For image " + "inputs, the number of image tokens depends on the number " + "of images, and possibly their aspect ratios as well.") + + # TODO: Find out how many placeholder tokens are there so we can + # check that chunked prefill does not truncate them + # max_batch_len = self.scheduler_config.max_num_batched_tokens + + +def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: + config = try_get_generation_config( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.revision, + ) + + if config is None: + return {} + + return config.to_diff_dict() diff --git a/vllm/v1/executor/gpu_executor.py b/vllm/v1/executor/gpu_executor.py index c780c7031c3d6..f71fa16b16e27 100644 --- a/vllm/v1/executor/gpu_executor.py +++ b/vllm/v1/executor/gpu_executor.py @@ -1,10 +1,7 @@ import os from typing import Optional, Tuple -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import get_distributed_init_method, get_ip, get_open_port from vllm.v1.outputs import ModelRunnerOutput @@ -15,29 +12,18 @@ class GPUExecutor: - def __init__( - self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], - observability_config: Optional[ObservabilityConfig], - ) -> None: - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config + def __init__(self, vllm_config: VllmConfig) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config self.worker = self._create_worker() self.worker.initialize() @@ -56,19 +42,10 @@ def _create_worker( distributed_init_method = get_distributed_init_method( get_ip(), get_open_port()) return Worker( - model_config=self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - device_config=self.device_config, - cache_config=self.cache_config, - load_config=self.load_config, + vllm_config=self.vllm_config, local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - lora_config=self.lora_config, - speculative_config=self.speculative_config, - prompt_adapter_config=self.prompt_adapter_config, - observability_config=self.observability_config, ) def determine_num_available_blocks(self) -> Tuple[int, int]: diff --git 
a/vllm/v1/request.py b/vllm/v1/request.py index be7d4d165d280..51fb4003e5fe0 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,12 +1,13 @@ import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union +from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics - -if TYPE_CHECKING: - from vllm.inputs import DecoderOnlyInputs +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.utils import ConstantList class Request: @@ -14,14 +15,14 @@ class Request: def __init__( self, request_id: str, - inputs: "DecoderOnlyInputs", + inputs: DecoderOnlyInputs, sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. self.eos_token_id = eos_token_id @@ -37,20 +38,70 @@ def __init__( assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - self.prompt = inputs.get("prompt") - self.prompt_token_ids = inputs["prompt_token_ids"] + self.prompt = self.inputs.prompt + self.prompt_token_ids = self.inputs.prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) - self.output_token_ids: List[int] = [] - self.output_text = "" + self._output_token_ids: List[int] = [] + self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 + # Raw multimodal data before the mm input mapper (e.g., PIL images). + self.mm_data = self.inputs.multi_modal_data + self.mm_processor_kwargs = self.inputs.mm_processor_kwargs + mm_positions = self.inputs.multi_modal_placeholders + if mm_positions: + # FIXME(woosuk): Support other modalities. + self.mm_positions = mm_positions.get("image", []) + else: + self.mm_positions = [] + # Output of the mm input mapper (e.g., image tensors). + self.mm_inputs: List[MultiModalKwargs] = [] + + @classmethod + def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + return cls( + request_id=request.request_id, + inputs=token_inputs( + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + multi_modal_data=request.mm_data, + multi_modal_placeholders=request.mm_placeholders, + mm_processor_kwargs=request.mm_processor_kwargs, + ), + sampling_params=request.sampling_params, + eos_token_id=request.eos_token_id, + arrival_time=request.arrival_time, + lora_request=request.lora_request, + ) + + @property + def output_token_ids(self) -> ConstantList[int]: + # Prevent directly appending to the output_token_ids since + # all_token_ids should also be updated simultaneously. 
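+        # Illustrative usage: the returned view is read-only, so callers go
+        # through append_output_token_ids(), which keeps _output_token_ids
+        # and _all_token_ids in sync, e.g.:
+        #   request.append_output_token_ids([7, 8])
+        #   request.output_token_ids[-1]        # -> 8
+        #   request.output_token_ids.append(9)  # raises (ConstantList)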
+ return ConstantList(self._output_token_ids) + + @property + def all_token_ids(self) -> ConstantList[int]: + # Prevent directly appending to the all_token_ids since + # output_token_ids should also be updated simultaneously + return ConstantList(self._all_token_ids) + + def append_output_token_ids( + self, + token_ids: Union[int, List[int]], + ) -> None: + if isinstance(token_ids, int): + token_ids = [token_ids] + self._output_token_ids.extend(token_ids) + self._all_token_ids.extend(token_ids) + @property def num_tokens(self) -> int: - return self.num_prompt_tokens + len(self.output_token_ids) + return len(self._all_token_ids) @property def num_output_tokens(self) -> int: - return len(self.output_token_ids) + return len(self._output_token_ids) def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) @@ -58,9 +109,21 @@ def is_finished(self) -> bool: def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) + def has_encoder_inputs(self) -> bool: + return len(self.mm_data) > 0 + + @property + def num_encoder_inputs(self) -> int: + return len(self.mm_positions) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self.mm_positions) + num_tokens = self.mm_positions[input_id]["length"] + return num_tokens + class RequestStatus(enum.IntEnum): - """Status of a sequence.""" + """Status of a request.""" WAITING = 0 RUNNING = 1 PREEMPTED = 2 @@ -81,7 +144,7 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # Mapping of finished statuses to their finish reasons. -# NOTE: The ignored sequences are the sequences whose prompt lengths +# NOTE: The ignored requests are the requests whose prompt lengths # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index 28614377b27b9..9ef36f2e6b212 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import List, Optional +from typing import Dict import torch @@ -16,7 +16,6 @@ class SamplingMetadata: no_top_p: bool no_top_k: bool - generators: List[Optional[torch.Generator]] - no_generator: bool + generators: Dict[int, torch.Generator] max_num_logprobs: int diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 157c4dd6d771e..927f274541c4d 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -1,5 +1,5 @@ """A layer that samples the next tokens from the model's outputs.""" -from typing import List, Optional +from typing import Dict import torch import torch.nn as nn @@ -84,22 +84,21 @@ def greedy_sample(self, probs: torch.Tensor) -> torch.Tensor: def random_sample( self, probs: torch.Tensor, - generators: List[Optional[torch.Generator]], - no_generator: bool, + generators: Dict[int, torch.Generator], ) -> torch.Tensor: q = torch.empty_like(probs) # NOTE(woosuk): To batch-process the requests without their own seeds, # which is the common case, we first assume that every request does # not have its own seed. Then, we overwrite the values for the requests # that have their own seeds. 
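        # Sketch of why this samples from probs: if q_i ~ Exp(1) are
        # independent, then argmax_i probs_i / q_i == argmin_i q_i / probs_i,
        # and q_i / probs_i ~ Exp(rate=probs_i), so index i wins the race with
        # probability probs_i (probs sum to 1). The batched argmax below
        # therefore draws exactly from the categorical distribution probs.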
- q.exponential_() - if not no_generator: - assert len(generators) == probs.shape[0] + if len(generators) != probs.shape[0]: + # This might still be done here unnecessarily if there are greedies + q.exponential_() + if generators: # TODO(woosuk): This can be slow because we handle each request # one by one. Optimize this. - for i, generator in enumerate(generators): - if generator is not None: - q[i].exponential_(generator=generator) + for i, generator in generators.items(): + q[i].exponential_(generator=generator) return probs.div_(q).argmax(dim=-1).view(-1) def sample( @@ -112,13 +111,11 @@ def sample( if sampling_metadata.all_greedy: return self.greedy_sample(probs) if sampling_metadata.all_random: - return self.random_sample(probs, sampling_metadata.generators, - sampling_metadata.no_generator) + return self.random_sample(probs, sampling_metadata.generators) greedy_sampled = self.greedy_sample(probs) random_sampled = self.random_sample(probs, - sampling_metadata.generators, - sampling_metadata.no_generator) + sampling_metadata.generators) sampled = torch.where( sampling_metadata.temperature < _SAMPLING_EPS, greedy_sampled, diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py new file mode 100644 index 0000000000000..b1cd5c11834f8 --- /dev/null +++ b/vllm/v1/serial_utils.py @@ -0,0 +1,10 @@ +import pickle + + +class PickleEncoder: + + def encode(self, obj): + return pickle.dumps(obj) + + def decode(self, data): + return pickle.loads(data) diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py deleted file mode 100644 index 4bbcf4717981e..0000000000000 --- a/vllm/v1/tokenizer/detokenizer.py +++ /dev/null @@ -1,215 +0,0 @@ -import multiprocessing -from dataclasses import dataclass -from typing import Dict, List, Optional - -import msgspec -import zmq -from msgspec import msgpack - -from vllm.transformers_utils.detokenizer_utils import ( - convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_port - - -class DetokenizerInputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - # A request's prompt token ids is sent to the detokenizer only when - # the request is first detokenized. Otherwise, an empty list is sent. - prompt_token_ids: List[List[int]] - new_token_ids: List[List[int]] - skip_special_tokens: List[bool] - spaces_between_special_tokens: List[bool] - - # [num_free_reqs] - free_req_ids: List[str] - - -class DetokenizerOutputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - detokenized_texts: List[str] - # NOTE(woosuk): The number of the output token ids of each request - # at the time of detokenization. The detokenizer returns this to the engine - # because the request state (including the output token ids) is - # asynchronously updated in the engine, while RequestOutput requires the - # output token ids to be consistent with the detokenized text. - num_output_token_ids: List[int] - - -class Detokenizer: - - def __init__(self, tokenizer_name: str): - # FIXME(woosuk): Currently, the detokenizer is just a hacky prototype. - # For example, it does not terminate properly. We need to improve this. 
- self.push_port = get_open_port() - self.pull_port = get_open_port() - self.detokenizer = DetokenizerProc(tokenizer_name, self.push_port, - self.pull_port) - self.detokenizer.start() - - self.zmq_context = zmq.Context() - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.connect(f"tcp://localhost:{self.push_port}") - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.connect(f"tcp://localhost:{self.pull_port}") - self.poller = zmq.Poller() - self.poller.register(self.pull_socket, zmq.POLLIN) - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerOutputs) - - def send(self, inputs: DetokenizerInputs) -> None: - self.push_socket.send(self.msgpack_encoder.encode(inputs), - flags=zmq.NOBLOCK) - - def recv(self) -> Optional[DetokenizerOutputs]: - socks = dict(self.poller.poll(timeout=0)) - if self.pull_socket in socks and socks[self.pull_socket] == zmq.POLLIN: - msg = self.pull_socket.recv() - return self.msgpack_decoder.decode(msg) - return None - - def terminate(self) -> None: - self.push_socket.send(b"", flags=zmq.NOBLOCK) - self.detokenizer.join() - - -class DetokenizerProc(multiprocessing.Process): - - def __init__( - self, - tokenizer_name: str, - pull_port: int, - push_port: int, - ): - super().__init__() - self.tokenizer_name = tokenizer_name - # NOTE: The pull_port of the detokenizer should be the same as the - # push_port of the engine. Vice versa. - self.pull_port = pull_port - self.push_port = push_port - - def run(self): - # Initialize these objects after the process is forked since they are - # not picklable. - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerInputs) - self.tokenizer = get_tokenizer(self.tokenizer_name) - # req_id -> RequestState - self.request_states: Dict[str, RequestState] = {} - - self.zmq_context = zmq.Context() - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.bind(f"tcp://*:{self.pull_port}") - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.bind(f"tcp://*:{self.push_port}") - - while True: - message = self.pull_socket.recv() - if message == b"": - # Terminate signal. - break - inputs = self.msgpack_decoder.decode(message) - - for req_id in inputs.free_req_ids: - self.free(req_id) - - detokenized_texts: List[str] = [] - num_output_token_ids: List[int] = [] - num_reqs = len(inputs.req_ids) - for i in range(num_reqs): - req_id = inputs.req_ids[i] - if req_id not in self.request_states: - self.add_request( - request_id=req_id, - prompt_token_ids=inputs.prompt_token_ids[i], - skip_special_tokens=inputs.skip_special_tokens[i], - spaces_between_special_tokens=inputs. 
- spaces_between_special_tokens[i], - ) - new_str = self.detokenize(req_id, inputs.new_token_ids[i]) - detokenized_texts.append(new_str) - req_state = self.request_states[req_id] - num_output_token_ids.append( - len(req_state.token_ids) - req_state.num_prompt_tokens) - - detokenized = DetokenizerOutputs( - req_ids=inputs.req_ids, - detokenized_texts=detokenized_texts, - num_output_token_ids=num_output_token_ids, - ) - self.push_socket.send(self.msgpack_encoder.encode(detokenized), - flags=zmq.NOBLOCK) - - def add_request( - self, - request_id: str, - prompt_token_ids: List[int], - skip_special_tokens: bool, - spaces_between_special_tokens: bool, - ) -> None: - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=self.tokenizer, - prompt_ids=prompt_token_ids, - skip_special_tokens=skip_special_tokens, - ) - self.request_states[request_id] = RequestState( - req_id=request_id, - token_ids=prompt_token_ids, - tokens=tokens, - num_prompt_tokens=len(prompt_token_ids), - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - - def free(self, request_id: str) -> None: - del self.request_states[request_id] - - def detokenize(self, request_id: str, new_token_ids: List[int]) -> str: - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - req_state = self.request_states[request_id] - decoded_text = "" - for new_token_id in new_token_ids: - req_state.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=req_state.token_ids, - prev_tokens=req_state.tokens, - prefix_offset=req_state.prefix_offset, - read_offset=req_state.read_offset, - skip_special_tokens=req_state.skip_special_tokens, - spaces_between_special_tokens=req_state. - spaces_between_special_tokens, - ) - - req_state.tokens.extend(new_tokens) - req_state.prefix_offset = prefix_offset - req_state.read_offset = read_offset - req_state.output_text += new_decoded_token_text - decoded_text += new_decoded_token_text - return decoded_text - - -@dataclass -class RequestState: - - req_id: str - - token_ids: List[int] - tokens: List[str] - num_prompt_tokens: int - - prefix_offset: int - read_offset: int - - skip_special_tokens: bool - spaces_between_special_tokens: bool - - output_text: str = "" diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py new file mode 100644 index 0000000000000..4b26749712e32 --- /dev/null +++ b/vllm/v1/utils.py @@ -0,0 +1,64 @@ +from typing import Generic, List, TypeVar, overload + +T = TypeVar("T") + + +class ConstantList(Generic[T]): + + def __init__(self, x: List[T]) -> None: + self._x = x + + def append(self, item): + raise Exception("Cannot append to a constant list") + + def extend(self, item): + raise Exception("Cannot extend a constant list") + + def insert(self, item): + raise Exception("Cannot insert into a constant list") + + def pop(self, item): + raise Exception("Cannot pop from a constant list") + + def remove(self, item): + raise Exception("Cannot remove from a constant list") + + def clear(self): + raise Exception("Cannot clear a constant list") + + def index(self, item): + return self._x.index(item) + + @overload + def __getitem__(self, item) -> T: + ... + + @overload + def __getitem__(self, s: slice, /) -> List[T]: + ... 
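+    # Illustrative semantics: cl = ConstantList([1, 2, 3]) supports reads
+    # only; cl[0] -> 1 and cl[1:] -> [2, 3] (a plain list copy), while any
+    # mutating call such as cl.append(4) or cl[0] = 9 raises.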
+ + def __getitem__(self, item): + return self._x[item] + + @overload + def __setitem__(self, item, value): + ... + + @overload + def __setitem__(self, s: slice, value, /): + ... + + def __setitem__(self, item, value): + raise Exception("Cannot set item in a constant list") + + def __delitem__(self, item): + raise Exception("Cannot delete item from a constant list") + + def __iter__(self): + return iter(self._x) + + def __contains__(self, item): + return item in self._x + + def __len__(self): + return len(self._x) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e84645ac7a4ae..eebd1de96537f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,19 +1,24 @@ +import os +import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set -from unittest.mock import patch +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch import torch.distributed import torch.nn as nn -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm import envs +from vllm.compilation.compile_context import set_compile_context +from vllm.compilation.config import CompilationConfig +from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalKwargs +from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, is_pin_memory_available) @@ -21,9 +26,9 @@ FlashAttentionMetadata) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.sample.sampler import Sampler if TYPE_CHECKING: + from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -33,26 +38,25 @@ class GPUModelRunner: def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, + vllm_config: VllmConfig, + input_registry: InputRegistry = INPUT_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config - + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + 
self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + + model_config = self.model_config + cache_config = self.cache_config + scheduler_config = self.scheduler_config + parallel_config = self.parallel_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() self.dtype = self.model_config.dtype @@ -73,10 +77,16 @@ def __init__( parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + self.input_registry = input_registry # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} # Request states. self.requests: Dict[str, CachedRequestState] = {} @@ -89,11 +99,33 @@ def __init__( pin_memory=self.pin_memory, ) + self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL + == CompilationLevel.PIECEWISE + and not self.model_config.enforce_eager) + # TODO(woosuk): Provide an option to tune the max cudagraph batch size. + self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] + self.positions = torch.zeros(self.max_num_tokens, + dtype=torch.int64, + device=self.device) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) # Remove the requests from the persistent batch. stopped_req_ids = set().union( @@ -131,13 +163,21 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Add new requests to the cached states. 
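        # For requests that carry an explicit seed (SamplingType.RANDOM_SEED),
        # a dedicated torch.Generator is created and seeded below; it is later
        # exposed to the sampler per request index through
        # SamplingMetadata.generators.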
for req_data in scheduler_output.scheduled_new_reqs: req_id = req_data.req_id + sampling_params = req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, - multi_modal_data=req_data.multi_modal_data, - sampling_params=req_data.sampling_params, - generator=None, # TODO + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, block_ids=req_data.block_ids, num_computed_tokens=req_data.num_computed_tokens, output_token_ids=[], @@ -265,11 +305,13 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): np.cumsum(seq_lens, out=seq_start_loc_np[1:]) input_ids = input_ids.to(self.device, non_blocking=True) - positions = positions.to(self.device, non_blocking=True).long() + self.positions[:total_num_scheduled_tokens].copy_(positions, + non_blocking=True) query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() attn_metadata = FlashAttentionMetadata( + num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, query_start_loc=query_start_loc, max_seq_len=max_seq_len, @@ -283,7 +325,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return input_ids, positions, attn_metadata, logits_indices + return input_ids, attn_metadata, logits_indices def _prepare_sampling( self, @@ -300,22 +342,124 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # in case when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] in case when the feature size is + # dynamic depending on input images. + encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + + # Cache the encoder outputs. 
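+        # The cache is keyed as encoder_cache[req_id][input_id] -> tensor.
+        # Entries are released in _update_states(), either when the request
+        # finishes or when the scheduler marks the encoder input as no longer
+        # needed via scheduler_output.free_encoder_input_ids.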
+ for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - inputs = self._prepare_inputs(scheduler_output) - input_ids, positions, attn_metadata, logits_indices = inputs + # Run the encoder. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + + # Prepare the decoder inputs. + input_ids, attn_metadata, logits_indices = self._prepare_inputs( + scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + if (self.use_cuda_graph + and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): + # Use piecewise CUDA graphs. + # Add padding to the batch size. + num_input_tokens = self._get_padded_batch_size( + num_scheduled_tokens) + else: + # Eager mode. + num_input_tokens = num_scheduled_tokens + + # Get the inputs embeds. + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), + # always use embeddings (rather than token ids) as input to the model. + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + + # Run the decoder. + # Use persistent buffers for CUDA graphs. 
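+        # The scheduled token count is padded up to the nearest captured graph
+        # size (see self.cudagraph_batch_sizes / _get_padded_batch_size), and
+        # the inputs are staged into the preallocated self.positions and
+        # self.inputs_embeds buffers, so graph replay always sees the same
+        # tensors with one of the captured shapes.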
with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=input_ids, - positions=positions, + input_ids=None, + positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, - attn_metadata=attn_metadata, + attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_input_tokens], ) + hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(hidden_states, None) @@ -345,11 +489,10 @@ def execute_model( else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. - generator = self.input_batch.generators[i] + generator = self.input_batch.generators.get(i) if generator is not None: - offset = generator.get_offset() - generator = generator.set_offset(offset - 1) - self.input_batch.generators[i] = generator + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) if sampler_output.logprob_token_ids is None: logprob_token_ids = None @@ -369,42 +512,88 @@ def execute_model( return model_runner_output def load_model(self) -> None: + if self.use_cuda_graph: + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + os.environ["VLLM_CUSTOM_OPS"] = "none" + set_compilation_config( + CompilationConfig( + use_cudagraph=True, + non_cudagraph_ops=["vllm.unified_v1_flash_attention"], + use_inductor=True, + enable_fusion=False, + )) + logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 - with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: - input_ids = torch.zeros(num_tokens, - dtype=torch.int32, - device=self.device) - positions = torch.zeros(num_tokens, - dtype=torch.long, - device=self.device) - kv_caches = [None for _ in range(self.num_attn_layers)] - model(input_ids, positions, kv_caches, attn_metadata=None) - return + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value `None`. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + dummy_kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(self.num_attn_layers) + ] + with set_forward_context(None): # noqa: SIM117 + with set_compile_context(self.cudagraph_batch_sizes): + # Trigger compilation for general shape. 
+ model(input_ids=None, + positions=self.positions, + kv_caches=dummy_kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds) @torch.inference_mode() def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() - return @torch.inference_mode() def capture_model(self) -> None: - # TODO: Implement CUDA graph support. - return + if not self.use_cuda_graph: + logger.warning( + "Skipping CUDA graph capture. Please set " + "VLLM_TORCH_COMPILE_LEVEL=%d to use CUDA graphs.", + CompilationLevel.PIECEWISE) + return + + start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] + + with set_forward_context(None): + # Trigger CUDA graph capture for specific shapes. + # Capture the large shapes first so that the smaller shapes + # can reuse the memory pool allocated for the large shapes. + for num_tokens in reversed(self.cudagraph_batch_sizes): + self.model( + input_ids=None, + positions=self.positions[:num_tokens], + kv_caches=self.kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens], + ) + + end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] + elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory + # This usually takes 5~20 seconds. + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / (1 << 30)) def initialize_kv_cache(self, num_blocks: int) -> None: assert len(self.kv_caches) == 0 @@ -416,6 +605,13 @@ def initialize_kv_cache(self, num_blocks: int) -> None: dtype=self.kv_cache_dtype, device=self.device)) + def _get_padded_batch_size(self, batch_size: int) -> Optional[int]: + # TODO: Optimize this? + for size in self.cudagraph_batch_sizes: + if batch_size <= size: + return size + return None + @dataclass class CachedRequestState: @@ -423,7 +619,8 @@ class CachedRequestState: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional["MultiModalDataDict"] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] @@ -503,8 +700,8 @@ def __init__( self.top_k_cpu = self.top_k_cpu_tensor.numpy() self.top_k_reqs: Set[str] = set() - self.generators: List[Optional[torch.Generator]] = [None - ] * max_num_reqs + # req_index -> generator + self.generators: Dict[int, torch.Generator] = {} self.num_logprobs: Dict[str, int] = {} self.prompt_logprob_reqs: Set[str] = set() @@ -518,8 +715,9 @@ def add_request( req_index = self.num_reqs assert req_index < self.max_num_reqs - self.req_ids[req_index] = request.req_id - self.req_id_to_index[request.req_id] = req_index + req_id = request.req_id + self.req_ids[req_index] = req_id + self.req_id_to_index[req_id] = req_index # Copy the prompt token ids and output token ids. num_prompt_tokens = len(request.prompt_token_ids) @@ -537,27 +735,24 @@ def add_request( sampling_params = request.sampling_params self.temperature_cpu[req_index] = sampling_params.temperature if sampling_params.sampling_type == SamplingType.GREEDY: - self.greedy_reqs.add(req_index) - elif sampling_params.sampling_type == SamplingType.RANDOM: - self.random_reqs.add(req_index) - elif sampling_params.sampling_type == SamplingType.RANDOM_SEED: - # TODO(woosuk): Support per-request random seed. 
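+            # Per-request seeds are supported now: the seeded torch.Generator
+            # created by the model runner is tracked in self.generators
+            # (req_index -> Generator) and consumed by Sampler.random_sample().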
- raise NotImplementedError("Per-request seed is not supported yet.") + self.greedy_reqs.add(req_id) + else: + self.random_reqs.add(req_id) self.top_p_cpu[req_index] = sampling_params.top_p if sampling_params.top_p < 1: - self.top_p_reqs.add(req_index) + self.top_p_reqs.add(req_id) self.top_k_cpu[req_index] = sampling_params.top_k if sampling_params.top_k > 0: - self.top_k_reqs.add(req_index) + self.top_k_reqs.add(req_id) self.generators[req_index] = request.generator num_logprobs = sampling_params.logprobs if num_logprobs is not None and num_logprobs > 0: - self.num_logprobs[request.req_id] = num_logprobs + self.num_logprobs[req_id] = num_logprobs if sampling_params.prompt_logprobs: - self.prompt_logprob_reqs.add(req_index) + self.prompt_logprob_reqs.add(req_id) def remove_request(self, req_id: str) -> Optional[int]: req_index = self.req_id_to_index.pop(req_id, None) @@ -569,7 +764,7 @@ def remove_request(self, req_id: str) -> Optional[int]: self.random_reqs.discard(req_id) self.top_p_reqs.discard(req_id) self.top_k_reqs.discard(req_id) - self.generators[req_index] = None + self.generators.pop(req_index, None) self.num_logprobs.pop(req_id, None) self.prompt_logprob_reqs.discard(req_id) return req_index @@ -621,7 +816,9 @@ def condense(self, empty_req_indices: List[int]) -> None: last_req_index] self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] self.top_k_cpu[empty_index] = self.top_k_cpu[last_req_index] - self.generators[empty_index] = self.generators[last_req_index] + generator = self.generators.pop(last_req_index, None) + if generator is not None: + self.generators[empty_index] = generator # Decrement last_req_index since it is now empty. last_req_index -= 1 @@ -645,8 +842,7 @@ def make_sampling_metadata( top_k=self.top_k[:self.num_reqs], no_top_p=self.no_top_p, no_top_k=self.no_top_k, - generators=self.generators[:self.num_reqs], - no_generator=self.no_generator, + generators=self.generators, max_num_logprobs=self.max_num_logprobs, ) @@ -670,16 +866,9 @@ def no_top_p(self) -> bool: def no_top_k(self) -> bool: return len(self.top_k_reqs) == 0 - @property - def no_generator(self) -> bool: - return len(self.generators) == 0 - @property def max_num_logprobs(self) -> int: - if self.num_logprobs: - return max(self.num_logprobs.values()) - else: - return 0 + return max(self.num_logprobs.values()) if self.num_logprobs else 0 @property def no_logprob(self) -> bool: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 8c5ca2ec35666..c8192b7f86eb0 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -6,10 +6,7 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) @@ -30,48 +27,35 @@ class Worker: def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - speculative_config: Optional[SpeculativeConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - 
observability_config: Optional[ObservabilityConfig] = None, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + + # TODO: use WorkerBase.__init__(self, vllm_config=vllm_config) + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.speculative_config = speculative_config - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.model_runner = GPUModelRunner( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=lora_config, - ) + self.model_runner = GPUModelRunner(vllm_config) def initialize(self): if self.device_config.device.type == "cuda": diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py new file mode 100644 index 0000000000000..7053075bf4d8f --- /dev/null +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -0,0 +1,123 @@ +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import torch + +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MultiModalKwargs +from vllm.pooling_params import PoolingParams +from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, + SequenceGroupMetadata) +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, + ModelInputForCPUBuilder) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): + """ + Used by the CPUEmbeddingModelRunner. + """ + pooling_metadata: Optional["PoolingMetadata"] = None + + +class CPUEmbeddingModelRunner( + CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( + ModelInputForCPUWithPoolingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForCPUWithPoolingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "CPU worker does not support multi-step execution.") + + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. 
+ # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + + model_executable = self.model + execute_model_kwargs = { + "input_ids": + model_input.input_tokens, + "positions": + model_input.input_positions, + "kv_caches": + kv_caches, + "attn_metadata": + model_input.attn_metadata, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, + device=self.device), + "intermediate_tensors": + intermediate_tensors, + } + + hidden_states = model_executable(**execute_model_kwargs) + + return [ + self.model.pooler(hidden_states=hidden_states, + pooling_metadata=model_input.pooling_metadata) + ] + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, + Any]) -> ModelInputForCPUWithPoolingMetadata: + return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForCPUWithPoolingMetadata: + assert seq_group_metadata_list is not None + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + # Prepare PoolingMetadata. + assert model_input.seq_lens is not None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + model_input.seq_lens) + + return dataclasses.replace(model_input, + virtual_engine=virtual_engine, + pooling_metadata=pooling_metadata) + + def _prepare_pooling( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + ) -> PoolingMetadata: + """Prepare PoolingMetadata for the sequence group metadata list.""" + seq_groups: List[Tuple[List[int], PoolingParams]] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + pooling_params = seq_group_metadata.pooling_params + seq_groups.append((seq_ids, pooling_params)) + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + pooling_metadata = PoolingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + ) + + return pooling_metadata diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 8ebbf6db939bc..d040831870bd8 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -4,11 +4,12 @@ import torch from vllm.attention import AttentionMetadata +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import (CPUModelRunner, +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPUBuilder, ModelInputForCPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -50,7 +51,8 @@ def from_broadcasted_tensor_dict( super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) -class CPUEncoderDecoderModelRunner(CPUModelRunner): +class CPUEncoderDecoderModelRunner( + 
CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( EncoderDecoderModelInputForCPU) _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder @@ -87,21 +89,29 @@ def prepare_model_input( virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> EncoderDecoderModelInputForCPU: - model_input = super().prepare_model_input(seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - model_input = cast(EncoderDecoderModelInputForCPU, model_input) + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) ( attn_metadata, encoder_input_tokens_tensor, encoder_input_positions_tensor, ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + pin_memory=False, + generators=generators) return dataclasses.replace( model_input, + sampling_metadata=sampling_metadata, attn_metadata=attn_metadata, encoder_input_tokens=encoder_input_tokens_tensor, encoder_input_positions=encoder_input_positions_tensor, + virtual_engine=virtual_engine, ) def _prepare_encoder_model_input_tensors( @@ -287,7 +297,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 5032896600b3b..d3e1202c15e61 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -1,25 +1,24 @@ import dataclasses import weakref +from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import torch from torch import nn from vllm.attention import AttentionMetadata, get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.transformers_utils.config import uses_mrope from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, @@ -33,6 +32,7 @@ logger = init_logger(__name__) +TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU") _PAD_SLOT_ID = -1 @@ -62,10 +62,10 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type["ModelInputForCPU"], + cls: Type[TModelInputForCPU], tensor_dict: Dict[str, Any], attn_backend: 
Optional["AttentionBackend"] = None - ) -> "ModelInputForCPU": + ) -> TModelInputForCPU: if attn_backend is not None: tensor_dict = _init_attn_metadata_from_tensor_dict( attn_backend, tensor_dict) @@ -148,14 +148,33 @@ def build(self) -> ModelInputForCPU: query_lens=seq_lens, ) - def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data, - computed_len: int, - mm_processor_kwargs: Dict[str, Any]): - mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs) + def _compute_multi_modal_input( + self, + seq_data: SequenceData, + computed_len: int, + seq_group_metadata: SequenceGroupMetadata, + ): + # NOTE: mm_data only includes the subset of multi-modal items that + # intersect with the current prefill positions. + mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + seq_group_metadata, + range(computed_len, len(seq_data.get_token_ids())), + ) + + if not mm_data: + return None, None, None + + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) # special processing for mrope position deltas. mrope_positions = None - if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -179,7 +198,7 @@ def _compute_multi_modal_input(self, seq_data: SequenceData, mm_data, context_len=computed_len, ) seq_data.mrope_position_delta = mrope_position_delta - return mm_kwargs, mrope_positions + return mm_kwargs, placeholder_maps, mrope_positions def _prepare_prompt( self, @@ -193,7 +212,10 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -210,11 +232,18 @@ def _prepare_prompt( input_tokens.extend(prompt_tokens) # Token ids mrope_positions = None - if (mm_data := seq_group_metadata.multi_modal_data): - mm_kwargs, mrope_positions = self._compute_multi_modal_input( - seq_data, mm_data, computed_len, - seq_group_metadata.mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + if seq_group_metadata.multi_modal_data: + ( + mm_kwargs, + placeholder_maps, + mrope_positions, + ) = self._compute_multi_modal_input(seq_data, computed_len, + seq_group_metadata) + + multi_modal_kwargs_list.append(mm_kwargs) + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map) # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt @@ -241,11 +270,14 @@ def _prepare_prompt( slot_mapping.append(_PAD_SLOT_ID) continue - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) + # For encoder-only models, the block_table is None, + # and there is no need to initialize the slot_mapping. 
+ if block_table is not None: + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) if any(input_mrope_positions): input_positions = None # type: ignore @@ -264,6 +296,11 @@ def _prepare_prompt( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } attn_metadata = self.attn_backend.make_metadata( is_prompt=True, @@ -275,9 +312,10 @@ def _prepare_prompt( num_decode_tokens=0, block_tables=torch.tensor([]), slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -366,6 +404,7 @@ def _prepare_decode( attn_metadata = self.attn_backend.make_metadata( is_prompt=False, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_decode_seq_len=max_decode_seq_len, @@ -381,36 +420,27 @@ def _prepare_decode( ) -class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( - ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder +class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): + """ + Helper class for shared methods between CPU model runners. + """ + _model_input_cls: Type[TModelInputForCPU] + _builder_cls: Type[ModelInputForCPUBuilder] def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, *args, **kwargs, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config + ModelRunnerBase.__init__(self, vllm_config) # Currently, CPU worker doesn't support chunked prefill. assert self.scheduler_config.chunked_prefill_enabled is False - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.prompt_adapter_config = prompt_adapter_config - self.load_config = load_config + model_config = self.model_config + cache_config = self.cache_config + self.is_driver_worker = is_driver_worker self.device = self.device_config.device @@ -435,35 +465,14 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. 
- mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - def load_model(self) -> None: - self.model = get_model(model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) - - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForCPUWithSamplingMetadata: - return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 - tensor_dict, - attn_backend=self.attn_backend, - ) + self.model = get_model(vllm_config=self.vllm_config) def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithSamplingMetadata: + ) -> TModelInputForCPU: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. @@ -475,6 +484,21 @@ def _prepare_model_input_tensors( return builder.build() # type: ignore + +class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( + ModelInputForCPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForCPUWithSamplingMetadata: + return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 + tensor_dict, + attn_backend=self.attn_backend, + ) + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -522,7 +546,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index ab93471b5af74..bc9164bd9d5df 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -6,19 +6,20 @@ import vllm.envs as envs from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, PromptAdapterConfig, - SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, VllmConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner -from vllm.worker.cpu_model_runner import CPUModelRunner +from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -121,31 +122,19 @@ class CPUWorker(LoraNotSupportedWorkerBase, 
LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, kv_cache_dtype: Optional[str] = "auto", - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + WorkerBase.__init__(self, vllm_config=vllm_config) + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.prompt_adapter_config = prompt_adapter_config + self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -162,24 +151,20 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] - ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner - if self._is_encoder_decoder_model(): + ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner + if self.model_config.task == "embedding": + ModelRunnerClass = CPUEmbeddingModelRunner + elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner - self.model_runner: CPUModelRunner = ModelRunnerClass( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, + self.model_runner: CPUModelRunnerBase = ModelRunnerClass( + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - self.cpu_cache: List[List[torch.Tensor]] + # Initialize cpu_cache as embedding models don't initialize kv_caches + self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace @@ -207,9 +192,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index a7f5b2d4fdd1f..37cfcbf13d7a3 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -3,14 +3,12 @@ import torch -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import get_pp_group from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) @@ -36,29 +34,13 @@ class EmbeddingModelRunner( def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, ): - super().__init__(model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=lora_config, + super().__init__(vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config) + is_driver_worker=is_driver_worker) @torch.inference_mode() def execute_model( @@ -122,7 +104,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device)) if (self.observability_config is not None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 6a00444f5098b..82824faa6629a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -9,17 +9,14 @@ AttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, - get_global_forced_attn_backend, - global_force_attn_backend) -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) + get_global_forced_attn_backend) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import 
(MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, @@ -79,17 +76,9 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): @@ -101,17 +90,10 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. ''' - self._maybe_force_supported_attention_backend() super().__init__( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - lora_config=None, + vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, ) @@ -134,23 +116,17 @@ def raise_backend_err(): maybe_global_forced_backend = get_global_forced_attn_backend() is_forced_by_global = maybe_global_forced_backend is not None is_forced_by_env_var = maybe_env_var_forced_backend is not None - - if not (is_forced_by_global or is_forced_by_env_var): - # The user has not already specified an attention backend - # override - logger.info("EncoderDecoderModelRunner requires " - "XFormers backend; overriding backend " - "auto-selection and forcing XFormers.") - global_force_attn_backend(_Backend.XFORMERS) - elif is_forced_by_global: + if is_forced_by_global: # noqa: SIM102 # Backend override enforced by global variable takes # precedence over vLLM backend environment variable. 
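# A minimal sketch of the recurring refactor in this patch: the runner and
# worker constructors above drop their long per-config signatures (ModelConfig,
# ParallelConfig, SchedulerConfig, ...) in favour of one consolidated config
# object. ConsolidatedConfig and TinyRunner are hypothetical stand-ins for
# illustration only, not vLLM's actual VllmConfig or runner classes.
from dataclasses import dataclass


@dataclass
class ConsolidatedConfig:
    model: str
    block_size: int = 16
    enforce_eager: bool = False


class TinyRunner:

    def __init__(self, config: ConsolidatedConfig,
                 is_driver_worker: bool = False) -> None:
        # One attribute replaces half a dozen individual config arguments, so
        # adding a new config field no longer touches every constructor.
        self.config = config
        self.is_driver_worker = is_driver_worker


runner = TinyRunner(ConsolidatedConfig(model="dummy-model"))
assert runner.config.block_size == 16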
- if maybe_global_forced_backend != _Backend.XFORMERS: + if maybe_global_forced_backend not in\ + [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() - elif is_forced_by_env_var: + elif is_forced_by_env_var: # noqa: SIM102 # Backend override enforced by vLLM backend # environment variable - if maybe_env_var_forced_backend != _Backend.XFORMERS: + if maybe_env_var_forced_backend not in\ + [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() def _list_to_int32_tensor( @@ -208,7 +184,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) @@ -306,13 +282,12 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - decoder_seq_data, decoder_dummy_multi_modal_data \ - = self.input_registry.dummy_data_for_profiling( - self.model_config, + decoder_dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry, is_encoder_data=False) - encoder_seq_data, encoder_dummy_multi_modal_data \ + encoder_dummy_data \ = self.input_registry.dummy_data_for_profiling( self.model_config, seq_len, @@ -320,26 +295,31 @@ def profile_run(self) -> None: is_encoder_data=True) # Having more tokens is over-conservative but otherwise fine - assert len(decoder_seq_data.prompt_token_ids) >= seq_len, ( + assert len( + decoder_dummy_data.seq_data.prompt_token_ids + ) >= seq_len, ( f"Expected at least {seq_len} dummy tokens for profiling, " - f"but got: {len(decoder_seq_data.prompt_token_ids)}") + f"but got: {len(decoder_dummy_data.seq_data.prompt_token_ids)}" + ) - assert decoder_dummy_multi_modal_data is None or \ - encoder_dummy_multi_modal_data is None, ( + assert decoder_dummy_data.multi_modal_data is None or \ + encoder_dummy_data.multi_modal_data is None, ( "Multi-modal data can't be provided in both encoder and decoder" ) seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: decoder_seq_data}, + seq_data={group_id: decoder_dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, - encoder_seq_data=encoder_seq_data, + encoder_seq_data=encoder_dummy_data.seq_data, cross_block_table=None, - multi_modal_data=decoder_dummy_multi_modal_data - or encoder_dummy_multi_modal_data, - ) + multi_modal_data=decoder_dummy_data.multi_modal_data + or encoder_dummy_data.multi_modal_data, + multi_modal_placeholders=decoder_dummy_data. + multi_modal_placeholders + or encoder_dummy_data.multi_modal_placeholders) seqs.append(seq) # Run the model with the dummy inputs. 
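# A standalone sketch of the relaxed backend check in the hunk above:
# encoder/decoder runners now accept either XFormers or FlashAttention instead
# of forcing XFormers. The enum below is a local stand-in for illustration,
# not vLLM's internal _Backend type.
import enum
from typing import Optional


class Backend(enum.Enum):
    XFORMERS = enum.auto()
    FLASH_ATTN = enum.auto()
    TORCH_SDPA = enum.auto()


_SUPPORTED_ENC_DEC_BACKENDS = {Backend.XFORMERS, Backend.FLASH_ATTN}


def check_forced_backend(forced: Optional[Backend]) -> None:
    """Raise if a user-forced backend cannot serve encoder/decoder models."""
    if forced is not None and forced not in _SUPPORTED_ENC_DEC_BACKENDS:
        raise NotImplementedError(
            f"{forced.name} is not supported for encoder/decoder models")


check_forced_backend(Backend.FLASH_ATTN)    # accepted after this change
check_forced_backend(None)                  # no override: nothing to validate
# check_forced_backend(Backend.TORCH_SDPA)  # would raise NotImplementedError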
@@ -528,6 +508,7 @@ def _prepare_encoder_model_input_tensors( attn_metadata.encoder_seq_lens, attn_metadata.encoder_seq_lens_tensor, attn_metadata.max_encoder_seq_len, + attn_metadata.encoder_seq_start_loc, attn_metadata.cross_slot_mapping, attn_metadata.cross_block_tables, ) = ( @@ -535,6 +516,7 @@ def _prepare_encoder_model_input_tensors( encoder_seq_lens, encoder_seq_lens_tensor, max_encoder_seq_len, + encoder_seq_start_loc, cross_slot_mapping_tensor, cross_block_tables, ) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py new file mode 100644 index 0000000000000..1ff30d685c6b1 --- /dev/null +++ b/vllm/worker/hpu_model_runner.py @@ -0,0 +1,2007 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import collections +import contextlib +import dataclasses +import functools +import gc +import itertools +import math +import operator +import os +import time +from array import array +from dataclasses import dataclass, field +from enum import IntEnum +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Set, Tuple, Type, TypeVar, Union) + +import habana_frameworks.torch as htorch +import habana_frameworks.torch.internal.bridge_config as bc +import torch +from vllm_hpu_extension.ops import LoraMask as LoraMask +from vllm_hpu_extension.profiler import (HabanaHighLevelProfiler, + HabanaMemoryProfiler, format_bytes) + +from vllm.attention import AttentionMetadata, get_attn_backend +from vllm.config import DeviceConfig, VllmConfig +from vllm.distributed.parallel_state import get_world_group +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +from vllm.model_executor import SamplingMetadata +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.model_loader import get_model +from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, + MultiModalKwargs) +from vllm.sampling_params import SamplingParams +from vllm.sequence import (IntermediateTensors, SequenceData, + SequenceGroupMetadata) +from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.worker.model_runner_base import ( + ModelRunnerBase, ModelRunnerInputBase, + _add_attn_metadata_broadcastable_dict, + _add_sampling_metadata_broadcastable_dict, + _init_attn_metadata_from_tensor_dict, + _init_sampling_metadata_from_tensor_dict) + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionBackend + +logger = init_logger(__name__) + +_TYPE_CACHE = {} +# These values are assumed to be zero in several places. +# Use caution when updating them! 
+_PAD_SLOT_ID = 0 +_PAD_BLOCK_ID = 0 + +LORA_WARMUP_RANK = 8 + + +class Singleton(type): + _instances: Dict[type, object] = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super().__call__(*args, **kwargs) + return cls._instances[cls] + + +@dataclass +class HPUBucketingGlobalState(metaclass=Singleton): + prompt_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_bs_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_seq_bucket_cfg: Tuple[int, int, int] = field(init=False) + decode_block_bucket_cfg: Tuple[int, int, int] = field(init=False) + prompt_buckets: List[Tuple[int, int]] = field(init=False) + decode_buckets: List[Tuple[int, int]] = field(init=False) + + +def subtuple(obj: object, + typename: str, + to_copy: List[str], + to_override: Optional[Dict[str, object]] = None): + if obj is None: + return None + if to_override is None: + to_override = {} + fields = set(to_copy) | set(to_override.keys()) + values = {f: to_override.get(f, getattr(obj, f)) for f in fields} + if typename not in _TYPE_CACHE: + _TYPE_CACHE[typename] = collections.namedtuple(typename, + ' '.join(fields)) + return _TYPE_CACHE[typename](**values) + + +def read_bucket_settings(phase: str, dim: str, **defaults): + """Read bucketing configuration from env variables. + + phase is either 'prompt' or 'decode' + dim is either 'bs', 'seq' or 'block' + param is either 'min', 'step' or 'max' + example env variable: VLLM_DECODE_BS_BUCKET_STEP=128 + """ + params = ['min', 'step', 'max'] + env_vars = [f'VLLM_{phase}_{dim}_BUCKET_{p}'.upper() for p in params] + default_values = [defaults[p] for p in params] + values = [ + int(os.environ.get(e, d)) for e, d in zip(env_vars, default_values) + ] + for e, v, d in zip(env_vars, values, default_values): + logger.info('%s=%s (default:%s)', e, v, d) + return values + + +def warmup_range(config: Tuple[int, int, int]): + """Generate a warmup range. + + Start from bmin and multiply by 2 until you reach bstep. + Then, increase the values in the range by the value of bstep until you + reach bmax. + + Example: + bmin = 2, bstep = 32, bmax = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => return ramp_up + stable => (2, 4, 8, 16, 32, 64) + """ + bmin, bstep, bmax = config + assert bmin <= bmax, ("Min. batch size cannot be greater than max. " + "batch size. 
If you want to skip warmup, " + "set VLLM_SKIP_WARMUP=true") + base = itertools.repeat(2) + ramp_up_acc = itertools.accumulate(base, func=operator.mul, initial=bmin) + ramp_up_tw = itertools.takewhile(lambda x: x < bstep and x <= bmax, \ + ramp_up_acc) + stable = range(bstep, bmax + 1, bstep) + buckets = list(ramp_up_tw) + list(stable) + return list(filter(lambda bucket: bucket >= bmin, buckets)) + + +def generate_prompt_buckets(bs_bucket_config, + seq_bucket_config, + max_num_batched_tokens=None): + buckets = list( + itertools.product(warmup_range(bs_bucket_config), + warmup_range(seq_bucket_config))) + if len(buckets) == 0: + msg = ("No buckets could be captured with following config " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config}") + raise ValueError(msg) + + filtered_buckets = buckets + if max_num_batched_tokens is not None: + # Remove buckets exceeding batch token budget + filtered_buckets = list( + filter( + lambda bucket: bucket[0] * bucket[1] <= max_num_batched_tokens, + buckets)) + + if len(filtered_buckets) == 0: + # we can handle this if we ignore max_num_batched_tokens + min_bucket_bs, min_bucket_seq = min(buckets, + key=lambda b: (b[0] * b[1])) + min_reqd_budget = min_bucket_bs * min_bucket_seq + msg = ( + "The current bucketing configuration " + f"(min, step, max_warmup): " + f"bs:{bs_bucket_config}, " + f"seq:{seq_bucket_config} cannot be used with specified " + f"max_num_batched_tokens ({max_num_batched_tokens}), as the " + f"smallest bucket ({min_reqd_budget}) would exceed token " + "budget. Please increase max_num_batched_tokens or decrease " + "bucket minimum Ignoring max_num_batched_tokens at risk of " + "out-of-memory errors.") + logger.error(msg) + return list( + sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))), [] + + captured_buckets = list( + sorted(filtered_buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + omitted_buckets = list( + sorted([x for x in buckets if x not in filtered_buckets])) + return captured_buckets, omitted_buckets + + +def generate_decode_buckets(bs_bucket_config, blocks_bucket_config, + max_blocks): + buckets = [] + bs_buckets = warmup_range(bs_bucket_config) + block_buckets = warmup_range(blocks_bucket_config) + bmin, bstep, bmax = blocks_bucket_config + last_bucket = round_up(max_blocks, bstep) + for bs in bs_buckets: + for blocks in block_buckets: + if blocks < bs: + continue + if blocks > last_bucket: + break + buckets.append((bs, blocks)) + return list(sorted(buckets, key=lambda b: (b[0] * b[1], b[1], b[0]))) + + +def next_pow2(value: int, base: int): + res = base + while value > 1: + value = (value + 1) // 2 + res *= 2 + return res + + +def round_up(value: int, k: int): + return (value + k - 1) // k * k + + +def find_bucket(value: int, config: Tuple[int, int, int]): + bmin, bstep, _ = config + next_step = round_up(value, bstep) + next_pow = next_pow2(value, bmin) + return max(bmin, min(next_step, next_pow)) + + +def align_workers(value, op): + group = get_world_group().cpu_group + world_size = torch.distributed.get_world_size() + if world_size <= 1: + return value + value_t = torch.tensor(value, device='cpu') + torch.distributed.all_reduce(value_t, op=op, group=group) + return value_t.item() + + +def setup_profiler(): + schedule = torch.profiler.schedule(wait=0, warmup=2, active=1, repeat=1) + DEVICE = 'hpu' + activities = [torch.profiler.ProfilerActivity.CPU] + activities.extend([torch.profiler.ProfilerActivity.HPU] if DEVICE == + 'hpu' else []) + #from 
habana_frameworks.torch.activity_profiler import DebugActivity + #debug_activities=[DebugActivity.BRIDGE_FUNCTION_CALLS] + + profiler = torch.profiler.profile( + schedule=schedule, + activities=activities, + #debug_activities=debug_activities, + on_trace_ready=torch.profiler.tensorboard_trace_handler('.', + use_gzip=True), + record_shapes=False, + with_stack=True) + return profiler + + +def pad_list(list, k, v): + target_len = round_up(len(list), k) + padding = target_len - len(list) + return list + [v] * padding + + +def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): + slot_mapping = slot_mapping.flatten() + indices = torch.div(slot_mapping, block_size, rounding_mode="floor") + if is_prompt: + indices = indices.unflatten(0, (-1, block_size))[:, 0] + offsets = None + else: + offsets = torch.fmod(slot_mapping, block_size) + return indices, offsets + + +class HpuModelAdapter: + + def __init__(self, model, block_size, dtype, enforce_eager): + self.model = model + self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', + '0').lower() in ['1', 'true'] + self.block_size = block_size + self.dtype = dtype + if not htorch.utils.internal.is_lazy() and not enforce_eager: + self.model = torch.compile(self.model, + backend='hpu_backend', + dynamic=False) + + def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device, + dtype): + prefill_metadata = attn_metadata + if prefill_metadata is None or self.prefill_use_fusedsdpa: + return attn_metadata + + seq_lens_t = prefill_metadata.seq_lens_tensor + len_mask = (torch.arange(0, seq_len, device=device, + dtype=torch.int32).view(1, seq_len).ge( + seq_lens_t.unsqueeze(-1)).view( + batch_size, 1, 1, seq_len)) + causal_mask = torch.triu(torch.ones((batch_size, 1, seq_len, seq_len), + device=device, + dtype=torch.bool), + diagonal=1) + mask = causal_mask.logical_or(len_mask) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + attn_metadata = prefill_metadata._replace(attn_bias=attn_bias) + return attn_metadata + + def _set_block_mapping(self, metadata, batch_size, device, dtype): + mask = torch.arange(0, + self.block_size, + device=device, + dtype=torch.int32).unsqueeze(0) + mask = mask >= metadata.block_usage.unsqueeze(-1) + attn_bias = (torch.zeros_like(mask, dtype=dtype).masked_fill_( + mask, -math.inf)) + block_mapping = torch.nn.functional.one_hot(metadata.block_mapping, + num_classes=batch_size) + block_mapping = block_mapping.to(dtype) + metadata = metadata._replace(block_mapping=block_mapping, + attn_bias=attn_bias) + return metadata + + def _update_metadata(self, attn_metadata, batch_size, seq_len, device, + dtype): + if attn_metadata.is_prompt: + meta = attn_metadata + attn_metadata = self._set_attn_bias(meta, batch_size, seq_len, + device, dtype) + else: + meta = attn_metadata + attn_metadata = self._set_block_mapping(meta, batch_size, device, + dtype) + return attn_metadata + + def forward(self, *args, **kwargs): + kwargs = kwargs.copy() + selected_token_indices = kwargs.pop('selected_token_indices') + if 'warmup_mode' in kwargs: + kwargs.pop('warmup_mode') + input_ids = kwargs['input_ids'] + kwargs['attn_metadata'] = self._update_metadata( + kwargs['attn_metadata'], input_ids.size(0), input_ids.size(1), + input_ids.device, self.dtype) + LoraMask.setLoraMask(kwargs.pop('lora_mask')) + hidden_states = self.model(*args, **kwargs) + hidden_states = hidden_states.view(-1, hidden_states.shape[-1]) + hidden_states = hidden_states.index_select(0, selected_token_indices) + return 
hidden_states + + def compute_logits(self, *args, **kwargs): + return self.model.compute_logits(*args, **kwargs) + + def sample(self, *args, **kwargs): + return self.model.sample(*args, **kwargs) + + +class PreparePromptMetadata(NamedTuple): + input_tokens: torch.Tensor + input_positions: List[List[int]] + attn_metadata: Optional[AttentionMetadata] + seq_lens: List[int] + query_lens: List[int] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] + lora_requests: Set[LoRARequest] + multi_modal_kwargs: Optional[Dict[str, BatchedTensorInputs]] + slot_mapping: List[List[int]] + lora_ids: List[int] + + @classmethod + def empty(cls): + return PreparePromptMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + seq_lens=[], + query_lens=[], + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + multi_modal_kwargs=None, + slot_mapping=[], + lora_ids=[]) + + +class PrepareDecodeMetadata(NamedTuple): + input_tokens: torch.Tensor + input_positions: List[List[int]] + attn_metadata: Optional[AttentionMetadata] + lora_index_mapping: List[List[int]] + lora_prompt_mapping: List[List[int]] + lora_requests: Set[LoRARequest] + slot_mapping: List[List[int]] + lora_ids: List[int] + + @classmethod + def empty(cls): + return PrepareDecodeMetadata(input_tokens=[], + input_positions=[], + attn_metadata=None, + lora_index_mapping=[], + lora_prompt_mapping=[], + lora_requests=set(), + slot_mapping=[], + lora_ids=[]) + + +# How batches are constructed. +class BatchType(IntEnum): + # Every batch is prefill. + PREFILL = 0 + # Every batch is decode. + DECODE = 1 + # Batch is a mixture of prefill and decode. + MIXED = 2 + + +TModelInputForHPU = TypeVar('TModelInputForHPU', bound="ModelInputForHPU") + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPU(ModelRunnerInputBase): + """ + This base class contains metadata needed for the base model forward pass + but not metadata for possible additional steps, e.g., sampling. Model + runners that run additional steps should subclass this method to add + additional fields. 
+ """ + input_tokens: Optional[torch.Tensor] = None + input_positions: Optional[torch.Tensor] = None + seq_lens: Optional[List[int]] = None + query_lens: Optional[List[int]] = None + lora_mapping: Optional["LoRAMapping"] = None + lora_requests: Optional[Set[LoRARequest]] = None + attn_metadata: Optional["AttentionMetadata"] = None + multi_modal_kwargs: Optional[Dict[str, torch.Tensor]] = None + real_batch_size: Optional[int] = None + batch_size_padded: Optional[int] = None + virtual_engine: int = 0 + lora_ids: Optional[List[int]] = None + async_callback: Optional[Callable] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + "real_batch_size": self.real_batch_size, + "batch_size_padded": self.batch_size_padded, + "virtual_engine": self.virtual_engine, + "lora_ids": self.lora_ids, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type[TModelInputForHPU], + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> TModelInputForHPU: + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForHPUWithSamplingMetadata(ModelInputForHPU): + """ + Used by the ModelRunner. + """ + sampling_metadata: Optional["SamplingMetadata"] = None + # Used for speculative decoding. We do not broadcast it because it is only + # used by the driver worker. + is_prompt: Optional[bool] = None + + def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: + tensor_dict = { + "input_tokens": self.input_tokens, + "input_positions": self.input_positions, + "lora_requests": self.lora_requests, + "lora_mapping": self.lora_mapping, + "multi_modal_kwargs": self.multi_modal_kwargs, + "lora_ids": self.lora_ids, + } + _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) + _add_sampling_metadata_broadcastable_dict(tensor_dict, + self.sampling_metadata) + return tensor_dict + + @classmethod + def from_broadcasted_tensor_dict( + cls, + tensor_dict: Dict[str, Any], + attn_backend: Optional["AttentionBackend"] = None, + ) -> "ModelInputForHPUWithSamplingMetadata": + tensor_dict = _init_sampling_metadata_from_tensor_dict(tensor_dict) + # FIXME(kzawora): this fails for whatever reason - why? + if attn_backend is not None: + tensor_dict = _init_attn_metadata_from_tensor_dict( + attn_backend, tensor_dict) + return cls(**tensor_dict) + + +class HPUModelRunnerBase(ModelRunnerBase[TModelInputForHPU]): + """ + Helper class for shared methods between GPU model runners. 
+ """ + _model_input_cls: Type[TModelInputForHPU] + + def __init__( + self, + vllm_config: VllmConfig, + is_driver_worker: bool = False, + return_hidden_states: bool = False, + ): + ModelRunnerBase.__init__(self, vllm_config=vllm_config) + self.is_driver_worker = is_driver_worker + self.return_hidden_states = return_hidden_states + + self.sliding_window = (self.model_config.get_sliding_window() + if self.model_config is not None else None) + self.device_config = (self.device_config if self.device_config + is not None else DeviceConfig()) + self.device = self.device_config.device + self.enforce_eager = self.model_config.enforce_eager + self.max_num_seqs = self.scheduler_config.max_num_seqs + # NOTE(kzawora): Change that to scheduler_config.max_num_prefill_seqs + # once padding-aware scheduling gets merged + self.max_num_prefill_seqs = 64 + self.max_model_len = self.scheduler_config.max_model_len + self.max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + self.block_size = self.cache_config.block_size + + self.pin_memory = is_pin_memory_available() + self.kv_cache_dtype = self.cache_config.cache_dtype + + self.attn_backend = get_attn_backend( + self.model_config.get_head_size(), + self.model_config.dtype, + self.kv_cache_dtype, + self.block_size, + self.model_config.is_attention_free, + ) + + # Lazy initialization + self.lora_manager: LRUCacheWorkerLoRAManager = None + self.model: torch.nn.Module = None + self.inc_initialized_successfully = False + + # Profiler stats + self.profiler = HabanaHighLevelProfiler() + self.profiler_counter_helper = HabanaProfilerCounterHelper() + self.seen_configs: set = set() + self._mem_margin: Optional[int] = None + self.bucketing_global_state = HPUBucketingGlobalState() + self._setup_buckets() + self._set_gc_threshold() + + def _set_gc_threshold(self) -> None: + # Read https://docs.python.org/3/library/gc.html#gc.set_threshold + # for comprehensive description of gc generations. + # We can either use VLLM_GC_THR_GEN[0-2] (this has higher priority) + # to set particular generation threshold or use simpler + # VLLM_GC_THR_MULTIPLIER to multiply default values. 
+ default_gc_thrs = list(gc.get_threshold()) + requested_gc_thrs = [0] * len(default_gc_thrs) + for i in range(len(default_gc_thrs)): + requested_gc_thrs[i] = int( + os.environ.get(f'VLLM_GC_THR_GEN{i}', default_gc_thrs[i])) + if requested_gc_thrs == default_gc_thrs: + gc_thr_multiplier = int(os.environ.get('VLLM_GC_THR_MULTIPLIER', + 2)) + requested_gc_thrs = [ + t * gc_thr_multiplier for t in default_gc_thrs + ] + gc.set_threshold(*requested_gc_thrs) + + # Multi-modal data support + self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + .create_input_mapper(self.model_config) + + self.skip_warmup = os.environ.get('VLLM_SKIP_WARMUP', + 'false').lower() == 'true' + + def load_model(self) -> None: + import habana_frameworks.torch.core as htcore + if self.model_config.quantization == 'inc' or \ + self.model_config.quantization == 'fp8': + htcore.hpu_set_env() + with HabanaMemoryProfiler() as m: + with HabanaMemoryProfiler() as m_getmodel: + self.model = get_model(vllm_config=self.vllm_config) + msg = ("Pre-loading model weights on " + f"{next(self.model.parameters()).device} " + f"took {m_getmodel.get_summary_string()}") + logger.info(msg) + + if self.lora_config: + assert hasattr(self.model, "supported_lora_modules" + ) and self.model.supported_lora_modules, ( + "Model does not support LoRA") + assert hasattr(self.model, "embedding_modules" + ), "Model does not have embedding_modules" + assert hasattr( + self.model, "embedding_padding_modules" + ), "Model does not have embedding_padding_modules" + self.lora_manager = LRUCacheWorkerLoRAManager( + self.scheduler_config.max_num_seqs, + self.scheduler_config.max_num_batched_tokens, + self.vocab_size, self.lora_config, self.device, + self.model.embedding_modules, + self.model.embedding_padding_modules) + self.model = self.lora_manager.create_lora_manager(self.model) + + if self.model_config.quantization == 'inc': + logger.info("Preparing model with INC..") + with HabanaMemoryProfiler() as m_inc: + from neural_compressor.torch.quantization import ( + FP8Config, convert, prepare) + config = FP8Config.from_json_file( + os.getenv("QUANT_CONFIG", "")) + if config.measure: + self.model = prepare(self.model, config) + elif config.quantize: + self.model = convert(self.model, config) + htcore.hpu_initialize(self.model, + mark_only_scales_as_const=True) + self.inc_initialized_successfully = True + logger.info("Preparing model with INC took %s", + m_inc.get_summary_string()) + else: + self.model = self.model.to("hpu") + htcore.mark_step() + torch.hpu.synchronize() + + with HabanaMemoryProfiler() as m_wrap: + self.model = _maybe_wrap_in_hpu_graph( + self.model, + self.block_size, + dtype=self.model_config.dtype, + enforce_eager=self.enforce_eager) + msg = f"Wrapping in HPU Graph took {m_wrap.get_summary_string()}" + logger.info(msg) + + self.model_memory_usage = m.consumed_device_memory + msg = f"Loading model weights took in total {m.get_summary_string()}" + logger.info(msg) + + def _use_graphs(self, batch_size, seq_len, is_prompt): + if self.enforce_eager: + return False + if self.skip_warmup: + return True + return (batch_size, seq_len, is_prompt) in self.graphed_buckets + + def _is_valid_bucket(self, bucket): + return bucket[0] * bucket[1] <= self.max_num_batched_tokens + + def _setup_buckets(self) -> None: + align_bs = lambda x: min(self.max_num_seqs, x) + #FIXME: The default values should be max_model_len + max_prompt_seq = 1024 + max_decode_seq = 2048 + self.bucketing_global_state.prompt_bs_bucket_cfg = read_bucket_settings( + 'prompt', + 'bs', + 
min=1, + step=align_bs(32), + max=self.max_num_prefill_seqs) + self.bucketing_global_state.decode_bs_bucket_cfg = read_bucket_settings( + 'decode', 'bs', min=1, step=align_bs(32), max=self.max_num_seqs) + self.bucketing_global_state.prompt_seq_bucket_cfg = \ + read_bucket_settings( + 'prompt', + 'seq', + min=self.block_size, + step=self.block_size, + max=max_prompt_seq) + self.bucketing_global_state.decode_block_bucket_cfg = \ + read_bucket_settings( + 'decode', + 'block', + min=self.block_size, + step=self.block_size, + max=max(self.block_size, + self.max_num_seqs * max_decode_seq // self.block_size)) + self.graphed_buckets: Set[Any] = set() + + msg = ("Prompt bucket config (min, step, max_warmup) " + f"bs:{self.bucketing_global_state.prompt_bs_bucket_cfg}, " + f"seq:{self.bucketing_global_state.prompt_seq_bucket_cfg}") + logger.info(msg) + + msg = ("Decode bucket config (min, step, max_warmup) " + f"bs:{self.bucketing_global_state.decode_bs_bucket_cfg}, " + f"block:{self.bucketing_global_state.decode_block_bucket_cfg}") + logger.info(msg) + + def _prepare_prompt( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> PreparePromptMetadata: + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() + + seq_lens: List[int] = [] + context_lens: List[int] = [] + query_lens: List[int] = [] + prefix_block_tables: List[List[int]] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + + if len(seq_group_metadata_list) == 0: + return PreparePromptMetadata.empty() + + for seq_group_metadata in seq_group_metadata_list: + assert seq_group_metadata.is_prompt + seq_ids = list(seq_group_metadata.seq_data.keys()) + assert len(seq_ids) == 1 + seq_id = seq_ids[0] + + computed_block_nums = seq_group_metadata.computed_block_nums + if (self.scheduler_config is not None + and self.scheduler_config.chunked_prefill_enabled + and not (computed_block_nums is None + or computed_block_nums == [])): + raise RuntimeError( + "chunked prefill cannot be used with prefix caching " + "now.") + + token_chunk_size = seq_group_metadata.token_chunk_size + seq_data = seq_group_metadata.seq_data[seq_id] + context_len = seq_data.get_num_computed_tokens() + # We should use get_len here because in case of preemption + # it contains output tokens. + seq_len = min(seq_data.get_len(), context_len + token_chunk_size) + prompt_tokens = seq_data.get_token_ids()[context_len:seq_len] + seq_lens.append(seq_len) + + # NOTE: This only works for oooooooxxx style attention. + if computed_block_nums is not None and len( + computed_block_nums) > 0 and self.sliding_window is None: + # Prefix is not supported with sliding_window + context_len = len(computed_block_nums) * self.block_size + prompt_tokens = prompt_tokens[context_len:] + prefix_block_tables.append(computed_block_nums) + elif self.scheduler_config.chunked_prefill_enabled: + if seq_group_metadata.block_tables is not None: + # Prefill has chunked before. + block_table = seq_group_metadata.block_tables[seq_id] + prefix_block_tables.append(block_table) + else: + # The first prefill. + prefix_block_tables.append([]) + else: + prefix_block_tables.append([]) + # Right now, prefill start is always 0. However, this + # assumption can be changed once chunked prefill is introduced. 
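# A standalone restatement of the bucketing helpers defined earlier in this
# file (warmup_range / find_bucket), assuming the same (min, step, max) tuple
# convention, to make the warmup example from the docstring concrete.
from typing import List, Tuple


def warmup_range_sketch(config: Tuple[int, int, int]) -> List[int]:
    bmin, bstep, bmax = config
    ramp_up, b = [], bmin
    while b < bstep and b <= bmax:   # powers of two until the step is reached
        ramp_up.append(b)
        b *= 2
    stable = list(range(bstep, bmax + 1, bstep))
    return [x for x in ramp_up + stable if x >= bmin]


def find_bucket_sketch(value: int, config: Tuple[int, int, int]) -> int:
    bmin, bstep, _ = config
    next_step = -(-value // bstep) * bstep   # round up to a multiple of bstep
    next_pow, v = bmin, value                # bmin * 2**ceil(log2(value))
    while v > 1:
        v = (v + 1) // 2
        next_pow *= 2
    return max(bmin, min(next_step, next_pow))


assert warmup_range_sketch((2, 32, 64)) == [2, 4, 8, 16, 32, 64]
assert find_bucket_sketch(40, (1, 32, 2048)) == 64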
+ assert context_len == 0 + + # actual prompt lens + context_lens.append(context_len) + query_lens.append(seq_len - context_len) + input_tokens.append(prompt_tokens) + # NOTE(woosuk): Here we assume that the first token in the prompt + # is always the first token in the sequence. + input_positions.append(list(range(context_len, seq_len))) + + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_kwargs_list.append(mm_kwargs) + + if seq_group_metadata.block_tables is None: + # During memory profiling, the block tables are not initialized + # yet. In this case, we just use a dummy slot mapping. + slot_mapping.append([_PAD_SLOT_ID] * seq_len) + continue + + # Compute the slot mapping. + slot_mapping.append([]) + block_table = seq_group_metadata.block_tables[seq_id] + + # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID, + # where start_idx is max(0, seq_len - sliding_window). + # For example, if the prompt len is 10, sliding window is 8, and + # block size is 4, the first two tokens are masked and the slot + # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1]. + start_idx = 0 + if self.sliding_window is not None: + assert context_len == 0, ( + "Prefix caching is currently not supported with " + "sliding window attention") + start_idx = max(0, seq_len - self.sliding_window) + for i in range(context_len, seq_len): + if i < start_idx: + slot_mapping[-1].append(_PAD_SLOT_ID) + continue + + block_number = block_table[i // self.block_size] + block_offset = i % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping[-1].append(slot) + + max_query_len = max(query_lens) + sum_query_len = sum(query_lens) + real_num_seqs = len(query_lens) + assert max_query_len > 0 + + max_prompt_len = max( + find_bucket(max(seq_lens), + self.bucketing_global_state.prompt_seq_bucket_cfg), + self.block_size) + + lora_ids: List[int] = [] + for seq_group_metadata, context_len in zip(seq_group_metadata_list, + context_lens): + lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + lora_index_mapping += [lora_id] * (max_prompt_len - context_len) + lora_prompt_mapping.extend( + [lora_id] * + (max_prompt_len - context_len + if seq_group_metadata.sampling_params.prompt_logprobs else 1)) + + input_tokens = make_tensor_with_pad(input_tokens, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + input_positions = make_tensor_with_pad(input_positions, + max_len=max_prompt_len, + pad=0, + dtype=torch.long, + device=self.device) + + slot_mapping = make_tensor_with_pad(slot_mapping, + max_len=max_prompt_len, + pad=_PAD_SLOT_ID, + dtype=torch.long, + device=self.device) + + seq_lens_tensor = torch.tensor(seq_lens, + dtype=torch.long, + device=self.device) + + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, True) + attn_metadata = self.attn_backend.make_metadata( + is_prompt=True, + block_list=None, + block_mapping=None, + block_usage=None, + block_indices=block_indices, + block_offsets=block_offsets, + block_scales=None, + attn_bias=None, + seq_lens_tensor=seq_lens_tensor, + num_prefills=real_num_seqs, + num_prefill_tokens=sum_query_len, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps= + None # FIXME(kzawora): mutli-modality will not work here + ) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + 
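# A minimal, standalone sketch of the prompt slot mapping computed above: token
# i lands in slot block_table[i // block_size] * block_size + i % block_size,
# and tokens that fall outside the sliding window are masked to the pad slot.
from typing import List, Optional

PAD_SLOT_ID = 0  # same "assumed zero" convention as _PAD_SLOT_ID above


def prompt_slot_mapping(block_table: List[int],
                        seq_len: int,
                        block_size: int,
                        context_len: int = 0,
                        sliding_window: Optional[int] = None) -> List[int]:
    start_idx = max(0, seq_len - sliding_window) if sliding_window else 0
    slots = []
    for i in range(context_len, seq_len):
        if i < start_idx:
            slots.append(PAD_SLOT_ID)        # outside the sliding window
            continue
        block_number = block_table[i // block_size]
        block_offset = i % block_size
        slots.append(block_number * block_size + block_offset)
    return slots


# Two physical blocks (7 and 2) of size 4 holding a 6-token prompt:
assert prompt_slot_mapping([7, 2], seq_len=6, block_size=4) == \
    [28, 29, 30, 31, 8, 9]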
+ return PreparePromptMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + slot_mapping=slot_mapping, + lora_ids=lora_ids) + + def _prepare_decode( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> PrepareDecodeMetadata: + input_tokens: List[List[int]] = [] + input_positions: List[List[int]] = [] + slot_mapping: List[List[int]] = [] + seq_lens: List[int] = [] + block_tables: List[List[int]] = [] + lora_index_mapping: List[List[int]] = [] + lora_prompt_mapping: List[List[int]] = [] + lora_requests: Set[LoRARequest] = set() + + if len(seq_group_metadata_list) == 0: + return PrepareDecodeMetadata.empty() + lora_ids: List[int] = [] + + dummy_slots = itertools.cycle( + range(_PAD_SLOT_ID, _PAD_SLOT_ID + self.block_size)) + + for seq_group_metadata in seq_group_metadata_list: + assert not seq_group_metadata.is_prompt + assert seq_group_metadata.token_chunk_size == 1 + + seq_ids = list(seq_group_metadata.seq_data.keys()) + lora_id = seq_group_metadata.lora_int_id + lora_ids.append(lora_id) + + if lora_id > 0: + lora_requests.add(seq_group_metadata.lora_request) + + for seq_id in seq_ids: + seq_data = seq_group_metadata.seq_data[seq_id] + generation_token = seq_data.get_last_token_id() + input_tokens.append([generation_token]) + + seq_len = seq_data.get_len() + position = seq_len - 1 + input_positions.append([position]) + + seq_len = seq_len if self.sliding_window is None else min( + seq_len, self.sliding_window) + seq_lens.append(seq_len) + + block_table = seq_group_metadata.block_tables[seq_id] + if len(block_table) == 0: + block_number = _PAD_BLOCK_ID + else: + block_number = block_table[position // self.block_size] + if block_number == _PAD_BLOCK_ID: + slot = next(dummy_slots) + else: + block_offset = position % self.block_size + slot = block_number * self.block_size + block_offset + slot_mapping.append([slot]) + lora_index_mapping.append(lora_id) + lora_prompt_mapping.append(lora_id) + + if self.sliding_window is not None: + sliding_window_blocks = (self.sliding_window // + self.block_size) + block_table = block_table[-sliding_window_blocks:] + block_tables.append(block_table) + + input_tokens = torch.tensor(input_tokens, + dtype=torch.long, + device=self.device) + input_positions = torch.tensor(input_positions, + dtype=torch.long, + device=self.device) + + num_decode_tokens = sum(seq_lens) + + blocks_used = [len(bt) for bt in block_tables if bt] + block_list = [] + block_scales = [] + for i, bt in enumerate(block_tables): + block_list.extend(bt) + blocks_in_group = len(bt) + if blocks_in_group > 0: + scale = 1.0 / blocks_in_group + block_scales.extend([scale] * blocks_in_group) + + block_mapping_nested: List[List[int]] = [ + [i] * b_u for i, b_u in enumerate(blocks_used) + ] + block_mapping: List[int] = list( + itertools.chain.from_iterable(block_mapping_nested)) + + last_block = [ + sl % self.block_size + 1 for sl in itertools.chain(*slot_mapping) + ] + block_usage = [[self.block_size] * (b_u - 1) + [lb] + for b_u, lb in zip(blocks_used, last_block)] + block_usage = list(itertools.chain(*block_usage)) + + block_bucket_size = find_bucket( + len(block_list), + self.bucketing_global_state.decode_block_bucket_cfg) + block_list = pad_list(block_list, block_bucket_size, _PAD_BLOCK_ID) + block_mapping = pad_list(block_mapping, 
block_bucket_size, -1) + block_usage = pad_list(block_usage, block_bucket_size, 1) + block_scales = pad_list(block_scales, block_bucket_size, 0.0) + + block_list = torch.tensor(block_list, + dtype=torch.int, + device=self.device) + block_mapping = torch.tensor(block_mapping, + dtype=torch.long, + device=self.device) + block_usage = torch.tensor(block_usage, + dtype=self.model_config.dtype, + device=self.device) + + slot_mapping = torch.tensor(slot_mapping, + dtype=torch.long, + device=self.device) + + block_indices, block_offsets = precompute_indices_and_offsets( + self.block_size, slot_mapping, False) + block_scales = torch.tensor(block_scales, + dtype=self.model_config.dtype, + device=self.device) + + attn_metadata = self.attn_backend.make_metadata( + is_prompt=False, + block_list=block_list, + block_mapping=block_mapping, + block_usage=block_usage, + block_indices=block_indices, + block_offsets=block_offsets, + block_scales=block_scales, + attn_bias=None, + seq_lens_tensor=None, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=num_decode_tokens, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None) + return PrepareDecodeMetadata(input_tokens=input_tokens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_index_mapping=lora_index_mapping, + lora_prompt_mapping=lora_prompt_mapping, + lora_requests=lora_requests, + slot_mapping=slot_mapping, + lora_ids=lora_ids) + + def prepare_input_tensors( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + ) -> Tuple[TModelInputForHPU, SamplingMetadata]: + if len(seq_group_metadata_list) == 0: + return self._model_input_cls(), None + + input_tokens = None + input_positions = None + lora_mapping = None + lora_requests = None + multi_modal_kwargs = None + batch_type = None + seq_lens = None + query_lens = None + real_batch_size = None + batch_size_padded = None + + self.event_start = self.profiler.get_timestamp_us() + is_prompt = seq_group_metadata_list[0].is_prompt + base_event_name = 'prompt' if is_prompt else 'decode' + self.profiler.start('internal', base_event_name) + + real_batch_size = len(seq_group_metadata_list) + bucket_cfg = self.bucketing_global_state.prompt_bs_bucket_cfg \ + if is_prompt else self.bucketing_global_state.decode_bs_bucket_cfg + batch_size_padded = find_bucket(real_batch_size, bucket_cfg) + batch_size_padding = batch_size_padded - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + if batch_size_padding > 0: + dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( + 0, 0, is_prompt) + seq_group_metadata_list.extend(dummy_seq_group_metadata + for _ in range(batch_size_padding)) + + prefill_reqs = [] + decode_reqs = [] + for seq_group_meta in seq_group_metadata_list: + if seq_group_meta.is_prompt: + prefill_reqs.append(seq_group_meta) + else: + decode_reqs.append(seq_group_meta) + + # Prepare input tensors. 
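# A standalone sketch of how the decode path above flattens per-sequence block
# tables into block_list / block_mapping / block_usage before padding them to a
# bucket size. Names mirror the code above; the function is illustrative only.
from itertools import chain
from typing import List, Tuple


def flatten_block_tables(
        block_tables: List[List[int]], last_token_positions: List[int],
        block_size: int) -> Tuple[List[int], List[int], List[int]]:
    blocks_used = [len(bt) for bt in block_tables if bt]
    block_list = list(chain.from_iterable(block_tables))
    # block_mapping[i] records which sequence owns the i-th flattened block.
    block_mapping = list(
        chain.from_iterable(
            [seq_idx] * used for seq_idx, used in enumerate(blocks_used)))
    # Every block is fully used except the last block of each sequence, whose
    # usage comes from the last token's offset (slot % block_size + 1).
    block_usage = list(
        chain.from_iterable(
            [block_size] * (used - 1) + [pos % block_size + 1]
            for used, pos in zip(blocks_used, last_token_positions)))
    return block_list, block_mapping, block_usage


# Two sequences with block tables [3, 5] and [9], last token positions 5 and 2,
# block_size 4:
assert flatten_block_tables([[3, 5], [9]], [5, 2], 4) == \
    ([3, 5, 9], [0, 0, 1], [4, 2, 3])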
+ ( + input_tokens, + input_positions, + prefill_attn_metadata, + seq_lens, + query_lens, + lora_index_mapping, + lora_prompt_mapping, + lora_requests, + multi_modal_kwargs, + slot_mapping, + lora_ids, + ) = self._prepare_prompt(prefill_reqs) + ( + decode_input_tokens, + decode_input_positions, + decode_attn_metadata, + decode_lora_index_mapping, + decode_lora_prompt_mapping, + decode_lora_requests, + decode_slot_mapping, + decode_lora_ids, + ) = self._prepare_decode(decode_reqs) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + seq_lens, query_lens, + self.device, + self.pin_memory) + + if not self.scheduler_config.chunked_prefill_enabled: + assert (len(prefill_reqs) and len(decode_reqs)) == 0 + + num_prefills = len(seq_lens) + num_prefill_tokens = len(input_tokens) + num_decode_tokens = len(decode_input_tokens) + + # NOTE(kzawora): Here we diverge from GPU code - we don't + # support mixed batches, so we either use decode or prefill + # inputs, without coalescing. + assert (num_prefills == 0 and num_decode_tokens > 0) or ( + num_prefills > 0 + and num_decode_tokens == 0), "HPU does not support mixed batches!" + if num_decode_tokens > 0: + input_tokens = decode_input_tokens + input_positions = decode_input_positions + slot_mapping = decode_slot_mapping + lora_index_mapping = decode_lora_index_mapping + lora_prompt_mapping = decode_lora_prompt_mapping + lora_requests = decode_lora_requests + lora_ids = decode_lora_ids + + # FIXME: We need to adjust selected_token_indices to accommodate + # for padding + max_len = input_tokens.size(1) + paddings = [max_len - s for s in seq_lens] + paddings = [0] + paddings[:-1] + paddings = list(itertools.accumulate(paddings)) + paddings_prompt_logprobs = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + if seq_group_metadata.sampling_params.prompt_logprobs is not None \ + and seq_group_metadata.is_prompt: + paddings_prompt_logprobs += ([paddings[i]] * seq_lens[i]) + paddings = torch.tensor( + paddings_prompt_logprobs if paddings_prompt_logprobs else paddings, + dtype=sampling_metadata.selected_token_indices.dtype, + device=sampling_metadata.selected_token_indices.device) + sampling_metadata.selected_token_indices.add_(paddings) + + if self.lora_config: + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=(num_prefills > 0))) + else: + lora_mapping = None + + if (prefill_attn_metadata is not None + and decode_attn_metadata is not None): + batch_type = BatchType.MIXED + raise NotImplementedError("Mixed batch is not supported on HPU") + elif prefill_attn_metadata is not None: + batch_type = BatchType.PREFILL + else: + batch_type = BatchType.DECODE + + metadata_dict = { + "input_tokens": input_tokens, + "input_positions": input_positions, + "selected_token_indices": sampling_metadata.selected_token_indices, + "lora_requests": lora_requests, + "lora_mapping": lora_mapping, + "multi_modal_kwargs": multi_modal_kwargs, + "num_prefill_tokens": num_prefill_tokens, + "num_decode_tokens": num_decode_tokens, + "slot_mapping": slot_mapping, + "num_prefills": num_prefills, + "batch_type": batch_type, + "seq_lens": seq_lens, + "query_lens": query_lens + } + if prefill_attn_metadata is not None: + metadata_dict.update(prefill_attn_metadata.asdict_zerocopy()) + else: + assert decode_attn_metadata is not None + metadata_dict.update(decode_attn_metadata.asdict_zerocopy()) + + attn_metadata = prefill_attn_metadata if \ + prefill_attn_metadata is not None else 
decode_attn_metadata + + return self._model_input_cls(input_tokens=input_tokens, + seq_lens=seq_lens, + query_lens=query_lens, + input_positions=input_positions, + attn_metadata=attn_metadata, + lora_requests=lora_requests, + lora_mapping=lora_mapping, + multi_modal_kwargs=multi_modal_kwargs, + real_batch_size=real_batch_size, + batch_size_padded=batch_size_padded, + lora_ids=lora_ids), \ + sampling_metadata + + def _seq_len(self, attn_metadata): + if attn_metadata.num_prefills != 0: + return attn_metadata.slot_mapping.size(1) + else: + return attn_metadata.block_list.numel() + + def trim_attn_metadata(self, metadata: AttentionMetadata) -> object: + # NOTE(kzawora): To anyone working on this in the future: + # Trimming metadata is required when using HPUGraphs. + # Attention metadata is going to be hashed by PT bridge, and + # appropriate HPUGraphs will be matched based on all inputs' hash. + + # Before you put more keys in here, make sure you know their + # value type and make sure you know how it's going to be hashed. + # You can find that information in input_hash function + # in habana_frameworks/torch/hpu/graphs.py. You can also hash + # it manually with torch.hpu.graphs.input_hash(attention_metadata) + + # If you use primitive types here - they will get hashed based + # on their value. You *will* get lots of excessive graph captures + # (and an OOM eventually) if you decide to put something like + # seq_len int here. + # If you absolutely need a scalar, put it in a tensor. Tensors + # get hashed using their metadata, not their values: + # input_hash(torch.tensor(123)) == input_hash(torch.tensor(321)) + # input_hash(123) != input_hash(321) + # input_hash("abc") != input_hash("cba") + attention_metadata = subtuple(metadata, 'TrimmedAttentionMetadata', [ + 'attn_bias', 'seq_lens_tensor', 'block_list', 'block_mapping', + 'block_usage', 'slot_mapping', 'is_prompt', 'block_indices', + 'block_offsets', 'block_scales' + ]) + return attention_metadata + + def create_dummy_seq_group_metadata(self, + group_id, + seq_len, + is_prompt, + lora_request=None): + sampling_params = SamplingParams(temperature=0) + num_blocks = math.ceil(seq_len / self.block_size) + seq_len = max(seq_len, 1) + if is_prompt: + input_len = seq_len + output_len = 0 + block_tables = None + else: + input_len = seq_len - 1 + output_len = 1 + block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks} + prompt_token_ids = [0] * input_len + output_token_ids = [1] * output_len + prompt_token_ids_array = array('l', prompt_token_ids) # noqa: F821 + seq_data = SequenceData(prompt_token_ids_array) + seq_data.output_token_ids = output_token_ids + return SequenceGroupMetadata(request_id=str(group_id), + is_prompt=(output_len == 0), + seq_data={group_id: seq_data}, + sampling_params=sampling_params, + block_tables=block_tables, + lora_request=lora_request) + + def profile_run(self) -> None: + num_layers = self.model_config.get_num_layers(self.parallel_config) + kv_caches = [None] * num_layers + max_batch_size = self.bucketing_global_state.prompt_bs_bucket_cfg[-1] + max_seq_len = min( + self.bucketing_global_state.prompt_seq_bucket_cfg[-1], + self.max_num_batched_tokens // max_batch_size) + + self.warmup_scenario(max_batch_size, max_seq_len, True, kv_caches, + False, True) + return + + def warmup_scenario(self, + batch_size, + seq_len, + is_prompt, + kv_caches, + is_pt_profiler_run=False, + is_lora_profile_run=False) -> None: + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + scenario_name = ("warmup_" + f"{'prompt' if 
is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config and is_lora_profile_run: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_local_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + self.profiler.start('internal', scenario_name) + times = 3 if use_graphs or is_pt_profiler_run else 1 + if self.lora_config and not is_lora_profile_run: + lora_mapping = LoRAMapping( + **dict(index_mapping=[0] * batch_size * seq_len, + prompt_mapping=[0] * batch_size * seq_len, + is_prefill=is_prompt)) + self.set_active_loras(set(), lora_mapping) + if is_prompt: + seqs = [ + self.create_dummy_seq_group_metadata( + i, + seq_len, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i in range(batch_size) + ] + else: + # FIXME: seq_len is actually number of blocks + blocks = [seq_len // batch_size for _ in range(batch_size)] + blocks[0] += seq_len % batch_size + seqs = [ + self.create_dummy_seq_group_metadata( + i, + b * self.block_size - 1, + is_prompt, + lora_request=dummy_lora_requests_per_seq[i] + if dummy_lora_requests_per_seq else None) + for i, b in enumerate(blocks) + ] + torch.hpu.synchronize() + profiler = None + if is_pt_profiler_run and self.is_driver_worker: + profiler = setup_profiler() + profiler.start() + for _ in range(times): + inputs = self.prepare_model_input(seqs) + self.execute_model(inputs, kv_caches, warmup_mode=True) + torch.hpu.synchronize() + if profiler: + profiler.step() + if profiler: + profiler.stop() + self.profiler.end() + gc.collect() + + def remove_all_loras(self): + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.remove_all_adapters() + + def set_active_loras(self, lora_requests: Set[LoRARequest], + lora_mapping: LoRAMapping) -> None: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + self.lora_manager.set_active_adapters(lora_requests, lora_mapping) + + def add_lora(self, lora_request: LoRARequest) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.add_adapter(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.remove_adapter(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.pin_adapter(lora_id) + + def list_loras(self) -> Set[int]: + if not self.lora_manager: + raise RuntimeError("LoRA is not enabled.") + return self.lora_manager.list_adapters() + + def log_warmup(self, 
phase, i, max_i, batch_size, seq_len): + free_mem = format_bytes( + HabanaMemoryProfiler.current_free_device_memory()) + dim = "num_blocks" + if phase == "Prompt": + dim = "seq_len" + msg = (f"[Warmup][{phase}][{i+1}/{max_i}] " + f"batch_size:{batch_size} " + f"{dim}:{seq_len} " + f"free_mem:{free_mem}") + logger.info(msg) + + def warmup_all_buckets(self, buckets, is_prompt, kv_caches): + for i, (batch_size, seq_len) in enumerate(reversed(buckets)): + self.log_warmup('Prompt' if is_prompt else 'Decode', i, + len(buckets), batch_size, seq_len) + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + + def warmup_graphs(self, + strategy, + buckets, + is_prompt, + kv_caches, + available_mem, + starting_mem=0, + total_batch_seq=0.001): + total_mem = starting_mem + idx = 0 + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + num_candidates = len(buckets) + ordering : Union[Callable[[Any], Tuple[Any, Any]], \ + Callable[[Any], Tuple[Any, Any, Any]]] + if strategy == 'min_tokens': + ordering = lambda b: (b[0] * b[1], b[1], b[0]) + elif strategy == 'max_bs': + ordering = lambda b: (-b[0], b[1]) + else: + raise NotImplementedError( + f'Unsupported graph allocation strategy: {strategy}') + buckets = list(sorted(buckets, key=ordering)) + captured_all = True + for idx, (batch_size, seq_len) in enumerate(buckets): + # Graph memory usage is proportional to seq dimension in a batch + batch_seq = batch_size * seq_len if is_prompt else batch_size + mem_estimate = batch_seq / total_batch_seq * total_mem + if mem_estimate >= available_mem: + captured_all = False + continue + graphed_bucket = (batch_size, seq_len, is_prompt) + if graphed_bucket in self.graphed_buckets: + continue + self.graphed_buckets.add(graphed_bucket) + self.log_warmup(phase, idx, num_candidates, batch_size, seq_len) + with HabanaMemoryProfiler() as mem_prof: + self.warmup_scenario(batch_size, seq_len, is_prompt, kv_caches) + used_mem = align_workers(mem_prof.consumed_device_memory, + torch.distributed.ReduceOp.MAX) + available_mem -= used_mem + total_mem += used_mem + total_batch_seq += batch_seq + + return total_mem, total_batch_seq, captured_all + + def log_graph_warmup_summary(self, buckets, is_prompt, total_mem): + num_candidates = len(buckets) + phase = f'Graph/{"Prompt" if is_prompt else "Decode"}' + graphed = list(c[:2] for c in self.graphed_buckets + if c[2] == is_prompt) + if num_candidates == 0: + num_candidates = 1 + msg = (f'{phase} captured:{len(graphed)} ' + f'({100 * len(graphed) / num_candidates:.1f}%) ' + f'used_mem:{format_bytes(total_mem)} ' + f'buckets:{sorted(list(graphed))}') + logger.info(msg) + + @torch.inference_mode() + def warmup_model(self, kv_caches: List[torch.Tensor]) -> None: + if profile := os.environ.get('VLLM_PT_PROFILE', None): + phase, bs, seq_len, graph = profile.split('_') + is_prompt = phase == 'prompt' + graphs = graph == 't' + if graphs: + self.graphed_buckets.add((int(bs), int(seq_len), is_prompt)) + self.warmup_scenario(int(bs), int(seq_len), is_prompt, kv_caches, + True) + raise AssertionError("Finished profiling") + if self.skip_warmup: + logger.info("Skipping warmup...") + return + self.profiler.start('internal', 'warmup') + max_blocks = kv_caches[0][0].size(0) + + self.bucketing_global_state.prompt_buckets, prompt_omitted_buckets = \ + generate_prompt_buckets( + self.bucketing_global_state.prompt_bs_bucket_cfg, + self.bucketing_global_state.prompt_seq_bucket_cfg, + self.max_num_batched_tokens) + + msg = (f"Generated {len(self.bucketing_global_state.prompt_buckets)} " + 
f"prompt buckets [bs, seq]: \ + {list(sorted(self.bucketing_global_state.prompt_buckets))}") + logger.info(msg) + + msg = (f"Omitted {len(prompt_omitted_buckets)} " + "prompt buckets due to exceeded token budget " + f"(max_num_batched_tokens={self.max_num_batched_tokens})") + logger.info(msg) + + msg = f"Omitted prompt buckets: {list(sorted(prompt_omitted_buckets))}" + logger.debug(msg) + + self.bucketing_global_state.decode_buckets = generate_decode_buckets( + self.bucketing_global_state.decode_bs_bucket_cfg, + self.bucketing_global_state.decode_block_bucket_cfg, max_blocks) + logger.info("Generated %d decode buckets [bs, total_blocks]: %s", + len(self.bucketing_global_state.decode_buckets), + list(sorted(self.bucketing_global_state.decode_buckets))) + + if not htorch.utils.internal.is_lazy() and not self.enforce_eager: + cache_size_limit = len( + self.bucketing_global_state.prompt_buckets) + len( + self.bucketing_global_state.decode_buckets) + 1 + torch._dynamo.config.cache_size_limit = max( + cache_size_limit, torch._dynamo.config.cache_size_limit) + # Multiply by 8 to follow the original default ratio between + # the cache_size_limit and accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = max( + cache_size_limit * 8, + torch._dynamo.config.accumulated_cache_size_limit) + + start_mem = HabanaMemoryProfiler.current_device_memory_usage() + start_time = time.perf_counter() + + compile_only_mode_context = functools.partial(bc.env_setting, + "PT_COMPILE_ONLY_MODE", + True) + can_use_compile_only_mode = True + try: + with compile_only_mode_context(): + pass + logger.debug("Using PT_COMPILE_ONLY_MODE.") + except KeyError: + can_use_compile_only_mode = False + logger.warning('Cannot use PT_COMPILE_ONLY_MODE. ' + 'Warmup time will be negatively impacted. 
' + 'Please update Gaudi Software Suite.') + with compile_only_mode_context( + ) if can_use_compile_only_mode else contextlib.nullcontext(): + self.warmup_all_buckets(self.bucketing_global_state.prompt_buckets, + True, kv_caches) + self.warmup_all_buckets(self.bucketing_global_state.decode_buckets, + False, kv_caches) + + if not self.enforce_eager and htorch.utils.internal.is_lazy(): + assert self.mem_margin is not None, \ + ("HabanaWorker.determine_num_available_blocks needs " + "to be called before warming up the model.") + free_mem = HabanaMemoryProfiler.current_free_device_memory() + graph_free_mem = free_mem - self.mem_margin + graph_free_mem = align_workers(graph_free_mem, + torch.distributed.ReduceOp.MIN) + prompt_graph_mem_ratio = float( + os.environ.get('VLLM_GRAPH_PROMPT_RATIO', '0.3')) + prompt_available_memory = (prompt_graph_mem_ratio * + graph_free_mem) + decode_available_memory = (graph_free_mem - + prompt_available_memory) + msg = ( + f"Using {format_bytes(graph_free_mem)}" + f"/{format_bytes(free_mem)} " + "of free device memory for HPUGraphs, " + f"{format_bytes(prompt_available_memory)} for prompt and " + f"{format_bytes(decode_available_memory)} for decode " + f"(VLLM_GRAPH_PROMPT_RATIO={prompt_graph_mem_ratio})") + logger.info(msg) + prompt_strategy = os.environ.get('VLLM_GRAPH_PROMPT_STRATEGY', + 'min_tokens') + decode_strategy = os.environ.get('VLLM_GRAPH_DECODE_STRATEGY', + 'max_bs') + mem_post_prompt, prompt_batch_seq, prompt_captured_all = \ + self.warmup_graphs( + prompt_strategy, self.bucketing_global_state.prompt_buckets, + True, kv_caches, prompt_available_memory) + mem_post_decode, decode_batch_seq, decode_captured_all = \ + self.warmup_graphs( + decode_strategy, self.bucketing_global_state.decode_buckets, + False, kv_caches, decode_available_memory) + + # Not all prompt buckets were captured, but all decode buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more prompt buckets. + if (mem_post_decode + mem_post_prompt < graph_free_mem + and not prompt_captured_all and decode_captured_all): + mem_post_prompt, _, prompt_captured_all = ( + self.warmup_graphs( + prompt_strategy, + self.bucketing_global_state.prompt_buckets, True, + kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_prompt, prompt_batch_seq)) + + # Not all decode buckets were captured, but all prompt buckets + # were captured and we have some free graph-allocated space + # left. Let's try to use it for capturing more decode buckets. 
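For readers skimming the capture loop above, here is a small, self-contained sketch of the budgeting it performs: the free graph memory is split between prompt and decode capture by a ratio, and each candidate bucket is admitted only if its estimated cost fits the remaining budget. Every number, the `would_capture` helper, and the 0.5 GiB sample measurement below are assumptions for illustration; the patch derives the real values from `HabanaMemoryProfiler`, `VLLM_GRAPH_PROMPT_RATIO`, and the capture-strategy environment variables.

```python
# Illustrative sketch of the HPUGraph memory budgeting above (assumed numbers).
GiB = 1024 ** 3

graph_free_mem = 10 * GiB        # free memory left for graphs (assumed)
prompt_ratio = 0.3               # VLLM_GRAPH_PROMPT_RATIO used above

prompt_budget = prompt_ratio * graph_free_mem      # 3 GiB for prompt graphs
decode_budget = graph_free_mem - prompt_budget     # 7 GiB for decode graphs
print(f"prompt: {prompt_budget / GiB} GiB, decode: {decode_budget / GiB} GiB")


def would_capture(batch_size, seq_len, is_prompt,
                  total_mem, total_batch_seq, available_mem):
    """Mirror of the proportional estimate in warmup_graphs(): a new bucket's
    graph is assumed to cost memory in proportion to batch_size * seq_len
    (prompt) or batch_size (decode), relative to what was captured so far."""
    batch_seq = batch_size * seq_len if is_prompt else batch_size
    mem_estimate = batch_seq / total_batch_seq * total_mem
    return mem_estimate < available_mem


# After one (bs=2, seq=1024) prompt graph that measured 0.5 GiB:
total_mem, total_batch_seq = 0.5 * GiB, 2 * 1024
print(would_capture(4, 2048, True, total_mem, total_batch_seq,
                    prompt_budget - total_mem))    # estimate 2 GiB -> True
```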
+ if mem_post_decode + mem_post_prompt < graph_free_mem \ + and not decode_captured_all \ + and prompt_captured_all: + mem_post_decode, _, _ = self.warmup_graphs( + decode_strategy, + self.bucketing_global_state.decode_buckets, False, + kv_caches, + graph_free_mem - mem_post_prompt - mem_post_decode, + mem_post_decode, decode_batch_seq) + + self.log_graph_warmup_summary( + self.bucketing_global_state.prompt_buckets, True, + mem_post_prompt) + self.log_graph_warmup_summary( + self.bucketing_global_state.decode_buckets, False, + mem_post_decode) + + end_time = time.perf_counter() + end_mem = HabanaMemoryProfiler.current_device_memory_usage() + elapsed_time = end_time - start_time + msg = ( + f"Warmup finished in {elapsed_time:.0f} secs, " + f"allocated {format_bytes(end_mem - start_mem)} of device memory") + logger.info(msg) + self.profiler.end() + + @property + def vocab_size(self) -> int: + return self.model_config.get_vocab_size() + + @property + def mem_margin(self) -> Optional[int]: + return self._mem_margin + + @mem_margin.setter + def mem_margin(self, value): + self._mem_margin = value + + +def _maybe_wrap_in_hpu_graph(*args, **kwargs): + return htorch.hpu.wrap_in_hpu_graph( + HpuModelAdapter(*args, **kwargs), disable_tensor_cache=True + ) if htorch.utils.internal.is_lazy() else HpuModelAdapter(*args, **kwargs) + + +class HabanaProfilerCounterHelper: + + def __init__(self): + self.niter = 0 + self.average_real_throughput = None + self.logged_once = False + self.real_seq_lens = [] + self.prompt_seq_lens = [] + + def capture_seq_group_metadata_stats(self, seq_group_metadata_list): + self.real_seq_lens = [ + len(seq_data.prompt_token_ids) + len(seq_data.output_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + self.prompt_seq_lens = [ + len(seq_data.prompt_token_ids) + for seq_group_metadata in seq_group_metadata_list + for seq_data in seq_group_metadata.seq_data.values() + ] + + def get_counter_dict(self, cache_config, duration, seq_len, + batch_size_padded, real_batch_size, is_prompt): + throughput = batch_size_padded / (duration / 1e6) + throughput_effective = real_batch_size / (duration / 1e6) + + real_max_seq_len = max(self.real_seq_lens) + real_num_tokens = sum(self.real_seq_lens) + padded_num_tokens = batch_size_padded * seq_len + batch_token_utilization = real_num_tokens / padded_num_tokens + if self.average_real_throughput is None: + self.average_real_throughput = throughput_effective + else: # https://www.heikohoffmann.de/htmlthesis/node134.html + self.average_real_throughput = self.average_real_throughput + 1 / ( + self.niter + 1) * (throughput_effective - + self.average_real_throughput) + phase = "prompt" if is_prompt else "decode" + counters = { + f'{phase}_bucket_batch_size': batch_size_padded, + f'{phase}_batch_size': real_batch_size, + f'{phase}_bucket_seq_len': seq_len, + f'{phase}_seq_len': real_max_seq_len, + f'{phase}_bucket_gen_throughput': throughput, + f'{phase}_real_gen_throughput': throughput_effective, + f'{phase}_batch_token_utilization': batch_token_utilization, + 'average_real_throughput': self.average_real_throughput, + 'engine_iteration': self.niter, + } + self.niter += 1 + if is_prompt: + prompt_bucket_in_throughput = (seq_len * batch_size_padded) / ( + duration / 1e6) + prompt_real_in_throughput = sum( + self.prompt_seq_lens) / (duration / 1e6) + counters[ + f'{phase}_bucket_in_throughput'] = prompt_bucket_in_throughput + counters[f'{phase}_real_in_throughput'] = 
prompt_real_in_throughput + + # KV cache might not be created yet (e.g. for profiling run) + if cache_config.num_gpu_blocks is not None and \ + cache_config.num_gpu_blocks != 0: + cache_num_blocks_used = [ + math.ceil(sl / cache_config.block_size) + for sl in self.real_seq_lens + ] + cache_total_num_blocks_used = sum(cache_num_blocks_used) + num_cache_blocks = cache_config.num_gpu_blocks + cache_total_num_free_blocks = \ + num_cache_blocks - cache_total_num_blocks_used + cache_computed_utilization = \ + cache_total_num_blocks_used / num_cache_blocks + max_blocks_per_seq = math.ceil(seq_len / cache_config.block_size) + batch_block_utilization = cache_total_num_blocks_used / ( + batch_size_padded * max_blocks_per_seq) + counters['cache_num_blocks_used'] = cache_total_num_blocks_used + counters['cache_num_free_blocks'] = cache_total_num_free_blocks + counters['cache_computed_utilization'] = cache_computed_utilization + counters[ + f'{phase}_batch_block_utilization'] = batch_block_utilization + if not self.logged_once: + counters['const_cache_num_blocks'] = cache_config.num_gpu_blocks + counters[ + 'const_gpu_memory_utilization'] = \ + cache_config.gpu_memory_utilization + counters['const_block_size'] = cache_config.block_size + self.logged_once = True + return counters + + +def unwrap_model(model): + if isinstance(model, torch._dynamo.eval_frame.OptimizedModule): + return unwrap_model(model._orig_mod) + else: + model = list(vars(model)['_modules'].values())[0] + modules = list(vars(model)['_modules'].values()) + return modules + + +class HPUModelRunner(HPUModelRunnerBase[ModelInputForHPUWithSamplingMetadata]): + """ + GPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForHPUWithSamplingMetadata] = ( + ModelInputForHPUWithSamplingMetadata) + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForHPUWithSamplingMetadata: + return ( + ModelInputForHPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + )) + + @torch.inference_mode() + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForHPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + The result tensors and data structure also batches input in prefill + -> decode order. For example, + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + If cuda graph is required, this API automatically pads inputs. 
+ """ + with self.profiler.record_event('internal', 'prepare_input_tensors'): + assert seq_group_metadata_list is not None + if self.profiler.enabled: + self.profiler_counter_helper.capture_seq_group_metadata_stats( + seq_group_metadata_list=seq_group_metadata_list) + model_input, sampling_metadata = self.prepare_input_tensors( + seq_group_metadata_list) + assert model_input.attn_metadata is not None + is_prompt = model_input.attn_metadata.is_prompt + + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + def finish_measurements(self): + from neural_compressor.torch.quantization import finalize_calibration + finalize_calibration(self.model.model) + + def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): + cfg = (batch_size, seq_len, is_prompt) + seen = cfg in self.seen_configs + self.seen_configs.add(cfg) + if not seen and not warmup_mode: + phase = 'prompt' if is_prompt else 'decode' + logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", + phase, batch_size, seq_len) + + def create_lora_mask(self, input_tokens: torch.Tensor, lora_ids: List[int], + is_prompt: bool): + ''' + This is a helper function to create the mask for lora computations. + Lora Mask is needed to ensure we match the correct lora weights for the + for the request. + For Prompt phase we have + lora_mask with shape (batch_size * seq_len, max_loras * max_rank) + lora_logits_mask with shape (batch_size, max_loras * max_rank) + For Decode phase we have both + lora_mask and lora_logits_mask with shape + (batch_size, max_loras * max_rank) + ''' + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + lora_index = 0 + + if self.lora_config: + if is_prompt: + lora_mask = torch.zeros( + input_tokens.shape[0] * input_tokens.shape[1], + (self.lora_config.max_loras) *\ + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + lora_logits_mask = torch.zeros( + input_tokens.shape[0], (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + ones = torch.ones(input_tokens.shape[1], + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + logit_ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_row = i * input_tokens.shape[1] + end_row = start_row + input_tokens.shape[1] + start_col = lora_index * self.lora_config.max_lora_rank + end_col = start_col + self.lora_config.max_lora_rank + lora_mask[start_row:end_row, start_col:end_col] = ones + lora_logits_mask[i, start_col:end_col] = logit_ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_logits_mask.to('hpu') + else: + lora_mask = torch.zeros(input_tokens.shape[0], + (self.lora_config.max_loras) * + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + ones = torch.ones(1, + self.lora_config.max_lora_rank, + dtype=self.lora_config.lora_dtype) + for i in range(len(lora_ids)): + if lora_ids[i] == 0: + continue + lora_index = self.lora_manager._adapter_manager.\ + lora_index_to_id.index(lora_ids[i]) + start_pos = lora_index * self.lora_config.max_lora_rank + end_pos = start_pos + self.lora_config.max_lora_rank + lora_mask[i, start_pos:end_pos] = ones + lora_mask = lora_mask.to('hpu') + lora_logits_mask = lora_mask + + return lora_mask, lora_logits_mask 
+ + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForHPUWithSamplingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + warmup_mode=False, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "num_steps > 1 is not supported in HPUModelRunner") + + if self.lora_config: + assert model_input.lora_requests is not None + assert model_input.lora_mapping is not None + self.set_active_loras(model_input.lora_requests, + model_input.lora_mapping) + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + assert input_tokens is not None + assert input_positions is not None + assert sampling_metadata is not None + assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt + assert is_prompt is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + lora_mask: torch.Tensor = None + lora_logits_mask: torch.Tensor = None + if self.lora_config: + assert model_input.lora_ids is not None + lora_mask, lora_logits_mask = self.create_lora_mask( + input_tokens, model_input.lora_ids, attn_metadata.is_prompt) + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors, + "lora_mask": lora_mask, + **(model_input.multi_modal_kwargs or {}), + } + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata.selected_token_indices + ) + + if self.lora_config: + LoraMask.setLoraMask( + lora_logits_mask.index_select( + 0, sampling_metadata.selected_token_indices)) + + # Compute the logits. + with self.profiler.record_event( + 'internal', ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. + if not self.is_driver_worker: + return [] + + if model_input.async_callback is not None: + model_input.async_callback() + + # Sample the next token. 
+ with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + output.outputs = output.outputs[:real_batch_size] + htorch.core.mark_step() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + return [output] + + def shutdown_inc(self): + can_finalize_inc = False + from contextlib import suppress + with suppress(AttributeError): + can_finalize_inc = (self.model_config.quantization == 'inc') and \ + (self.model.model is not None) and \ + self.inc_initialized_successfully and \ + not getattr(self, "_is_inc_finalized", False) + if can_finalize_inc: + from neural_compressor.torch.quantization import ( + finalize_calibration) + finalize_calibration(self.model.model) + self._is_inc_finalized = True + + def __del__(self): + self.shutdown_inc() diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py new file mode 100644 index 0000000000000..493f7a9fad098 --- /dev/null +++ b/vllm/worker/hpu_worker.py @@ -0,0 +1,410 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +import gc +import os +from typing import List, Optional, Set, Tuple, Type + +import habana_frameworks.torch as htorch # noqa:F401 +import torch +import torch.distributed +from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes + +import vllm.envs as envs +from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment) +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import ExecuteModelRequest +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.hpu_model_runner import HPUModelRunner +from vllm.worker.model_runner_base import ModelRunnerBase +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) + +logger = init_logger(__name__) + + +class HPUWorker(LocalOrDistributedWorkerBase): + """A worker class that executes (a partition of) the model on a HPU. + + Each worker is associated with a single HPU. The worker is responsible for + maintaining the KV cache and executing the model on the HPU. In case of + distributed inference, each worker is assigned a partition of the model. 
+ """ + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + ) -> None: + WorkerBase.__init__(self, vllm_config=vllm_config) + self.parallel_config.rank = rank + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker + if self.is_driver_worker: + assert self.rank == 0, "The driver worker must have rank 0." + + if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + + self.model_runner: HPUModelRunner = HPUModelRunner( + vllm_config=vllm_config, is_driver_worker=is_driver_worker) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: List[HPUCacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.hpu_cache: Optional[List[List[torch.tensor]]] = None + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.HPU, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() + + def _set_env_vars(self): + local_rank = self.local_rank + if self.parallel_config.world_size == 1: + local_rank = -1 + import os + os.environ["LOCAL_RANK"] = str(local_rank) + os.environ["ID"] = str(local_rank) + os.environ["WORLD_SIZE"] = str(self.parallel_config.world_size) + os.environ["RANK"] = str(self.rank) + + def init_device(self) -> None: + if self.device_config.device.type == "hpu": + self.device = torch.device("hpu") + torch.hpu.set_device(self.device) + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + if self.model_config.quantization == 'inc': + self._set_env_vars() + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + # Set random seed. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. + + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. + + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. + """ + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. 
+ + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + with HabanaMemoryProfiler() as m: + self.model_runner.profile_run() + torch.hpu.synchronize() + msg = ("Model profiling run " + f"took {m.get_summary_string()}") + logger.info(msg) + # At this point we should've allocated the maximum workspace for all + # recipes we will use the extra memory for graphs/blocks + free_hpu_memory = torch.hpu.mem_get_info()[0] + + cache_block_size = self.get_cache_block_size_bytes() + graph_reserved_mem = (float( + os.environ.get('VLLM_GRAPH_RESERVED_MEM', '0.1')) + if not self.model_config.enforce_eager else 0) + graph_headroom = 1 - graph_reserved_mem + available_hpu_memory = free_hpu_memory * \ + self.cache_config.gpu_memory_utilization + hpu_memory_margin = free_hpu_memory * ( + 1 - self.cache_config.gpu_memory_utilization) + self.model_runner.mem_margin = hpu_memory_margin + cache_size_bytes = available_hpu_memory * graph_headroom + graph_headroom_bytes = available_hpu_memory * (1 - graph_headroom) + msg = ( + f"Free device memory: {format_bytes(free_hpu_memory)}, " + f"{format_bytes(available_hpu_memory)} usable " + f"(gpu_memory_utilization={self.cache_config.gpu_memory_utilization})," + f" {format_bytes(graph_headroom_bytes)} reserved for HPUGraphs " + f"(VLLM_GRAPH_RESERVED_MEM={graph_reserved_mem}), " + f"{format_bytes(cache_size_bytes)} reserved for KV cache") + logger.info(msg) + num_hpu_blocks = int(cache_size_bytes // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_hpu_blocks = max(num_hpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + + gc.collect() + return num_hpu_blocks, num_cpu_blocks + + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. + + This also warms up the model, which may record CUDA graphs. + """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + with HabanaMemoryProfiler() as m: + self._init_cache_engine() + torch.hpu.synchronize() + msg = ("Initializing cache engine " + f"took {m.get_summary_string()}") + logger.info(msg) + self._warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None + self.cache_engine = [ + HPUCacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.hpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] + + def _warm_up_model(self) -> None: + # NOTE(kzawora): We should use virtual engine index here + # for pipeline parallelism. Using 0 for now. + assert self.hpu_cache is not None + self.model_runner.warmup_model(self.hpu_cache[0]) + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. 
+ set_random_seed(self.model_config.seed) + + def finish_measurements(self): + self.model_runner.finish_measurements() + + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + return self.hpu_cache + + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + virtual_engine = execute_model_req.virtual_engine + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", + dtype=torch.int64).view(-1, 2) + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + ) + + @torch.inference_mode() + def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. + if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) + + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_runner.add_lora(lora_request) + + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) + + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) + + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() + + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError( + "Prompt Adapter is not implemented for HPU backend.") + + def shutdown_inc(self): + self.model_runner.shutdown_inc() + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self) -> int: + """Get the size of the KV cache block size in bytes. 
+ """ + return HPUCacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) + + +def init_worker_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, +) -> None: + """Initialize the distributed environment.""" + init_distributed_environment(parallel_config.world_size, + rank, + distributed_init_method, + local_rank, + backend='hccl') + + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + if torch.distributed.is_initialized(): + torch_world_size = torch.distributed.get_world_size() + if torch_world_size != parallel_config.world_size: + raise RuntimeError( + "torch.distributed is already initialized but the torch world " + "size does not match parallel_config.world_size " + f"({torch_world_size} vs. {parallel_config.world_size}).") + elif not distributed_init_method: + raise ValueError( + "distributed_init_method must be set if torch.distributed " + "is not already initialized") + else: + torch.distributed.init_process_group( + backend="hccl", + world_size=parallel_config.world_size, + rank=rank, + init_method=distributed_init_method, + ) + + # A small all_reduce for warmup & checking conformance. + dummy_tensor_hpu = torch.ones(1).to('hpu') + torch.distributed.all_reduce(dummy_tensor_hpu) + assert dummy_tensor_hpu.item() == parallel_config.world_size + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). 
Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") + + +class HPUCacheEngine(CacheEngine): + + def _allocate_kv_cache( + self, + num_blocks: int, + device: str, + ) -> List[Tuple[torch.Tensor, torch.Tensor]]: + """Allocates KV cache on the specified device.""" + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, self.block_size, self.num_kv_heads, self.head_size) + kv_cache: List[Tuple[torch.Tensor, torch.Tensor]] = [] + for _ in range(self.num_attention_layers): + key_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + value_cache = torch.zeros(kv_cache_shape, + dtype=self.dtype, + device=device) + kv_layer = (key_cache, value_cache) + kv_cache.append(kv_layer) + return kv_cache diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 891637dafbb14..042f9f07eace6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -20,9 +20,7 @@ from vllm.attention.backends.utils import CommonAttentionState from vllm.compilation.compile_context import set_compile_context from vllm.compilation.levels import CompilationLevel -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture @@ -40,7 +38,8 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalRegistry) + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest @@ -48,10 +47,10 @@ LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.transformers_utils.config import uses_mrope -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, - flatten_2d_lists, is_pin_memory_available, - supports_dynamo, weak_ref_tensor) +from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, + async_tensor_h2d, flatten_2d_lists, + is_pin_memory_available, supports_dynamo, + weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -136,6 +135,18 @@ def from_broadcasted_tensor_dict( attn_backend, tensor_dict) return cls(**tensor_dict) + # Exclude `async_callback` to be able to pickle this object + def __getstate__(self): + state = self.__dict__.copy() + del state["async_callback"] + return state + + # TODO: What happens when we depickle this object? + # How can we update this callback to properly pass it to the engine? + def __setstate__(self, state): + self.__dict__.update(state) + self.__dict__.update({'async_callback': None}) + @dataclass(frozen=True) class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): @@ -241,7 +252,9 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. 
- multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, # Whether the prefix cache is hit (prefill only). prefix_cache_hit: bool = False, @@ -360,7 +373,8 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_modal_inputs = multi_modal_inputs + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit self.n_seqs = len(self.seq_ids) @@ -478,7 +492,7 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder_model: + self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -528,6 +542,9 @@ def _compute_for_prefix_cache_hit( # this may be larger than the sequence length if chunked # prefill is enabled. prefix_cache_len = len(computed_block_nums) * self.block_size + seq_group_metadata.seq_data[inter_data.seq_ids[ + seq_idx]].update_num_cached_tokens(prefix_cache_len) + # The number of so far computed prompt tokens in this sequence. context_len = inter_data.context_lens[seq_idx] # The total number of prompt tokens in this sequence. @@ -635,17 +652,28 @@ def _compute_prompt_adapter_input( def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): """If multi-modal data is given, add it to the input.""" - mm_data = seq_group_metadata.multi_modal_data + # NOTE: mm_data only includes the subset of multi-modal items that + # intersect with the current prefill positions. + positions = inter_data.input_positions[0] + mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + seq_group_metadata, + range(positions[0], positions[0] + len(positions))) if not mm_data: return - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_modal_inputs = mm_kwargs + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + inter_data.multi_modal_kwargs = mm_kwargs + inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. 
- if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -690,7 +718,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len = 0 - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() inter_data = self.init_cached_inter_data( @@ -816,7 +844,7 @@ def build(self) -> ModelInputForGPU: if not inter_data.is_prompt: max_decode_seq_len = max(max_decode_seq_len, max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: max_encoder_seq_len = max(max_encoder_seq_len, inter_data.encoder_seq_len) @@ -915,11 +943,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. - multi_modal_inputs_list = [ - data.multi_modal_inputs for data in self.inter_data_list - if data.multi_modal_inputs is not None + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None ] - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -945,32 +973,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config + cache_config = self.cache_config + self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states - self.observability_config = observability_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -1055,13 +1071,7 @@ def __init__( def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", @@ -1255,7 +1265,7 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, 
dummy_multi_modal_data = self.input_registry \ + dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry) @@ -1263,12 +1273,13 @@ def profile_run(self) -> None: seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders, ) seqs.append(seq) @@ -1371,12 +1382,6 @@ def list_prompt_adapters(self) -> Set[int]: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1392,22 +1397,22 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: per sequence in the batch. """ assert not self.model_config.enforce_eager - logger.info("Capturing the model for CUDA graphs. This may lead to " + logger.info("Capturing cudagraphs for decoding. This may lead to " "unexpected consequences if the model is not static. To " "run the model in eager mode, set 'enforce_eager=True' or " "use '--enforce-eager' in the CLI.") - logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + logger.info("If out-of-memory error occurs during cudagraph capture," + " consider decreasing `gpu_memory_utilization` or " + "switching to eager mode. You can also reduce the " + "`max_num_seqs` as needed to decrease memory usage.") start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() - if self.model_is_mrope: + if self.model_config.uses_mrope: input_positions = torch.tile(input_positions, (3, 1)) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. @@ -1443,7 +1448,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.attn_state.graph_capture_get_metadata_for_batch( batch_size, is_encoder_decoder_model=self.model_config. 
- is_encoder_decoder_model)) + is_encoder_decoder)) if self.lora_config: lora_mapping = LoRAMapping( @@ -1462,7 +1467,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder_model) + self.model_config.is_encoder_decoder) capture_inputs = { "input_ids": @@ -1493,7 +1498,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model.get_seqlen_agnostic_capture_inputs( batch_size) }) - if self.model_config.is_encoder_decoder_model: + if self.model_config.is_encoder_decoder: # add the additional inputs to capture for # encoder-decoder models. self._update_inputs_to_capture_for_enc_dec_model( @@ -1506,9 +1511,12 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner) end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory # This usually takes < 10 seconds. - logger.info("Graph capturing finished in %.0f secs.", elapsed_time) + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, @@ -1649,7 +1657,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index 86883cf152449..9e529f86b46bb 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -9,6 +9,7 @@ import torch from torch import is_tensor +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform @@ -46,9 +47,8 @@ def _init_attn_metadata_from_tensor_dict( # Extract the fields used to create AttentionMetadata. valid_attn_kwargs = {} for field in dataclasses.fields(attn_backend.get_metadata_cls()): - val = tensor_dict.pop(field.name, None) - if val is not None: - valid_attn_kwargs[field.name] = val + if field.name in tensor_dict: + valid_attn_kwargs[field.name] = tensor_dict.pop(field.name) attn_metadata = attn_backend.make_metadata(**valid_attn_kwargs) tensor_dict["attn_metadata"] = attn_metadata @@ -221,6 +221,22 @@ class ModelRunnerBase(ABC, Generic[T]): ModelRunnerInputBase subclass. 
""" + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + # Map of request_id -> generator used for seeded random sampling generators: Dict[str, torch.Generator] = {} diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index be2f0d79154d6..3ee0fb4dc943e 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -304,6 +304,7 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): # mypy: enable-error-code=type-var def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): + super().__init__(*args, **kwargs) # Check attention backend support. diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index bf66f32d7d244..1f982fe103366 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -27,17 +27,9 @@ def __init__(self, *args, **kwargs): # for multi-step model, wrap the model runner with MultiStepModelRunner self.model_runner = MultiStepModelRunner( base_model_runner, - base_model_runner.model_config, - base_model_runner.parallel_config, - base_model_runner.scheduler_config, - base_model_runner.device_config, - base_model_runner.cache_config, - load_config=base_model_runner.load_config, - lora_config=self.lora_config, + vllm_config=base_model_runner.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=base_model_runner.is_driver_worker, - prompt_adapter_config=base_model_runner.prompt_adapter_config, - observability_config=base_model_runner.observability_config, ) pipeline_parallel_size = self.parallel_config.pipeline_parallel_size diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index b8c760c4b5396..ae4eb6ba6eaec 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -7,14 +7,13 @@ from torch import nn from transformers_neuronx.config import GenerationConfig -from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase @@ -57,25 +56,19 @@ class NeuronModelRunner(ModelRunnerBase[ModelInputForNeuron]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, + vllm_config: VllmConfig, ): - self.model_config = model_config - self.parallel_config = parallel_config - 
self.scheduler_config = scheduler_config - + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config if model_config is not None and model_config.get_sliding_window(): logger.warning("Sliding window is not supported on Neuron. " "The model will run without sliding window.") - self.device_config = (device_config - if device_config is not None else DeviceConfig()) self.device = self.device_config.device self.pin_memory = is_pin_memory_available() # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. @@ -130,7 +123,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -152,12 +145,15 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: - # Process multi-modal data - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, - ) - multi_modal_inputs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -175,7 +171,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) @@ -322,7 +318,7 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), ) diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py index fff14d6402b44..3f6269684ac93 100644 --- a/vllm/worker/neuron_worker.py +++ b/vllm/worker/neuron_worker.py @@ -4,15 +4,15 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.worker.neuron_model_runner import NeuronModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): @@ -21,20 +21,12 @@ class NeuronWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, ) -> None: - self.model_config = model_config - 
self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config + WorkerBase.__init__(self, vllm_config=vllm_config) self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -44,7 +36,7 @@ def __init__( init_cached_hf_modules() self.model_runner: NeuronModelRunner = NeuronModelRunner( - model_config, parallel_config, scheduler_config, device_config) + vllm_config=vllm_config) self.is_driver_worker = True def init_device(self) -> None: diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a164fbe3393c4..6000e5dfe4e30 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -1,4 +1,5 @@ -from typing import List, NamedTuple, Optional, Tuple +from collections import defaultdict +from typing import Dict, List, NamedTuple, Optional, Tuple import openvino as ov import torch @@ -6,16 +7,15 @@ from vllm.attention import get_attn_backend from vllm.attention.backends.openvino import OpenVINOAttentionMetadata -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata +from vllm.worker.model_runner_base import ModelRunnerBase logger = init_logger(__name__) @@ -38,33 +38,21 @@ def empty(cls, device): multi_modal_kwargs={}) -class OpenVINOModelRunner: +class OpenVINOModelRunner(ModelRunnerBase): def __init__( self, ov_core: ov.Core, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, *args, **kwargs, ): self.ov_core = ov_core - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.multimodal_config = multimodal_config - self.load_config = load_config + ModelRunnerBase.__init__(self, vllm_config=vllm_config) + cache_config = self.cache_config + model_config = self.model_config self.is_driver_worker = is_driver_worker self.device = self.device_config.device @@ -82,7 +70,8 @@ def __init__( ) # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. 
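The hunks above (multi_step_model_runner, multi_step_worker, the neuron runner/worker, and the start of the OpenVINO runner) all apply the same refactor: the long list of per-topic config parameters is replaced by a single VllmConfig argument, and the shared base class fans it out into the self.model_config / self.cache_config / ... attributes the rest of the code already reads. A minimal, self-contained sketch of that shape follows; the classes below are simplified stand-ins written for illustration, not vLLM's real implementations.

# Illustration only: simplified stand-ins for the consolidated-config pattern.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelConfig:
    model: str = "facebook/opt-125m"


@dataclass
class CacheConfig:
    cache_dtype: str = "auto"


@dataclass
class VllmConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)
    cache_config: CacheConfig = field(default_factory=CacheConfig)
    lora_config: Optional[object] = None


class ModelRunnerBase:
    def __init__(self, vllm_config: VllmConfig) -> None:
        # The base class unpacks the top-level config into the per-topic
        # attributes that subclasses already expect.
        self.vllm_config = vllm_config
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
        self.lora_config = vllm_config.lora_config


class MyBackendModelRunner(ModelRunnerBase):
    def __init__(self, vllm_config: VllmConfig,
                 kv_cache_dtype: Optional[str] = "auto",
                 is_driver_worker: bool = False) -> None:
        # One consolidated argument instead of six-plus config objects;
        # backend-specific extras stay as explicit keyword arguments.
        ModelRunnerBase.__init__(self, vllm_config)
        self.kv_cache_dtype = kv_cache_dtype
        self.is_driver_worker = is_driver_worker


runner = MyBackendModelRunner(VllmConfig())
assert runner.model_config.model == "facebook/opt-125m"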
@@ -114,7 +103,10 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) subsequence_begins: List[int] = [] block_indices: List[int] = [] @@ -168,15 +160,6 @@ def _prepare_model_input( and self.sliding_window is None and is_prompt) - mm_data = seq_group_metadata.multi_modal_data - if mm_data: - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata. - mm_processor_kwargs, - ) - multi_modal_inputs_list.append(mm_kwargs) - block_table = seq_group_metadata.block_tables[seq_id] # TODO(sang): Combine chunked prefill and prefix caching by # only allowing multiple of block_size chunk size. @@ -220,7 +203,8 @@ def _prepare_model_input( query_lens.append(query_len) input_tokens.extend(tokens) - input_positions.extend(list(range(computed_len, seq_len))) + positions_range = range(computed_len, seq_len) + input_positions.extend(list(positions_range)) past_lens.append(computed_len) subsequence_begins.append(subsequence_begins[-1] + query_len) @@ -233,6 +217,26 @@ def _prepare_model_input( ), "seq_len: {}, computed_len: {}, query_len: {}".format( seq_len, computed_len, query_len) + if seq_group_metadata.multi_modal_data: + # NOTE: mm_data only includes the subset of multi-modal + # items that intersect with the current prefill positions. + mm_data, placeholder_maps = MultiModalPlaceholderMap \ + .from_seq_group(seq_group_metadata, positions_range) + + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) + + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map, ) + max_query_len = max(query_lens) assert max_query_len > 0, "query_lens: {}".format(query_lens) @@ -261,15 +265,22 @@ def _prepare_model_input( max_context_len, dtype=torch.int32, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } + attn_metadata = self.attn_backend.make_openvino_metadata( past_lens=past_lens_tensor, subsequence_begins=subsequence_begins_tensor, block_indices=block_indices_tensor, block_indices_begins=block_indices_begins_tensor, max_context_len=max_context_len_tensor, + multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return ModelInput( input_tokens, @@ -335,7 +346,7 @@ def execute_model( kv_caches, "attn_metadata": attn_metadata, - **MultiModalInputs.as_kwargs(multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {}, device=self.device), } @@ -350,3 +361,9 @@ def execute_model( sampling_metadata=sampling_metadata, ) return output + + def prepare_model_input(self, *args, **kwargs): + raise NotImplementedError + + def make_model_input_from_broadcasted_tensor_dict(self, *args, **kwargs): + raise NotImplementedError diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index a420d390c1ae4..205f8a337ce6c 100644 --- a/vllm/worker/openvino_worker.py 
+++ b/vllm/worker/openvino_worker.py @@ -7,9 +7,8 @@ import vllm.envs as envs from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) +from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, + ParallelConfig, VllmConfig) from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) @@ -22,7 +21,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.worker.openvino_model_runner import OpenVINOModelRunner -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase logger = init_logger(__name__) @@ -212,33 +211,19 @@ class OpenVINOWorker(LoraNotSupportedWorkerBase): def __init__( self, ov_core: ov.Core, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, kv_cache_dtype: Optional[ov.Type] = ov.Type.undefined, is_driver_worker: bool = False, ) -> None: self.ov_core = ov_core - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.multimodal_config = multimodal_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." 
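The _prepare_model_input changes in openvino_model_runner.py above (and the matching xpu_model_runner.py hunks later in this patch) add per-modality placeholder bookkeeping: each prefilled sequence group yields a MultiModalPlaceholderMap per modality via from_seq_group(), the maps are merged into a defaultdict, and the merged maps are flattened with index_map() into the multi_modal_placeholder_index_maps handed to the attention metadata. The sketch below shows only that aggregation flow with a toy stand-in class; it is not vLLM's MultiModalPlaceholderMap, and the real index_map() returns a richer structure than a flat list of positions.

# Illustration only: a toy stand-in for the placeholder-map aggregation above.
from collections import defaultdict
from typing import Dict, List


class ToyPlaceholderMap:
    """Tracks the token positions occupied by one modality's placeholders."""

    def __init__(self) -> None:
        self._positions: List[int] = []

    def add(self, position: int) -> None:
        self._positions.append(position)

    def extend(self, other: "ToyPlaceholderMap") -> None:
        # Merge the placeholders contributed by one sequence group.
        self._positions.extend(other._positions)

    def index_map(self) -> List[int]:
        return list(self._positions)


def collect_placeholder_index_maps(
    per_group_maps: List[Dict[str, ToyPlaceholderMap]],
) -> Dict[str, List[int]]:
    merged: Dict[str, ToyPlaceholderMap] = defaultdict(ToyPlaceholderMap)
    for group_maps in per_group_maps:
        for modality, placeholder_map in group_maps.items():
            merged[modality].extend(placeholder_map)
    # Same shape as the placeholder_index_maps dict passed to
    # make_openvino_metadata(...) in the hunk above.
    return {modality: pm.index_map() for modality, pm in merged.items()}


image_map = ToyPlaceholderMap()
for pos in (4, 5, 6):
    image_map.add(pos)
print(collect_placeholder_index_maps([{"image": image_map}]))
# -> {'image': [4, 5, 6]}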
@@ -250,14 +235,7 @@ def __init__( init_cached_hf_modules() self.model_runner = OpenVINOModelRunner( self.ov_core, - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, - multimodal_config=self.multimodal_config, + vllm_config=self.vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, ) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 87ced7818a676..a721186137328 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -12,8 +12,7 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model @@ -90,20 +89,10 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, is_driver_worker: bool = False, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.is_driver_worker = is_driver_worker self.block_size = self.cache_config.block_size @@ -148,15 +137,7 @@ def load_model(self) -> None: "vllm.model_executor.layers.vocab_parallel_embedding." "get_tensor_model_parallel_rank", return_value=xm_tp_rank): - model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=self.device_config, - parallel_config=self.parallel_config, - cache_config=self.cache_config, - scheduler_config=self.scheduler_config, - lora_config=None, - ) + model = get_model(vllm_config=self.vllm_config) model = model.eval() xm.wait_device_ops() self.model = ModelWrapper(model) @@ -184,6 +165,7 @@ def _dummy_run( num_prefill_tokens=batch_size * seq_len, num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=None, context_lens=None, ) @@ -216,6 +198,7 @@ def _dummy_run( num_prefill_tokens=0, num_decode_tokens=batch_size * seq_len, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=block_tables, context_lens=context_lens, ) @@ -360,6 +343,7 @@ def _prepare_prompt( num_prefill_tokens=0, # NOTE: This is not used. 
num_decode_tokens=0, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=None, context_lens=None, ) @@ -429,6 +413,7 @@ def _prepare_decode( num_prefill_tokens=0, num_decode_tokens=batch_size, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, block_tables=block_tables, context_lens=context_lens, ) diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index de6f7ab0072fd..096cb23416909 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -6,8 +6,7 @@ import torch_xla.runtime as xr import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - ParallelConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -16,7 +15,8 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size from vllm.worker.tpu_model_runner import TPUModelRunner from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, - LoraNotSupportedWorkerBase, WorkerInput) + LoraNotSupportedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -25,24 +25,14 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, is_driver_worker: bool, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config=vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -56,13 +46,7 @@ def __init__( self.cache_config.cache_dtype] self.model_runner: TPUModelRunner = TPUModelRunner( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config, - is_driver_worker=is_driver_worker) + vllm_config=vllm_config, is_driver_worker=is_driver_worker) def init_device(self) -> None: os.environ["PJRT_DEVICE"] = "TPU" diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index fd30962e5d6bb..d8c8011a585d8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -7,10 +7,7 @@ import torch.distributed import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) @@ -27,7 +24,8 @@ from vllm.worker.embedding_model_runner import EmbeddingModelRunner from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) logger = init_logger(__name__) @@ -42,46 +40,31 @@ class Worker(LocalOrDistributedWorkerBase): def __init__( self, - model_config: 
ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - observability_config: Optional[ObservabilityConfig] = None, ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config + WorkerBase.__init__(self, vllm_config) self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker - if parallel_config and is_driver_worker: - assert rank % parallel_config.tensor_parallel_size == 0, \ + if is_driver_worker: + assert rank % self.parallel_config.tensor_parallel_size == 0, \ "Driver worker should be rank 0 of tensor parallel group." if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.observability_config = observability_config # Return hidden states from target model if the draft model is an # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config speculative_args = {} if speculative_config is None \ or (speculative_config.draft_model_config.model == model_config.model) \ @@ -94,20 +77,12 @@ def __init__( ModelRunnerClass = model_runner_cls elif model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner - elif self._is_encoder_decoder_model(): + elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=load_config, - lora_config=self.lora_config, + vllm_config=self.vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config, **speculative_args, ) # Uninitialized cache engine. Will be initialized by @@ -144,9 +119,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6ba4f272315ce..cf8a4946a71c4 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -7,7 +7,7 @@ import torch -from vllm.config import ObservabilityConfig +from vllm.config import ObservabilityConfig, VllmConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -29,6 +29,22 @@ class WorkerBase(ABC): communicate request metadata to other workers. 
""" + def __init__( + self, + vllm_config: VllmConfig, + ) -> None: + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.load_config = vllm_config.load_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config + self.prompt_adapter_config = vllm_config.prompt_adapter_config + self.observability_config = vllm_config.observability_config + @abstractmethod def init_device(self) -> None: """Initialize device state, such as loading the model or other on-device diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 75a6de3b24ba4..e6322e095bbb9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -1,6 +1,7 @@ import dataclasses import time import weakref +from collections import defaultdict from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar) @@ -9,9 +10,7 @@ import torch.nn as nn from vllm.attention import get_attn_backend -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import VllmConfig from vllm.distributed import get_pp_group from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -19,7 +18,8 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalRegistry) + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import DeviceMemoryProfiler, make_tensor_with_pad @@ -160,7 +160,10 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_placeholder_maps: Dict[ + str, + MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt @@ -179,7 +182,29 @@ def _prepare_prompt( # Token position ids # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. - input_positions.extend(list(range(computed_len, seq_len))) + positions_range = range(computed_len, seq_len) + input_positions.extend(list(positions_range)) + + if seq_group_metadata.multi_modal_data: + # NOTE: mm_data only includes the subset of multi-modal items + # that intersect with the current prefill positions. 
+ mm_data, placeholder_maps = MultiModalPlaceholderMap \ + .from_seq_group(seq_group_metadata, positions_range) + + if self.runner.mm_registry.has_processor( + self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.runner.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) + + for modality, placeholder_map in placeholder_maps.items(): + multi_modal_placeholder_maps[modality].extend( + placeholder_map) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -220,6 +245,11 @@ def _prepare_prompt( slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device) # type: ignore + placeholder_index_maps = { + modality: placeholder_map.index_map() + for modality, placeholder_map in + multi_modal_placeholder_maps.items() + } max_seqlen = max(seq_lens) tmp = [0] @@ -230,6 +260,7 @@ def _prepare_prompt( attn_metadata = self.attn_backend.make_metadata( is_prompt=True, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=placeholder_index_maps, seq_lens=seq_lens, seqlen_q=seqlen_q, max_seqlen=max_seqlen, @@ -241,7 +272,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -313,6 +344,7 @@ def _prepare_decode( attn_metadata = self.attn_backend.make_metadata( is_prompt=False, slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, seq_lens=seq_lens, seqlen_q=torch.tensor([]), max_seqlen=0, @@ -337,33 +369,18 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPUWithSamplingMetadata]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + + ModelRunnerBase.__init__(self, vllm_config=vllm_config) + model_config = self.model_config + cache_config = self.cache_config self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config - if self.observability_config is not None: - print(f"observability_config is {self.observability_config}") self.return_hidden_states = return_hidden_states self.device = self.device_config.device @@ -396,15 +413,7 @@ def __init__( def load_model(self) -> None: with DeviceMemoryProfiler() as m: - self.model = get_model( - model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - 
cache_config=self.cache_config, - ) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", @@ -450,7 +459,7 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, dummy_multi_modal_data = self.input_registry \ + dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry) @@ -458,12 +467,12 @@ def profile_run(self) -> None: seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, lora_request=None, - multi_modal_data=dummy_multi_modal_data, - ) + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders) seqs.append(seq) # Run the model with the dummy inputs. @@ -564,7 +573,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device)) # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index c1d836bb0d318..1295666055b04 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -8,10 +8,7 @@ import torch import torch.distributed -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import VllmConfig from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -19,7 +16,7 @@ from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker -from vllm.worker.worker_base import LoraNotSupportedWorkerBase +from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase from vllm.worker.xpu_model_runner import XPUModelRunner logger = init_logger(__name__) @@ -36,53 +33,32 @@ class XPUWorker(LoraNotSupportedWorkerBase, Worker): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, + vllm_config: VllmConfig, local_rank: int, rank: int, distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, - observability_config: Optional[ObservabilityConfig] = None, ) -> None: + WorkerBase.__init__(self, vllm_config=vllm_config) + device_config = self.device_config + parallel_config = self.parallel_config assert device_config.device_type == "xpu" assert current_platform.is_xpu() - self.model_config = model_config - self.parallel_config = parallel_config self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.load_config = load_config + self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - 
self.prompt_adapter_config = prompt_adapter_config self.is_driver_worker = is_driver_worker - self.observability_config = observability_config if parallel_config and is_driver_worker: assert rank % parallel_config.tensor_parallel_size == 0, \ "Driver worker should be rank 0 of tensor parallel group." self.model_runner = XPUModelRunner( # type: ignore - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=self.load_config, - lora_config=self.lora_config, + vllm_config=vllm_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, - observability_config=self.observability_config, ) # Uninitialized cache engine. Will be initialized by # initialize_cache.
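With these changes, the worker constructors in this patch take the consolidated VllmConfig plus a handful of per-process arguments (local_rank, rank, distributed_init_method, and backend-specific extras such as kv_cache_dtype or is_driver_worker). The usage sketch below is not part of the patch: it shows how a call site might construct a GPU Worker after this change, assuming this revision of vLLM is installed, a CUDA device is available, and that EngineArgs.create_engine_config() returns the consolidated VllmConfig at this revision.

# Hedged usage sketch (not from this patch).
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
from vllm.worker.worker import Worker

# Build the engine-wide config once; it now travels as a single object.
vllm_config = EngineArgs(model="facebook/opt-125m").create_engine_config()

worker = Worker(
    vllm_config=vllm_config,
    local_rank=0,
    rank=0,
    distributed_init_method=get_distributed_init_method(
        get_ip(), get_open_port()),
    is_driver_worker=True,  # rank 0 of the tensor-parallel group
)
worker.init_device()
worker.load_model()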