diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index cd2bfd8bb5bf4..79526adef2a79 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,6 +18,8 @@ source /etc/environment docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test function cpu_tests() { + set -e + # Run basic model test docker exec cpu-test bash -c " set -e @@ -25,8 +27,7 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m cpu_model diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8d4f4d1a681f2..b3771bb268e22 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,9 +4,13 @@ # It serves as a sanity check for compilation and basic model usage. set -ex +# Allow binding to different cores +CORE_RANGE=${CORE_RANGE:-48-95} +NUMA_NODE=${NUMA_NODE:-1} + # Try building the docker image -numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . -numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu . +numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } @@ -14,12 +18,14 @@ trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ + --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ + --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { + set -e + # offline inference docker exec cpu-test-avx2 bash -c " set -e @@ -32,8 +38,7 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m cpu_model @@ -56,7 +61,7 @@ function cpu_tests() { docker exec cpu-test bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=48-92 + export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e8456357e6db1..fbaa427bb7270 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -165,6 +165,14 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1 + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 37d93a1277974..0085a1cc22373 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.12' diff --git a/CMakeLists.txt b/CMakeLists.txt index 376565583d928..5acbd762ee957 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures. 
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") # # Supported/expected torch versions for CUDA/ROCm. @@ -187,7 +187,8 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") set(VLLM_EXT_SRC "csrc/cache_kernels.cu" - "csrc/attention/attention_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8beae68289997..6d46a6dca371d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,3 @@ # Contributing to vLLM -You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview/). +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 47e40e015239a..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c2a40000aab4b..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cuh similarity index 64% rename from csrc/attention/attention_kernels.cu rename to csrc/attention/attention_kernels.cuh index bcd170411e7cb..563e1438f0b01 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cuh @@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel( } // namespace vllm -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ - scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -// TODO(woosuk): Tune NUM_THREADS. 
-template -void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = - DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_seq_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V1(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V1(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v1_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); - -#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ - value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ - max_num_partitions); - -template -void paged_attention_v2_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? 
reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V2(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V2(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v2_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. 
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& - tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V2_LAUNCHER_BLOCK_SIZE) -} - #undef WARP_SIZE #undef MAX #undef MIN diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu new file mode 100644 index 0000000000000..8b99f0843aaf6 --- /dev/null +++ b/csrc/attention/paged_attention_v1.cu @@ -0,0 +1,193 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel), \ + shared_mem_size); \ + vllm::paged_attention_v1_kernel \ + <<>>( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ + scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +// TODO(woosuk): Tune NUM_THREADS. +template +void paged_attention_v1_launcher( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int padded_max_seq_len = + DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; + int logits_size = padded_max_seq_len * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len + // Keep that in sync with the logic here! + int shared_mem_size = std::max(logits_size, outputs_size); + + dim3 grid(num_heads, num_seqs, 1); + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. 
+ case 64: + LAUNCH_PAGED_ATTENTION_V1(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V1(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V1(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V1(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V1(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V1(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v1_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v1( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu new file mode 100644 index 0000000000000..3a7a9dee916aa --- /dev/null +++ b/csrc/attention/paged_attention_v2.cu @@ -0,0 +1,203 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template +void paged_attention_v2_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); + int logits_size = PARTITION_SIZE * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + + // For paged attention v2 kernel. + dim3 grid(num_heads, num_seqs, max_num_partitions); + int shared_mem_size = std::max(logits_size, outputs_size); + // For paged attention v2 reduce kernel. 
+ dim3 reduce_grid(num_heads, num_seqs); + int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. + case 64: + LAUNCH_PAGED_ATTENTION_V2(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V2(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V2(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V2(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V2(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V2(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. 
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v2( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 46fef79f439fb..bd184ee22682e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr, - int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, 
paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index dac40ca2cfe75..18b502c786e1d 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,9 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "TOP_RIGHT"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; diff --git a/docs/source/assets/design/hierarchy.png b/docs/source/assets/design/hierarchy.png new file mode 100644 index 0000000000000..6a1b4ba9590ba Binary files /dev/null and b/docs/source/assets/design/hierarchy.png differ diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst new file mode 100644 index 0000000000000..15f0c8ccf77ee --- /dev/null +++ b/docs/source/design/class_hierarchy.rst @@ -0,0 +1,72 @@ +vLLM's Class Hierarchy +======================= + +This document describes the class hierarchy of vLLM. We will explain the relationships between the core classes, their responsibilities, and the design choices behind them to make vLLM more modular and extensible. + +1. **Entrypoints**: vLLM has two entrypoints: `command line usage `__ with ``vllm serve`` for launching an OpenAI-API compatible server, and `library-style usage `__ with the ``vllm.LLM`` class for running inference in a Python script. These are user-facing entrypoints that end-users interact with. Under the hood, both create an engine object to handle model inference. + +2. **Engine**: Each vLLM instance contains one engine object, orchestrating and serving as the control plane for model inference. Depending on the configuration, the engine can create multiple workers to handle the inference workload. + +3. **Worker**: A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their ``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while ``local_rank`` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. + +4. **Model Runner**: Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. + +5. **Model**: Every model runner object has one model object, which is the actual ``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various configurations affect the class we ultimately get. + +The following figure shows the class hierarchy of vLLM: + + .. figure:: ../assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object containing all the necessary information. The `VllmConfig `__ class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily pass the configuration object around and access the configuration we need. Suppose we want to add a new feature (this is often the case given how fast the field of LLM inference is evolving) that only touches the model runner. We will have to add a new configuration option in the `VllmConfig` class. 
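A minimal sketch of this pattern (the ``my_new_flag`` field and ``MyModelRunner`` class below are hypothetical, for illustration only):

.. code-block:: python

    from dataclasses import dataclass

    @dataclass
    class VllmConfig:
        # ... existing sub-configs (model_config, cache_config, ...) elided ...
        my_new_flag: bool = False  # hypothetical new option, declared in one place

    class MyModelRunner:
        def __init__(self, vllm_config: VllmConfig):
            # Only the component that cares about the option reads it; the
            # engine and worker just pass ``vllm_config`` through unchanged.
            self.my_new_flag = vllm_config.my_new_flag
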
Since we pass the whole config object around, we only need to add the configuration option to the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option. + +2. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. + +.. note:: + + To support this change, all vLLM models' signatures have been updated to: + + .. code-block:: python + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + .. code-block:: python + + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + + This way, the model can work with both old and new versions of vLLM. + +3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. 
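As a rough, self-contained illustration of sharding at initialization (a simplified column-parallel linear layer, not vLLM's actual implementation):

.. code-block:: python

    import torch.nn as nn

    class ShardedLinear(nn.Module):
        def __init__(self, in_features: int, out_features: int, tp_size: int):
            super().__init__()
            assert out_features % tp_size == 0
            # Each rank allocates only its 1/tp_size slice of the weights, so
            # the full matrix never has to materialize on a single device.
            self.shard = nn.Linear(in_features, out_features // tp_size)

        def forward(self, x):
            # Returns this rank's slice of the output; an all-gather across
            # ranks (omitted here) reconstructs the full activation.
            return self.shard(x)
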
Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file. + +One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. + +In summary, the complete config object ``VllmConfig`` can be treated as an engine-level global state that is shared among all vLLM classes. diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst index 20394bd311ea9..e6c1cea6001ea 100644 --- a/docs/source/design/huggingface_integration.rst +++ b/docs/source/design/huggingface_integration.rst @@ -1,3 +1,5 @@ +.. _huggingface_integration: + Integration with HuggingFace =================================== @@ -5,27 +7,25 @@ This document describes how vLLM integrates with HuggingFace libraries. We will Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. -1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM will first try to locate the config file ``config.json`` using this argument. See the `code snippet `__ for the implementation. +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: - - If the ``model`` argument is a local path, vLLM will directly read the config file from the path. + - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + + - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. - - Otherwise, vLLM will try to read the config from the HuggingFace cache. See `their website `__ for more information on how the HuggingFace cache works. Here, we can also use the argument ``--revision`` to specify the revision of the model in the cache. + - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. 
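The effect of this lookup can be reproduced with ``huggingface_hub`` directly (a sketch of the behavior, not the exact call vLLM makes):

.. code-block:: python

    from huggingface_hub import hf_hub_download

    # Returns the cached path if the file is already in the local HF cache,
    # otherwise downloads it; HF_TOKEN is picked up from the environment for
    # gated or private repositories.
    path = hf_hub_download(repo_id="Qwen/Qwen2-7B",
                           filename="config.json",
                           revision="main")
    print(path)
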
- - If neither of the above works, vLLM will download the config file from the HuggingFace model hub, using the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. -2. After obtaining the config file, vLLM will load the config into a dictionary. It first `inspects `__ the ``model_type`` field in the config to determine the model type and config class to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. +3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. -3. After obtaining the config object, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. - -4. The config object is `attached `__ as the ``hf_config`` field to vLLM's ``model_config`` object. - -5. After vLLM obtains the config object, it will use the ``architectures`` field to determine the model class to initialize. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``. vLLM maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. -6. Finally, we reach the model class we want to initialize, i.e., the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. 
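The inputs to this lookup can be inspected with ``transformers`` alone (a sketch; vLLM additionally applies the config patches described above):

.. code-block:: python

    from transformers import AutoConfig

    # Resolves the config class from the ``model_type`` field; repository code
    # is only executed when trust_remote_code=True.
    config = AutoConfig.from_pretrained("Qwen/Qwen2-7B", revision="main",
                                        trust_remote_code=False)
    print(config.model_type)     # "qwen2"
    print(config.architectures)  # ["Qwen2ForCausalLM"], the key into the registry
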
If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. Beyond that, there are two more things vLLM depends on HuggingFace for. @@ -33,7 +33,7 @@ Beyond that, there are two more things vLLM depends on HuggingFace for. 2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: This completes the integration between vLLM and HuggingFace. diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 6bf170b164fb8..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -62,7 +62,7 @@ Build from source .. code-block:: console $ pip install --upgrade pip - $ pip install cmake>=3.26,<=3.30 wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - Finally, build and install vLLM CPU backend: diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 91978065faf42..77bf550601346 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. 
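Tensor parallelism can be enabled from the Python API or the command line (standard vLLM usage):

.. code-block:: python

    from vllm import LLM

    # Shards the model weights across 4 GPUs on one node; equivalent to
    # passing --tensor-parallel-size 4 to ``vllm serve``.
    llm = LLM(model="facebook/opt-13b", tensor_parallel_size=4)
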
In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. @@ -75,6 +79,9 @@ If GPU/CPU communication cannot be established, you can use the following Python print("PyTorch GLOO is successful!") + if world_size <= 1: + exit() + # Test vLLM NCCL, with cuda graph from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator @@ -119,6 +126,8 @@ If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes` If the script runs successfully, you should see the message ``sanity check is successful!``. +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. + .. note:: A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: diff --git a/docs/source/index.rst b/docs/source/index.rst index 75e6db8f70382..77c71afb13ac5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -157,16 +157,17 @@ Documentation :maxdepth: 2 :caption: Design + design/class_hierarchy + design/huggingface_integration design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index - design/huggingface_integration -.. Contributing: contributing to the vLLM project +.. For Developers: contributing to the vLLM project .. toctree:: :maxdepth: 2 - :caption: Contributing + :caption: For Developers contributing/overview contributing/profiling/profiling_index diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 3d0d1aec69845..49b5285c45590 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i 3. Register maximum number of multi-modal tokens ------------------------------------------------ -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. .. 
code-block:: diff diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5a474043078db..161733c049bbe 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -450,7 +450,7 @@ Text Generation - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - - + - ✅︎ - * - :code:`InternVLChatModel` - InternVL2 @@ -538,7 +538,7 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ - ✅︎ @@ -584,6 +584,12 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 112e9db6a41de..bcbe50a25fa09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 7882e14f3b849..f39997e0e44d9 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -13,3 +13,4 @@ Integrations deploying_with_dstack serving_with_langchain serving_with_llamaindex + serving_with_llamastack diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 9b29ca66022cb..78965813b1213 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -162,7 +162,7 @@ vllm serve --chat-template ./path-to-chat-template.jinja vLLM community provides a set of chat templates for popular models. You can find them in the examples directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) -With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. 
An example is provided below:
```python
completion = client.chat.completions.create(
@@ -172,10 +172,10 @@
    ]
)
```
-Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like 
+Most chat templates for LLMs expect the `content` to be a `string`, but there are some newer models like
`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify
-between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match 
+between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match
this, unless explicitly specified.
@@ -191,8 +191,8 @@ this, unless explicitly specified.
### Config file

The `serve` module can also accept arguments from a config file in
-`yaml` format. The arguments in the yaml must be specified using the 
-long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): 
+`yaml` format. The arguments in the YAML must be specified using the
+long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server):

For example:
@@ -208,44 +208,44 @@ uvicorn-log-level: "info"
$ vllm serve SOME_MODEL --config config.yaml
```
---
-**NOTE** 
+**NOTE**
In case an argument is supplied simultaneously via the command line and the config file, the value from the command line will take precedence.
The order of priorities is `command line > config file values > defaults`.
---

## Tool calling in the chat completion API
-
-vLLM supports named function calling and `auto` tool choice in the chat completion API. The `tool_choice` options `required` is **not yet supported** but on the roadmap.
+vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but is on the roadmap.

It is the caller's responsibility to prompt the model with the tool information; vLLM will not automatically manipulate the prompt.
+Please see below for the recommended configuration and chat templates to use for function calling with the different models.

### Named Function Calling
-vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is
-enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a
-high-quality one. 
+vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is
+enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a
+high-quality one.

vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.

-To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
-specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.
+To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and
+specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request.

### Automatic Function Calling
To enable this feature, you should set the following flags:
-* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it 
+* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. Tells vLLM that you want to enable the model to generate its own tool calls when it
deems appropriate.
-* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers 
+* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers
will continue to be added in the future, and you can also register your own tool parsers via `--tool-parser-plugin`.
* `--tool-parser-plugin` -- **optional** tool parser plugin used to register user-defined tool parsers into vLLM; the registered tool parser name can be specified in `--tool-call-parser`.
-* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages 
-that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their 
-`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat 
+* `--chat-template` -- **optional** for auto tool choice. The path to the chat template which handles `tool`-role messages and `assistant`-role messages
+that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their
+`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat
template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json)

-If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! 
+If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template!

#### Hermes Models (`hermes`)
@@ -256,8 +256,8 @@ All Nous Research Hermes-series models newer than Hermes 2 Pro should be support

* `NousResearch/Hermes-3-*`

-_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge 
-step in their creation_. 
+_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge
+step in their creation_.

Flags: `--tool-call-parser hermes`
@@ -269,9 +269,9 @@ Supported models:
* Additional mistral function-calling models are compatible as well.

Known issues: -1.
Mistral 7B struggles to generate parallel tool calls correctly. +2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is +much shorter than what vLLM generates. Since an exception is thrown when this condition is not met, the following additional chat templates are provided: * `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that @@ -291,11 +291,11 @@ Supported models: * `meta-llama/Meta-Llama-3.1-405B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported. +1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -341,6 +341,34 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` +#### Models with Pythonic Tool Calls (`pythonic`) + +A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. + +As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: +```python +[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] +``` + +Limitations: +* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +* Llama's smaller models struggle to use tools effectively. + +Example supported models: +* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) + +Flags: `--tool-call-parser pythonic --chat-template {see_above}` + +--- +**WARNING** +Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary. + +--- + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. 
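To make the shape of such a plugin concrete, here is a minimal sketch. The `ToolParserManager` registration decorator and the `ToolParser` base class are vLLM's real extension points, but the class name `ExampleToolParser`, the registered name `example`, and the no-op parsing logic are illustrative only; the hook signatures below are a best-effort approximation, so consult the `ToolParser` base class for the authoritative interface.

```python
# example_tool_parser.py -- an illustrative, minimal tool parser plugin.
from typing import Sequence, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
                                              ExtractedToolCallInformation)
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager


# names registered here become valid values for --tool-call-parser
@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):

    # non-streaming path: parse the complete model output once
    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        # a real parser would detect and extract tool calls here
        return ExtractedToolCallInformation(tools_called=False,
                                            tool_calls=[],
                                            content=model_output)

    # streaming path: called for each new chunk of generated text
    def extract_tool_calls_streaming(
            self, previous_text: str, current_text: str, delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int], delta_token_ids: Sequence[int],
            request: ChatCompletionRequest) -> Union[DeltaMessage, None]:
        # a real parser would emit tool-call deltas; this one streams text as-is
        return DeltaMessage(content=delta_text)
```

Assuming a file like the one above, the server could then be started with something like `vllm serve <model> --enable-auto-tool-choice --tool-parser-plugin ./example_tool_parser.py --tool-call-parser example --chat-template <template>.jinja`.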
diff --git a/docs/source/serving/runai_model_streamer.rst b/docs/source/serving/runai_model_streamer.rst
index 8cee969260770..0c8a5434519f1 100644
--- a/docs/source/serving/runai_model_streamer.rst
+++ b/docs/source/serving/runai_model_streamer.rst
@@ -13,6 +13,12 @@ To run it as an OpenAI-compatible server, add the `--load-format runai_streamer`

    $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer

+.. code-block:: console
+
+    $ vllm serve s3://my-bucket/path-to-model --load-format runai_streamer
+
+
Tunable parameters
------------------
diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst
new file mode 100644
index 0000000000000..8ef96c4e54369
--- /dev/null
+++ b/docs/source/serving/serving_with_llamastack.rst
@@ -0,0 +1,42 @@
+.. _run_on_llamastack:
+
+Serving with Llama Stack
+============================
+
+vLLM is also available via `Llama Stack `_.
+
+To install Llama Stack, run:
+
+.. code-block:: console
+
+    $ pip install llama-stack -q
+
+Inference using OpenAI Compatible API
+-------------------------------------
+
+Then start the Llama Stack server, pointing it to your vLLM server, with the following configuration:
+
+.. code-block:: yaml
+
+    inference:
+      - provider_id: vllm0
+        provider_type: remote::vllm
+        config:
+          url: http://127.0.0.1:8000
+
+Please refer to `this guide `_ for more details on this remote vLLM provider.
+
+Inference via Embedded vLLM
+---------------------------
+
+An `inline vLLM provider
+`_
+is also available. Here is a sample configuration using that method:
+
+.. code-block:: yaml
+
+    inference:
+      - provider_type: vllm
+        config:
+          model: Llama3.1-8B-Instruct
+          tensor_parallel_size: 4
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py
index effb588e1387f..fff82020d9a30 100644
--- a/examples/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/openai_chat_embedding_client_for_multimodal.py
@@ -1,33 +1,120 @@
+import argparse
+import base64
+import io
+
import requests
+from PIL import Image

image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"

-response = requests.post(
-    "http://localhost:8000/v1/embeddings",
-    json={
-        "model":
-        "TIGER-Lab/VLM2Vec-Full",
-        "messages": [{
+
+def vlm2vec():
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model":
+            "TIGER-Lab/VLM2Vec-Full",
+            "messages": [{
+                "role":
+                "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": image_url
+                        }
+                    },
+                    {
+                        "type": "text",
+                        "text": "Represent the given image."
+                    },
+                ],
+            }],
+            "encoding_format":
+            "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+
+    print("Embedding output:", response_json["data"][0]["embedding"])
+
+
+def dse_qwen2_vl(inp: dict):
+    # Embedding an Image
+    if inp["dtype"] == "image":
+        messages = [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image_url",
+                "image_url": {
+                    "url": inp["image_url"],
+                }
+            }, {
+                "type": "text",
+                "text": "What is shown in this image?"
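+                # ^ fixed text prompt sent together with the image being embedded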
+            }]
+        }]
+    # Embedding a Text Query
+    else:
+        # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
+        # of the minimum input size
+        buffer = io.BytesIO()
+        image_placeholder = Image.new("RGB", (56, 56))
+        image_placeholder.save(buffer, "png")
+        buffer.seek(0)
+        image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
+        messages = [{
            "role":
            "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
-                        "url": image_url
+                        "url": f"data:image/png;base64,{image_placeholder}",
                    }
                },
                {
                    "type": "text",
-                    "text": "Represent the given image."
+                    "text": f"Query: {inp['content']}"
                },
-            ],
-        }],
-        "encoding_format":
-        "float",
-    },
-)
-response.raise_for_status()
-response_json = response.json()
-
-print("Embedding output:", response_json["data"][0]["embedding"])
+            ]
+        }]
+
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model": "MrLight/dse-qwen2-2b-mrl-v1",
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+
+    print("Embedding output:", response_json["data"][0]["embedding"])
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Script to call a specified VLM through the API. Make "
+        "sure to serve the model with --task embedding before running this.")
+    parser.add_argument("model",
+                        type=str,
+                        choices=["vlm2vec", "dse_qwen2_vl"],
+                        help="Which model to call.")
+    args = parser.parse_args()
+
+    if args.model == "vlm2vec":
+        vlm2vec()
+    elif args.model == "dse_qwen2_vl":
+        dse_qwen2_vl({
+            "dtype": "image",
+            "image_url": image_url,
+        })
+        dse_qwen2_vl({
+            "dtype": "text",
+            "content": "What is the weather like today?",
+        })
diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json
index d1389f5392c8c..f76a61bb5eec3 100644
--- a/examples/production_monitoring/grafana.json
+++ b/examples/production_monitoring/grafana.json
@@ -1,33 +1,4 @@
{
-  "__inputs": [
-  ],
-  "__elements": {},
-  "__requires": [
-    {
-      "type": "grafana",
-      "id": "grafana",
-      "name": "Grafana",
-      "version": "10.4.2"
-    },
-    {
-      "type": "panel",
-      "id": "heatmap",
-      "name": "Heatmap",
-      "version": ""
-    },
-    {
-      "type": "datasource",
-      "id": "prometheus",
-      "name": "Prometheus",
-      "version": "1.0.0"
-    },
-    {
-      "type": "panel",
-      "id": "timeseries",
-      "name": "Time series",
-      "version": ""
-    }
-  ],
  "annotations": {
    "list": [
      {
@@ -54,7 +25,7 @@
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
-  "id": null,
+  "id": 1,
  "links": [],
  "liveNow": false,
  "panels": [
@@ -76,6 +47,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
@@ -241,6 +213,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
@@ -358,6 +331,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
@@ -523,6 +497,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
@@ -658,6 +633,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
        "fillOpacity": 0,
        "gradientMode": "none",
@@ -823,6 +799,7 @@
        "axisLabel": "",
        "axisPlacement": "auto",
        "barAlignment": 0,
+        "barWidthFactor": 0.6,
        "drawStyle": "line",
"fillOpacity": 0, "gradientMode": "none", @@ -984,7 +961,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1076,7 +1053,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1117,6 +1094,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -1147,8 +1125,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1199,6 +1176,319 @@ ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": 
true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" } ], "refresh": "", @@ -1207,21 +1497,34 @@ "templating": { "list": [ { - "type": "datasource", - "name": "DS_PROMETHEUS", - "label": "datasource", - "current": {}, + "current": { + "selected": false, + "text": "prometheus", + "value": "edx8memhpd9tsa" + }, "hide": 0, "includeAll": false, + "label": "datasource", "multi": false, + "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false + "skipUrlSync": false, + "type": "datasource" }, { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, "definition": "label_values(model_name)", "hide": 0, "includeAll": false, @@ -1249,7 +1552,6 @@ "timezone": "", "title": "vLLM", "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", - 
"version": 1, + "version": 8, "weekStart": "" } - diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 0000000000000..e7b93fae31770 --- /dev/null +++ b/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/examples/tool_chat_template_llama3.2_pythonic.jinja b/examples/tool_chat_template_llama3.2_pythonic.jinja new file mode 100644 index 0000000000000..8c38de6c6a907 --- /dev/null +++ b/examples/tool_chat_template_llama3.2_pythonic.jinja @@ -0,0 +1,98 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call functions, please respond with a python list of the calls. 
" }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a python list for function calls " }} + {{- "with their proper arguments to best answer the given prompt.\n\n" }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_toolace.jinja b/examples/tool_chat_template_toolace.jinja new file mode 100644 index 0000000000000..a9b3b7189dddf --- /dev/null +++ b/examples/tool_chat_template_toolace.jinja @@ -0,0 +1,65 @@ +{{- bos_token }} + +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language." 
%} +{%- endif %} + +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You are an expert in composing functions. You are given a question and a set of possible functions. Based on the question, you will need to make one or more function/tool calls to achieve the purpose.\n" }} + {{- "If none of the function can be used, point it out. If the given question lacks the parameters required by the function, also point it out.\n" }} + {{- "You should only return the function call in tools call sections.\n\n" }} + {{- "If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n" }} + {{- "You SHOULD NOT include any other text in the response.\n" }} + {{- "Here is a list of functions in JSON format that you can invoke.\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "\n" }} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} + +{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} diff --git a/format.sh b/format.sh index d06ee62351a21..a57882d2ac3f9 100755 --- a/format.sh +++ b/format.sh @@ -44,18 +44,19 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { - if [[ "$2" != "$3" ]]; then - echo "❓❓Wrong $1 version installed: $3 is required, not $2." + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2." 
exit 1 fi } -tool_version_check "yapf" "$YAPF_VERSION" "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" YAPF_FLAGS=( '--recursive' diff --git a/pyproject.toml b/pyproject.toml index 3be401daa44c7..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ - "cmake>=3.26,<=3.30", + "cmake>=3.26", "ninja", "packaging", "setuptools>=61", diff --git a/python_only_dev.py b/python_only_dev.py index 4ab203bb6f9d6..1ca0f5c30b741 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -69,7 +69,8 @@ current_vllm_path = os.path.join(cwd, "vllm") print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - os.rename(pre_built_vllm_path, tmp_path) + shutil.copytree(pre_built_vllm_path, tmp_path) + shutil.rmtree(pre_built_vllm_path) print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") os.symlink(current_vllm_path, pre_built_vllm_path) diff --git a/requirements-build.txt b/requirements-build.txt index 64b92861df25d..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools>=61 diff --git a/requirements-common.txt b/requirements-common.txt index 6bc15ead74415..3f5fa571ed4b3 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,5 +31,5 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors runai-model-streamer diff --git a/requirements-test.txt b/requirements-test.txt index fb322fcc72dc2..65695111e4dc5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,7 +14,6 @@ aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.10 # via - # -r requirements-test.in # datasets # fsspec # lm-eval @@ -40,15 +39,15 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa -awscli==1.35.19 +awscli==1.35.23 # via -r requirements-test.in bitsandbytes==0.44.1 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator -boto3==1.35.53 +boto3==1.35.57 # via tensorizer -botocore==1.35.53 +botocore==1.35.57 # via # awscli # boto3 @@ -82,7 +81,7 @@ cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib -datamodel-code-generator==0.26.2 +datamodel-code-generator==0.26.3 # via -r requirements-test.in dataproperty==1.0.1 # via @@ -263,7 +262,6 @@ numpy==1.26.4 # mistral-common # numba # numexpr - # opencv-python # opencv-python-headless # pandas # peft @@ -307,8 +305,6 @@ nvidia-nvjitlink-cu12==12.4.127 # torch nvidia-nvtx-cu12==12.4.127 # via torch -opencv-python==4.10.0.84 - # via -r requirements-test.in opencv-python-headless==4.10.0.84 # via mistral-common packaging==24.1 @@ -440,7 +436,6 @@ regex==2024.9.11 # transformers requests==2.32.3 # via - # -r requirements-test.in # buildkite-test-collector # datasets # evaluate @@ -521,7 +516,7 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements-test.in -tokenizers==0.20.1 +tokenizers==0.20.3 # via transformers toml==0.10.2 # via datamodel-code-generator diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 94a3225dcf479..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for TPU -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 479cb4bb18484..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt ray >= 2.9 -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json index 03d077b76f627..798a34e8dd92d 100644 --- a/tests/compile/piecewise/piecewise_compilation_config.json +++ b/tests/compile/piecewise/piecewise_compilation_config.json @@ -1,4 +1,5 @@ { "use_cudagraph": true, - "non_cudagraph_ops": ["silly.attention"] + "non_cudagraph_ops": ["silly.attention"], + "cudagraph_copy_inputs": true } \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index d151d62516b07..c631850ecdedb 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -12,10 +12,9 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.utils import direct_register_custom_op -os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) - global_counter = 0 # create a library to hold the custom op @@ -48,7 +47,11 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, @support_torch_compile class SillyModel(nn.Module): - def 
__init__(self) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -74,13 +77,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_simple_piecewise_compile(): - model = SillyModel() - directory = os.path.dirname(__file__) config = os.path.join(directory, "piecewise_compilation_config.json") os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + + model = SillyModel(vllm_config=VllmConfig(), prefix='') - input_buffer = torch.randn(100).cuda() + inputs = torch.randn(100).cuda() with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -92,15 +96,15 @@ def test_simple_piecewise_compile(): ): with set_compile_context([1, 2]): - model(input_buffer) + model(inputs) - model(input_buffer[:2]) - model(input_buffer[:1]) + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) - input_buffer[:2].zero_() + input = torch.zeros(2).cuda() global global_counter global_counter = 0 - output = model(input_buffer[:2]) + output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 73fa9e9906936..c363a587a818e 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -19,6 +19,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.plugins import set_compilation_config from vllm.utils import direct_register_custom_op @@ -195,9 +196,15 @@ def forward( return hidden_states, residual +@support_torch_compile class LlamaModel(nn.Module): - def __init__(self, config: LlamaConfig) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + config: LlamaConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() self.embedding_tokens = nn.Embedding( num_embeddings=config.vocab_size, @@ -265,10 +272,9 @@ def run_model(llama_config, CompilationLevel.NO_COMPILATION) set_compilation_config(None) - cls = LlamaModel - if use_compile: - cls = support_torch_compile(LlamaModel) - model = cls(llama_config).eval().cuda() + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda() B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -357,7 +363,6 @@ def test_toy_llama(): def benchmark(): os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) from triton.testing import do_bench - cls = support_torch_compile(LlamaModel) # similar to llama 3.1-8B llama_config = LlamaConfig(hidden_size=4096, @@ -390,7 +395,9 @@ def benchmark(): else: set_compilation_config(None) - model = cls(llama_config).eval().cuda().to(torch.bfloat16) + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda().to(torch.bfloat16) B = 256 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() diff --git a/tests/conftest.py b/tests/conftest.py index 6cf791dc62ce5..0dc1cc6e83c18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, 
(bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 1489a60891761..5d566f8308b70 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -32,6 +32,8 @@ class PPTestOptions(NamedTuple): multi_node_only: bool trust_remote_code: bool tokenizer_mode: Optional[str] + load_format: Optional[str] = None + hf_overrides: Optional[str] = None @dataclass @@ -50,6 +52,8 @@ def detailed( task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -78,7 +82,9 @@ def detailed( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) @staticmethod @@ -90,6 +96,8 @@ def fast( multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -102,7 +110,9 @@ def fast( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) def iter_params(self, model_name: str): @@ -161,9 +171,8 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), @@ -214,9 +223,9 @@ def iter_params(self, model_name: str): # NOTE: You can update this on your local machine to run specific tests TEST_MODELS = [ # [LANGUAGE GENERATION] + "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", - "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -238,7 +247,8 @@ def _compare_tp( method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup - multi_node_only, trust_remote_code, tokenizer_mode = test_options + multi_node_only, trust_remote_code, tokenizer_mode, \ + load_format, hf_overrides = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -267,6 +277,10 @@ def _compare_tp( common_args.append("--trust-remote-code") if tokenizer_mode: common_args.extend(["--tokenizer-mode", 
tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", hf_overrides]) if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 and chunked_prefill): diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 3c7facc12c59a..50444d3abfaf2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,11 +1,13 @@ +import socket + import pytest import ray import torch -import torch.distributed as dist import vllm.envs as envs -from vllm.distributed.utils import stateless_init_process_group -from vllm.utils import (cuda_device_count_stateless, +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.utils import StatelessProcessGroup +from vllm.utils import (cuda_device_count_stateless, get_open_port, update_environment_variables) from ..utils import multi_gpu_test @@ -40,43 +42,50 @@ def test_cuda_device_count_stateless(): assert ray.get(actor.get_count.remote()) == 0 -def cpu_worker(rank, WORLD_SIZE): - pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29500", +def cpu_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, - world_size=WORLD_SIZE, - backend="gloo") + world_size=WORLD_SIZE) if rank <= 2: - pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29501", + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, rank=rank, - world_size=3, - backend="gloo") + world_size=3) data = torch.tensor([rank]) - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + data = pg1.broadcast_obj(data, src=2) + assert data.item() == 2 if rank <= 2: - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) - item = data[0].item() - print(f"rank: {rank}, item: {item}") - if rank == 3: - assert item == 6 - else: - assert item == 18 + data = torch.tensor([rank + 1]) + data = pg2.broadcast_obj(data, src=2) + assert data.item() == 3 + pg2.barrier() + pg1.barrier() -def gpu_worker(rank, WORLD_SIZE): - pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29502", +def gpu_worker(rank, WORLD_SIZE, port1, port2): + torch.cuda.set_device(rank) + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, - world_size=WORLD_SIZE, - backend="nccl") + world_size=WORLD_SIZE) + pynccl1 = PyNcclCommunicator(pg1, device=rank) + pynccl1.disabled = False if rank <= 2: - pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29503", + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, rank=rank, - world_size=3, - backend="nccl") - torch.cuda.set_device(rank) + world_size=3) + pynccl2 = PyNcclCommunicator(pg2, device=rank) + pynccl2.disabled = False data = torch.tensor([rank]).cuda() - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + pynccl1.all_reduce(data) + pg1.barrier() + torch.cuda.synchronize() if rank <= 2: - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + pynccl2.all_reduce(data) + pg2.barrier() + torch.cuda.synchronize() item = data[0].item() print(f"rank: {rank}, item: {item}") if rank == 3: @@ -85,16 +94,45 @@ def gpu_worker(rank, WORLD_SIZE): assert item == 18 +def broadcast_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + if rank == 2: + pg1.broadcast_obj("secret", src=2) + else: + obj = pg1.broadcast_obj(None, src=2) + assert obj == "secret" + 
pg1.barrier() + + +def allgather_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + data = pg1.all_gather_obj(rank) + assert data == list(range(WORLD_SIZE)) + pg1.barrier() + + @multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("worker", [cpu_worker, gpu_worker]) -def test_stateless_init_process_group(worker): +@pytest.mark.parametrize( + "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) +def test_stateless_process_group(worker): + port1 = get_open_port() + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port1)) + port2 = get_open_port() WORLD_SIZE = 4 from multiprocessing import get_context ctx = get_context("fork") processes = [] for i in range(WORLD_SIZE): rank = i - processes.append(ctx.Process(target=worker, args=(rank, WORLD_SIZE))) + processes.append( + ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))) for p in processes: p.start() for p in processes: diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index f2d7e9fd78cf3..fa5d6a69a9bc8 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,7 +7,7 @@ import pytest from transformers import AutoModelForSeq2SeqLM -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs @@ -34,6 +34,13 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py new file mode 100644 index 0000000000000..6bf7190a656b8 --- /dev/null +++ b/tests/entrypoints/llm/test_accuracy.py @@ -0,0 +1,56 @@ +""" +This file tests the accuracy of the vLLM server via LMEval. +It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real-world usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. 
+""" + +import lm_eval +import pytest + +from vllm.platforms import current_platform + +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +EXPECTED_VALUE = 0.58 + + +def run_test(): + """Run the end to end accuracy test.""" + + model_args = f"pretrained={MODEL_NAME},max_model_len=2048" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 is currently only supported on CUDA.") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test() + + +def test_lm_eval_accuracy_v0_engine(monkeypatch): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test() diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index a16e95f94171e..b1d4461d164aa 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -37,11 +37,11 @@ MAX_WAIT_SECONDS = 600 -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy(more_args): +def run_test(more_args): + """Run the end to end accuracy test.""" + args = list(DEFAULT_ARGS) args.extend(more_args) - print(f"Running with: {args}") with RemoteOpenAIServer( @@ -64,3 +64,22 @@ def test_lm_eval_accuracy(more_args): assert (measured_value - RTOL < EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 currently only supported on CUDA") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test([]) + + +@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) +def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test(more_args) diff --git a/vllm/v1/tokenizer/__init__.py b/tests/entrypoints/openai/tool_parsers/__init__.py similarity index 100% rename from vllm/v1/tokenizer/__init__.py rename to tests/entrypoints/openai/tool_parsers/__init__.py diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py new file mode 100644 index 0000000000000..47b0b6bb80ffe --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -0,0 +1,160 @@ +from typing import List +from unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, run_tool_extraction_streaming) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 +SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + 
arguments='{"city": "San Francisco", "metric": "celsius"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ( + "register_user(name='John Doe', " + "age=37, " + "address={'city': 'San Francisco', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])") +MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "John Doe", ' + '"age": 37, ' + '"address": {"city": "San Francisco", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{}', +) +EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')") +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output = "How can I help you today?" + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content == model_output + assert len(tool_calls) == 0 + + +TEST_CASES = [ + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming"), + pytest.param(True, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming"), + pytest.param(False, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming"), + pytest.param(True, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming"), + pytest.param(False, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming"), + pytest.param(True, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming"), + pytest.param(False, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming"), + pytest.param(True, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming"), + pytest.param(False, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming"), + pytest.param(True, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming"), + pytest.param(False, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming"), + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_nonstreaming"), +] + + +@pytest.mark.parametrize("streaming, 
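Reviewer note: each fixture above pairs the Python-style call string the model emits with the JSON `arguments` the parser must produce. As a rough illustration of that mapping (not the parser's actual implementation), a single Python-literal call can be converted with `ast`:

```python
import ast
import json


def pythonic_call_to_json(source: str):
    """Map "f(a=1, b='x')" to ("f", '{"a": 1, "b": "x"}')."""
    call = ast.parse(source, mode="eval").body
    assert isinstance(call, ast.Call) and isinstance(call.func, ast.Name)
    # Keyword values are Python literals (str/int/bool/None/dict/list),
    # so ast.literal_eval can recover them safely.
    args = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    return call.func.id, json.dumps(args)


name, arguments = pythonic_call_to_json(
    "get_weather(city='San Francisco', metric='celsius')")
assert name == "get_weather"
assert arguments == '{"city": "San Francisco", "metric": "celsius"}'
```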
model_output, expected_tool_calls", + TEST_CASES) +def test_tool_call(streaming: bool, model_output: str, + expected_tool_calls: List[FunctionCall]): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content is None + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output_deltas = [ + "[get_weather(city='San", + " Francisco', metric='celsius'), " + f"{PARAMETERLESS_FUNCTION_OUTPUT}, " + f"{EMPTY_LIST_FUNCTION_OUTPUT}]", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + + assert reconstructor.other_content == "" + assert len(reconstructor.tool_calls) == 3 + assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py new file mode 100644 index 0000000000000..f0a2a32c16786 --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -0,0 +1,123 @@ +from typing import Iterable, List, Tuple, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers import ToolParser + + +class StreamingToolReconstructor: + + def __init__(self, assert_one_tool_per_delta: bool = True): + self.tool_calls: List[ToolCall] = [] + self.other_content: str = "" + self._assert_one_tool_per_delta = assert_one_tool_per_delta + + def append_delta(self, delta: DeltaMessage): + if delta.content is not None: + self.other_content += delta.content + else: + assert delta.tool_calls, ( + "Streaming results should have either content or tool calls " + "(or both)") + if self._assert_one_tool_per_delta: + # Note: This isn't strictly required by the API and may not be + # possible to adhere to depending on the token space and number of + # tokens per streamed response from the model, but it is required + # by tool_use tests, so we enforce it here by default also. + assert len(delta.tool_calls) < 2, ( + "Streaming should include only one tool call per update.") + for call_delta in delta.tool_calls: + assert call_delta.type == "function", ( + "Streaming tool calls should only emit function calls. Got " + f"{call_delta.type}") + current_tool_call = self.tool_calls[ + call_delta.index] if call_delta.index < len( + self.tool_calls) else None + if current_tool_call: + assert (not call_delta.function.name), ( + "Streaming tool calls should emit the full function name " + f"exactly once. Got {call_delta.function.name}") + assert (not call_delta.id), ( + "Streaming tool calls must emit function id only once. Got " + f"{call_delta.id}") + assert (call_delta.index == len(self.tool_calls) - 1), ( + f"Incorrect index for tool delta. 
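Reviewer note: the assertions above pin down the streaming contract: the first delta for a call introduces its `id` and function name at index `len(tool_calls)`, and every later delta may only append argument text at index `len(tool_calls) - 1`. A pure-Python sketch of that accumulation rule, with hypothetical `(index, id, name, fragment)` tuples standing in for the real delta objects:

```python
def accumulate(tool_calls: list, delta: tuple) -> None:
    index, call_id, name, fragment = delta
    if index == len(tool_calls):
        # First delta of a new call must introduce id and name.
        assert call_id is not None and name is not None
        tool_calls.append({"id": call_id, "name": name, "arguments": fragment})
    else:
        # Later deltas only extend the arguments of the newest call.
        assert index == len(tool_calls) - 1
        tool_calls[index]["arguments"] += fragment


calls: list = []
accumulate(calls, (0, "call_1", "get_weather", '{"city": "San'))
accumulate(calls, (0, None, None, ' Francisco"}'))
assert calls[0]["arguments"] == '{"city": "San Francisco"}'
```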
Got {call_delta.index}, " + f"expected {len(self.tool_calls) - 1}") + current_tool_call.function.arguments += ( + call_delta.function.arguments) + else: + assert call_delta.id is not None, ( + "Streaming tool calls must have an id on first appearance") + assert call_delta.function.name is not None, ( + "Streaming tool calls must have a function name on first " + "appearance") + assert call_delta.index == len(self.tool_calls), ( + f"Incorrect index for tool delta. Got {call_delta.index}, " + f"expected {len(self.tool_calls)}") + self.tool_calls.append( + ToolCall(id=call_delta.id, + function=FunctionCall( + name=call_delta.function.name, + arguments=call_delta.function.arguments + or ""))) + + +def run_tool_extraction( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, + assert_one_tool_per_delta: bool = True, +) -> Tuple[Union[str, None], List[ToolCall]]: + if streaming: + reconstructor = run_tool_extraction_streaming( + tool_parser, + model_output, + request, + assert_one_tool_per_delta=assert_one_tool_per_delta) + return reconstructor.other_content or None, reconstructor.tool_calls + else: + extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, + request) + assert extracted.tools_called == bool(extracted.tool_calls) + return extracted.content, extracted.tool_calls + + +def run_tool_extraction_nonstreaming( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None +) -> ExtractedToolCallInformation: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return tool_parser.extract_tool_calls(model_output, request) + + +def run_tool_extraction_streaming( + tool_parser: ToolParser, + model_deltas: Iterable[str], + request: Union[ChatCompletionRequest, None] = None, + assert_one_tool_per_delta: bool = True, +) -> StreamingToolReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingToolReconstructor( + assert_one_tool_per_delta=assert_one_tool_per_delta) + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + tool_parser.vocab.get(token) + for token in tool_parser.model_tokenizer.tokenize(delta) + if token in tool_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = tool_parser.extract_tool_calls_streaming( + previous_text, current_text, delta, previous_tokens, + current_tokens, token_delta, request) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 816d3986fe333..29ecf37808205 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,6 +152,11 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) +@pytest.fixture(scope="session") +def lora_bias_files(): + return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index eb882faf3974a..15e576cb065c7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -51,6 +51,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() 
== 1 else 2) ] + # We will launch different triton kernels between the prefill and decode # stages, so we need to verify this. prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -120,11 +121,12 @@ def populate_loras( subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): - sublora = DummyLoRAManager().init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) + sublora = DummyLoRAManager( + layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) sublora.lora_b = sublora.lora_b[:, (sublora_len * i):(sublora_len * (i + 1))] sublora.optimize() @@ -152,6 +154,7 @@ def create_random_inputs( input_size: Tuple[int, ...], input_range: Tuple[float, float], input_type: torch.dtype = torch.int, + device: torch.device = "cuda" ) -> Tuple[List[torch.Tensor], List[int], List[int]]: """Creates random inputs. @@ -173,10 +176,14 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), high=int(high), size=input_size)) + torch.randint(low=int(low), + high=int(high), + size=input_size, + device=device)) else: inputs.append( - torch.rand(size=input_size, dtype=input_type) * high + low) + torch.rand(size=input_size, dtype=input_type, device=device) * + high + low) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -191,6 +198,10 @@ def create_random_inputs( @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA + # device, see: https://github.com/triton-lang/triton/issues/2925 + # Same below. 
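Reviewer note: the workaround referenced in the comment above is two lines. Triton resolves the launch device from the current CUDA context, so `torch.set_default_device` alone is not enough once tests parametrize over `cuda:1`. A sketch of the idiom these tests now apply:

```python
import torch

device = "cuda:1"  # any parametrized device
# Pin the CUDA context first so Triton kernels launch on the intended
# GPU (see triton-lang/triton#2925), then route tensor allocations.
torch.cuda.set_device(device)
torch.set_default_device(device)
```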
+ torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -225,7 +236,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -263,7 +274,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -291,6 +302,7 @@ def create_random_embedding_layer(): def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -345,7 +357,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -400,7 +412,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) original_inputs = deepcopy(inputs) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, @@ -426,6 +438,7 @@ def create_random_embedding_layer(): def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -471,7 +484,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -520,7 +533,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -554,6 +567,7 @@ def _pretest(): @pytest.mark.parametrize("stage", STAGES) def test_linear_replicated(dist_init, num_loras, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -592,7 +606,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -631,7 +645,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -658,6 +672,7 @@ def create_random_linear_replicated_layer(): def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -706,7 +721,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -745,7 +760,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -772,6 +787,7 @@ def create_random_linear_parallel_layer(): def 
test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -842,7 +858,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -883,7 +899,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -962,7 +978,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_size=(1, max_position), input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py new file mode 100644 index 0000000000000..c2520c847d873 --- /dev/null +++ b/tests/lora/test_lora_bias_e2e.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ibm-granite/granite-3b-code-base" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + generated_texts: List[str] = [] + for output in outputs: + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + return generated_texts + + +@pytest.mark.parametrize("lora_bias", [True]) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_lora_rank=8, + max_loras=1, + enable_lora_bias=lora_bias, + tensor_parallel_size=1, + fully_sharded_loras=fully_sharded) + + print("lora adapter created") + output1 = do_sample(llm, lora_bias_files, lora_id=0) + + print("lora") + output2 = do_sample(llm, lora_bias_files, lora_id=1) + + if lora_bias: + assert output1 != output2 + else: + assert output1 == output2 diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 67cf298b4df2b..8d109b2c81503 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -25,8 +25,13 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -def test_from_lora_tensors(sql_lora_files): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_from_lora_tensors(sql_lora_files, device): tensors = 
load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( @@ -36,7 +41,7 @@ def test_from_lora_tensors(sql_lora_files): 8, 16, tensors, - "cuda", + device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) @@ -46,6 +51,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.lora_alpha == 16 assert lora.lora_a is not None assert lora.lora_b is not None + assert lora.lora_a.device == torch.device(device) + assert lora.lora_b.device == torch.device(device) assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" assert lora.lora_a.shape[1] == 8 @@ -60,8 +67,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: +def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], + device: torch.device) -> LoRAModel: loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -69,8 +76,8 @@ def create_lora(lora_id: int, model: nn.Module, name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0]], device="cuda"), + torch.rand([w.shape[1], 8], device=device), + torch.rand([8, w.shape[0]], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -80,6 +87,7 @@ def create_packed_lora( model: nn.Module, module_name, replaced_module_names, + device: torch.device, empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight @@ -91,9 +99,9 @@ def create_packed_lora( replaced_module_name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), + torch.rand([w.shape[1], 8], device=device), torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="cuda"), + device=device), ) return LoRAModel(lora_id, 8, loras) @@ -104,7 +112,8 @@ def test_replace_submodules(dist_init, dummy_model): model.packed_modules_mapping = {} manager = LoRAModelManager( model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), + torch.device("cuda")) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -116,16 +125,28 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -def test_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -161,17 
+182,32 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 + assert manager.device == device + assert manager.punica_wrapper.device == device -def test_lora_lru_cache_model_manager(dist_init, dummy_model): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -238,20 +274,37 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): with pytest.raises(ValueError): assert manager.pin_adapter(3) + assert manager.punica_wrapper.device == device + assert manager.device == device + -def test_lru_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora4 = create_lora(4, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -351,14 +404,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} + assert manager.punica_wrapper.device == device + assert manager.device == device +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = 
LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -426,14 +482,19 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): # Should remove every LoRA not specified in the request. lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -497,8 +558,13 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + -def test_packed_loras(dist_init, dummy_model_gate_up): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] model.packed_modules_mapping = { @@ -511,18 +577,25 @@ def test_packed_loras(dist_init, dummy_model_gate_up): 1, model, module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) + replaced_module_names=["gate_proj", "up_proj"], + device=device) model_lora1 = create_packed_lora( 2, model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], + device=device, empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index e756544d96e98..66b5f82bbb97d 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -4,8 +4,6 @@ whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. 
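Reviewer note: as a reminder of what the ops under test compute, `bgmv_shrink` is a batched gather-then-GEMV: each token selects its LoRA A matrix through `indices` and is projected from the hidden size down to the LoRA rank, then scaled. A pure-PyTorch reference sketch with shapes assumed from the tests' data generators; the tests themselves compare against `ref_torch_groupgemm`:

```python
import torch


def ref_bgmv_shrink(inputs: torch.Tensor, lora_a: torch.Tensor,
                    indices: torch.Tensor, scaling: float) -> torch.Tensor:
    # inputs:  [num_tokens, hidden_size]
    # lora_a:  [num_loras, rank, hidden_size]
    # indices: [num_tokens], the LoRA id each token uses
    return scaling * torch.einsum("trh,th->tr", lora_a[indices], inputs)


out = ref_bgmv_shrink(torch.randn(4, 16), torch.randn(2, 8, 16),
                      torch.tensor([0, 1, 0, 1]), scaling=0.5)
assert out.shape == (4, 8)  # one rank-sized vector per token
```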
""" -from unittest.mock import patch - import pytest import torch @@ -16,7 +14,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -235,9 +232,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -262,33 +256,21 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -324,7 +306,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -374,22 +355,16 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index dc0edeb10ef46..52b82f25d23e1 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -3,8 +3,6 @@ under different conditions, including various batches, numbers of LoRA , and maximum ranks. 
""" -from unittest.mock import patch - import pytest import torch @@ -15,7 +13,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -150,8 +147,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -177,33 +172,22 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -239,8 +223,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -289,22 +271,15 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. 
- with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index db02bacdb6439..85110b8fa8cd2 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -12,36 +12,40 @@ def test_parse_fine_tuned_lora_name_valid(): fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), + ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False), + ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False), ( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", True, + False, ), ( "base_model.model.model.embed_tokens.lora_embedding_B", "model.embed_tokens", False, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", "model.layers.9.mlp.down_proj", True, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", "model.layers.9.mlp.down_proj", False, + False, ), } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) + for name, module_name, is_lora_a, is_bias in fixture: + assert (module_name, is_lora_a, + is_bias) == parse_fine_tuned_lora_name(name) def test_parse_fine_tuned_lora_name_invalid(): fixture = { - "weight", "base_model.weight", "base_model.model.weight", } diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 00f8e26d1041f..e394c33b3f9ea 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -7,9 +7,10 @@ class DummyLoRAManager: - def __init__(self): + def __init__(self, device: torch.device = "cuda:0"): super().__init__() self._loras: Dict[str, LoRALayerWeights] = {} + self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=self._device), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=self._device), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=self._device) self.set_module_lora(module_name, lora) return lora diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index e6ed87fc8ea08..163220c91a27d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new 
file mode 100644 index 0000000000000..718c675b86fb4 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,428 @@ +from typing import Any, List, Optional, Tuple, Type, TypedDict, Union + +import numpy.typing as npt +import pytest +import torch +from PIL import Image + +from vllm.entrypoints.llm import LLM +from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, + sample_frames_from_video) + +from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, + PromptVideoInput, VllmRunner) +from ...utils import check_logprobs_close + +models = ["Qwen/Qwen2-VL-2B-Instruct"] +target_dtype = "half" + +IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" +VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" + + +def qwen2_vl_chat_template(*query): + return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), +}) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ + "sample_demo_1": + qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), +}) + +MULTIIMAGE_PROMPT = qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + IMAGE_PLACEHOLDER, + "Describe these two images separately. ", + "For each image, reply with a short sentence ", + "(no more than 10 words).", +) + + +class Qwen2VLPromptImageEmbeddingInput(TypedDict): + image_embeds: torch.Tensor + image_grid_thw: torch.Tensor + + +class Qwen2VLPromptVideoEmbeddingInput(TypedDict): + video_embeds: torch.Tensor + video_grid_thw: torch.Tensor + + +def batch_make_image_embeddings( + image_batches: List[Union[Image.Image, List[Image.Image]]], processor, + llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + """Compute batched image embeddings for Qwen2-VL. + + This will infer all images' embeddings in a single batch, + and split the result according to input batches. + + image_batches: + - Single-image batches: `List[Image.Image]` + - Multiple-image batches: `List[List[Image.Image]]` + + returns: `List[Qwen2VLPromptImageEmbeddingInput]` + """ + + image_batches_: List[Any] = image_batches[:] + + # convert single-image batches to multiple-image batches + for idx in range(len(image_batches_)): + if not isinstance(image_batches_[idx], list): + image_batches_[idx] = [image_batches_[idx]] + + assert isinstance(image_batches_[idx], list) + + # append all images into a list (as a batch) + images: List[Image.Image] = [] + for image_batch in image_batches_: + images += image_batch + + # image to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=images, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values"] + image_grid_thw = preprocess_result["image_grid_thw"] + + # pixel values to embeddings & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker. 
\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + image_embeds = visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptImageEmbeddingInput] = [] + image_counter = 0 + embed_counter = 0 + for image_batch in image_batches_: + cur_batch_image_count = len(image_batch) + merge_size = image_processor.merge_size + # each grid_thw contributes t*h*w patches, merged merge_size**2 + # at a time + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in image_grid_thw[image_counter:image_counter + + cur_batch_image_count] + ]) + + result.append({ + "image_embeds": + image_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "image_grid_thw": + image_grid_thw[image_counter:image_counter + + cur_batch_image_count], + }) + + embed_counter += cur_batch_embed_len + image_counter += cur_batch_image_count + + # ensure we don't lose any images or embeddings + assert embed_counter == image_embeds.size(0) + assert image_counter == image_grid_thw.size(0) + assert len(image_batches) == len(result) + + return result + + +def batch_make_video_embeddings( + video_batches: PromptVideoInput, processor, + llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + """Compute batched video embeddings for Qwen2-VL. + + An NDArray holds all frames of a single video. + + This will infer all videos' embeddings in a single batch, + and split the result according to input batches. + + video_batches: + - Single-video batches: `List[NDArray]` + - Multiple-video batches: `List[List[NDArray]]` + """ + + video_batches_: List[Any] = video_batches[:] + + for idx in range(len(video_batches_)): + if not isinstance(video_batches_[idx], list): + single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + video_batches_[idx] = single_video_batch + + assert isinstance(video_batches_[idx], list) + + # append all videos into a list (as a batch) + videos: List[npt.NDArray] = [] + for video_batch in video_batches_: + videos += video_batch + + # video to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=None, videos=videos, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values_videos"] + video_grid_thw = preprocess_result["video_grid_thw"] + + # pixel values to embeddings & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker.\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + video_embeds = visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + video_counter = 0 + embed_counter = 0 + for video_batch in video_batches_: + cur_batch_video_count = len(video_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in video_grid_thw[video_counter:video_counter + + cur_batch_video_count] + ]) + + result.append({ + "video_embeds": + video_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "video_grid_thw": + video_grid_thw[video_counter:video_counter + + cur_batch_video_count], + }) + + embed_counter += cur_batch_embed_len + video_counter += cur_batch_video_count + + # ensure we don't lose any videos or 
embeddings + assert embed_counter == video_embeds.size(0) + assert video_counter == video_grid_thw.size(0) + assert len(video_batches) == len(result) + + return result + + +def run_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between + original image/video input and image/video embeddings input. + """ + from transformers import AutoProcessor # noqa: F401 + + processor = AutoProcessor.from_pretrained(model) + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case_for_original_input = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + outputs_per_case_for_embeddings_input = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=batch_make_image_embeddings( + images, processor, vllm_model.model) if images else None, + videos=batch_make_video_embeddings( + videos, processor, vllm_model.model) if videos else None) + for prompts, images, videos in inputs + ] + + for outputs_for_original_input, \ + outputs_for_embeddings_input \ + in zip(outputs_per_case_for_original_input, + outputs_per_case_for_embeddings_input): + check_logprobs_close( + outputs_0_lst=outputs_for_original_input, + outputs_1_lst=outputs_for_embeddings_input, + name_0="original_input", + name_1="embeddings_input", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + [], + ) for image, prompt in zip(images, IMAGE_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + [], + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, + model, size_factors, + dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images 
= [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [( + [MULTIIMAGE_PROMPT for _ in size_factors], + [[ + rescale_image_size(image, factor) + for image in images + ] for factor in size_factors], + [], + )] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.25, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + num_frames = 4 + sampled_vids = [ + sample_frames_from_video(asset.np_ndarrays, num_frames) + for asset in video_assets + ] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [], + [rescale_video_size(video, factor) for factor in size_factors], + ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index d8ca6d361f0e3..40ee49cf60742 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -21,14 +21,14 @@ def test_classification_models( model: str, dtype: str, ) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.classify(example_prompts) + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSequenceClassification) as hf_model: hf_outputs = hf_model.classify(example_prompts) - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.classify(example_prompts) - print(hf_outputs, vllm_outputs) # check logits difference diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 39b6bbaf43180..cd920aec6502e 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,6 +4,8 @@ """ import pytest +from vllm.utils import current_platform + from ..utils import check_embeddings_close # Model, Guard @@ -21,15 +23,14 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models( - monkeypatch, hf_runner, vllm_runner, example_prompts, model, dtype: str, ) -> None: - if model in ENCODER_ONLY: - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + if model not in ENCODER_ONLY and current_platform.is_cpu(): + pytest.skip("Skip large embedding models test on CPU.") # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py new file mode 100644 index 0000000000000..3dd8cb729f8a6 --- /dev/null +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -0,0 +1,209 @@ +from 
functools import partial
+from typing import Callable, Dict, List, Type
+
+import pytest
+import torch
+from PIL import Image
+from transformers import BatchEncoding, Qwen2VLForConditionalGeneration
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ..utils import check_embeddings_close
+
+HF_TEXT_PROMPTS = [
+    # T -> X
+    (
+        "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign",  # noqa: E501
+        Image.new("RGB", (56, 56))),
+    # T -> X
+    ("Query: Retrieve an image of this caption: cherry blossom",
+     Image.new("RGB", (56, 56))),
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "What is shown in this image?",
+    "cherry_blossom":
+    "What is shown in this image?"
+})
+
+MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
+
+
+def get_messages(image: Image.Image, text: str, embed_text: bool):
+    # NOTE: the returned messages must be wrapped in an outer list.
+    if embed_text:
+        messages = [{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "image",
+                    "image": Image.new("RGB", (56, 56)),
+                    "resized_height": 1,
+                    "resized_width": 1
+                },  # A dummy image is needed here to simplify processing.
+                {
+                    "type": "text",
+                    "text": text
+                },
+            ]
+        }]
+    else:
+        messages = [{
+            "role":
+            "user",
+            "content": [{
+                "type": "image",
+                "image": image
+            }, {
+                "type": "text",
+                "text": text
+            }]
+        }]
+    return messages
+
+
+def apply_chat_template_and_add_eos(
+    messages: List[Dict],
+    apply_chat_template_fn: Callable,
+):
+    prompt = apply_chat_template_fn(
+        messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
+    return prompt
+
+
+def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs):
+    return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs)
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    input_texts: List[str],
+    input_images: PromptImageInput,
+    embed_texts: List[bool],
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    """Compare vLLM and HF embeddings for the same text/image inputs."""
+    # NOTE: take care of the order: run vLLM first, then HF.
+    # vLLM needs a fresh new process without CUDA initialization;
+    # if HF runs first, CUDA gets initialized and that breaks the
+    # multiprocessing backend with the fork method (the default method).
+    with vllm_runner(model,
+                     task="embedding",
+                     dtype=dtype,
+                     enforce_eager=True,
+                     max_model_len=8192) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        texts = [
+            # This is necessary because vllm_model.encode will not apply any
+            # templating to the prompt, and therefore lacks an image_pad
+            # token unless one is inserted beforehand (the (56, 56) image
+            # above is converted to an image pad token by the chat template).
+            apply_chat_template_and_add_eos(
+                get_messages(image, text, False),
+                apply_chat_template_fn=tokenizer.apply_chat_template,
+            ) for text, image in zip(input_texts, input_images)
+            # vLLM will later replace the pad token with the actual image,
+            # which may be a placeholder image.
+ ] + vllm_outputs = vllm_model.encode(texts, images=input_images) + + hf_outputs = [] + with hf_runner(model, + dtype=dtype, + auto_cls=Qwen2VLForConditionalGeneration) as hf_model: + hf_model.postprocess_inputs = partial( + postprocess_inputs, + hf_model, + cache_position=torch.arange( + 0, + 1, # 1 for batch size + requires_grad=False), + use_cache=False) + for text, image, embed_text in zip(input_texts, input_images, + embed_texts): + # dse requires non-standard input processing + # because it needs an image_pad token + messages = get_messages(image, text, embed_text) + prompt = apply_chat_template_and_add_eos( + messages, hf_model.processor.apply_chat_template) + inputs = hf_model.get_inputs( + prompts=[[prompt]], + images=[[image]], + ) + with torch.no_grad(): + outputs = hf_model.model( + **hf_model.wrap_device(inputs[0], + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = torch.nn.functional.normalize( + outputs.hidden_states[-1][0, -1], p=2, dim=-1) + hf_outputs.append(pooled_output.tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, image_placeholder) + for text, image_placeholder in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [True] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + embed_texts, + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [False] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + embed_texts, + model, + dtype=dtype, + ) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 7f82347841cdb..a3b1c0950d9a2 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -4,6 +4,8 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs @@ -14,6 +16,8 @@ _LIMIT_IMAGE_PER_PROMPT = 3 +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is", @@ -221,6 +225,13 @@ def process(hf_inputs: BatchEncoding, **kwargs): ) +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test 
to run + + @large_gpu_test(min_gb=48) @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( @@ -244,20 +255,26 @@ def process(hf_inputs: BatchEncoding, **kwargs): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, - num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + num_logprobs, + attn_backend: _Backend) -> None: + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + sizes=sizes, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -265,9 +282,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: + model, dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -291,17 +309,20 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, cherry_blossom.resize((512, 1024)), ], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -309,8 +330,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs) -> None: + dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -325,14 +348,17 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, [stop_sign], [stop_sign, cherry_blossom], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + 
dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_inputs.py similarity index 97% rename from tests/multimodal/test_base.py rename to tests/multimodal/test_inputs.py index bfaf2cdeaa8d4..678bbb52b8c2f 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_inputs.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 4d3bbd805c152..e6c8793989e13 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -1,12 +1,12 @@ from array import array -from typing import Mapping +from typing import Callable, Dict, Mapping, Optional from unittest.mock import patch import pytest import torch from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, - InputRegistry, token_inputs) + InputRegistry, ProcessorInputs, token_inputs) from vllm.multimodal import MultiModalRegistry from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -34,10 +34,9 @@ def custom_processor(ctx: InputContext, inputs: DecoderOnlyInputs, *, num_crops=DEFAULT_NUM_CROPS): - # For testing purposes, we don't worry about the llm inputs / return - # type validation, and just return the value of the kwarg that we - # clobber. - return num_crops + # For testing purposes, we don't worry about the prompt + return token_inputs(prompt_token_ids=[], + mm_processor_kwargs={"num_crops": num_crops}) with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", return_value=custom_processor): @@ -109,6 +108,21 @@ def _get_num_crops_info(init_num_crops: int, inference_num_crops: int): return init_kwargs, inference_kwargs, expected_seq_count +def _get_processed_num_crops( + processor: Callable[[ProcessorInputs], ProcessorInputs], + inference_kwargs: Optional[Dict[str, int]], +) -> int: + processed_inputs = processor( + token_inputs(prompt_token_ids=[], + prompt="", + mm_processor_kwargs=inference_kwargs)) + + assert "type" in processed_inputs + assert processed_inputs["type"] == "token" + assert "mm_processor_kwargs" in processed_inputs + return processed_inputs["mm_processor_kwargs"]["num_crops"] + + @pytest.mark.parametrize("init_num_crops,inference_num_crops", [ (None, None), (NUM_CROPS_OVERRIDE, None), @@ -124,10 +138,8 @@ def test_input_processor_kwargs(use_processor_mock, init_num_crops, ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) processor = dummy_registry.create_input_processor(ctx.model_config) - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=inference_kwargs)) + num_crops_val = _get_processed_num_crops(processor, inference_kwargs) + assert num_crops_val == expected_seq_count @@ -153,10 +165,7 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, processor = dummy_registry.create_input_processor(ctx.model_config) # Should filter out the inference time kwargs - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=mm_processor_kwargs)) + num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs) assert num_crops_val == DEFAULT_NUM_CROPS diff --git a/tests/prefix_caching/test_prefix_caching.py 
b/tests/prefix_caching/test_prefix_caching.py index fd6564bbfe630..50723dbb610ac 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -27,6 +27,7 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("cached_position", [0, 1]) +@pytest.mark.parametrize("block_size", [16]) def test_mixed_requests( hf_runner, vllm_runner, @@ -36,11 +37,12 @@ def test_mixed_requests( dtype: str, max_tokens: int, cached_position: int, + block_size: int, monkeypatch, ) -> None: """ Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where + and the others don't. The cached position determines where the sequence is at among the batch of prefills. """ override_backend_env_variable(monkeypatch, backend) @@ -53,12 +55,30 @@ def test_mixed_requests( model, dtype=dtype, enable_prefix_caching=True, + block_size=block_size, ) as vllm_model: # Run the first prompt so the cache is populated vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) # Run all the promopts - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert req_outputs[ + i].num_cached_tokens == expected_num_cached_tokens + + vllm_outputs = [ + (output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text) for output in req_outputs + ] check_outputs_equal( outputs_0_lst=hf_outputs, diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 0f01f5f819ea4..569fc8dfb6a21 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -9,7 +9,7 @@ import torch from tests.quantization.utils import is_quant_method_supported -from tests.utils import fork_new_process_for_each_test +from tests.utils import compare_two_settings, fork_new_process_for_each_test models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), @@ -82,6 +82,34 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, vllm_tp_size=2) +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason='Test requires at least 2 GPUs.') +@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), + reason='bitsandbytes is not supported on this GPU type.') +@pytest.mark.parametrize("model_name, description", models_4bit_to_test) +@fork_new_process_for_each_test +def test_load_pp_4bit_bnb_model(model_name, description) -> None: + common_args = [ + "--disable-log-stats", + "--disable-log-requests", + "--dtype", + "bfloat16", + "--enable-prefix-caching", + "--quantization", + "bitsandbytes", + "--load-format", + "bitsandbytes", + "--gpu-memory-utilization", + "0.7", + ] + pp_args = [ + *common_args, + "--pipeline-parallel-size", + "2", + ] + compare_two_settings(model_name, common_args, pp_args) + + def log_generated_texts(prompts, outputs, runner_name): logged_texts = [] for i, (_, generated_text) in enumerate(outputs): diff --git a/tests/test_config.py b/tests/test_config.py index 36c426d6c51f6..df382d22d83ec 100644 --- a/tests/test_config.py 
+++ b/tests/test_config.py @@ -243,6 +243,8 @@ def test_rope_customization(): assert longchat_model_config.max_model_len == 4096 +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Encoder Decoder models not supported on ROCm.") @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ ("facebook/opt-125m", False), ("facebook/bart-base", True), diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 576555b368afe..6818ac44b2478 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -122,7 +122,17 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], ], "supports_parallel": False, - } + }, + "toolACE": { + "model": + "Team-ACE/ToolACE-8B", + "arguments": [ + "--tool-call-parser", "pythonic", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja") + ], + "supports_parallel": + True, + }, } WEATHER_TOOL: ChatCompletionToolParam = { diff --git a/tests/v1/__init__.py b/tests/v1/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e5a3b62258dd8..d614d3e67460f 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,5 +1,5 @@ """Compare the with and without prefix caching.""" -from vllm.inputs import DecoderOnlyInputs +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import hash_block_tokens @@ -8,7 +8,7 @@ def make_request(request_id, prompt_token_ids): return Request( request_id=request_id, - inputs=DecoderOnlyInputs(prompt_token_ids=prompt_token_ids), + inputs=token_inputs(prompt_token_ids=prompt_token_ids), sampling_params=SamplingParams(max_tokens=17), eos_token_id=100, arrival_time=0, diff --git a/tests/v1/engine/__init__.py b/tests/v1/engine/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py new file mode 100644 index 0000000000000..1f26fe0fc892f --- /dev/null +++ b/tests/v1/engine/test_async_llm.py @@ -0,0 +1,66 @@ +import asyncio +from typing import Tuple + +import pytest + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.platforms import current_platform +from vllm.v1.engine.async_llm import AsyncLLM + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + disable_log_requests=True) + + +async def generate(engine: AsyncLLM, request_id: str, + max_tokens: int) -> Tuple[int, str]: + count = 0 + async for _ in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=SamplingParams( + max_tokens=max_tokens, temperature=0)): + + count += 1 + await asyncio.sleep(0.) + + return count, request_id + + +@pytest.mark.asyncio +async def test_load(monkeypatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 10000 + NUM_EXPECTED_TOKENS = 10 + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # Confirm that we got all the EXPECTED tokens from the requests. 
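(Aside: a minimal sketch, assuming the tasks list built above; the same
check can be expressed with asyncio.gather, which surfaces the first
task exception directly:

    results = await asyncio.gather(*tasks)
    for num_generated_tokens, request_id in results:
        assert num_generated_tokens == NUM_EXPECTED_TOKENS

The loop below instead records the first mismatching request id so the
failure message can name it.)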
+ failed_request_id = None + tokens = None + for task in tasks: + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS + and failed_request_id is None): + failed_request_id = request_id + tokens = num_generated_tokens + + assert failed_request_id is None, ( + f"{failed_request_id} generated {tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + engine.shutdown() diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py new file mode 100644 index 0000000000000..07f343666cb5e --- /dev/null +++ b/tests/v1/engine/test_detokenizer.py @@ -0,0 +1,205 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] + +STOP_STRINGS = ["I love working on", "company by far", "brother in"] + +FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] +PROMPT_LEN = 5 +PROMPT_TOKENS = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS +] +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. 
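(Aside: a minimal sketch of the step() contract as exercised here:
Detokenizer.step consumes one batch of EngineCoreOutputs and returns the
detokenized RequestOutputs plus the ids of requests that matched a stop
string and must be aborted upstream. Since this test registers no stop
strings (stop=[] above), the abort list is expected to stay empty:

    request_outputs, requests_to_abort = detokenizer.step(outputs)
    assert len(requests_to_abort) == 0
)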
+        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        assert len(requests_to_abort) == 0
+
+        # Update tracking.
+        for request_output in request_outputs:
+            request_id = request_output.request_id
+            new_text = request_output.outputs[0].text
+            new_tokens = request_output.outputs[0].token_ids
+            if request_id not in gen_strings:
+                gen_strings[request_id] = new_text
+                gen_tokens[request_id] = new_tokens
+            else:
+                gen_strings[request_id] += new_text
+                gen_tokens[request_id].extend(new_tokens)
+
+    # Confirm that the tracked values match what we expected.
+    for idx, (ref_gen_str, ref_gen_toks) in enumerate(
+            zip(GENERATION_STRINGS, GENERATION_TOKENS)):
+        gen_str = gen_strings[f"request-{idx}"]
+        gen_toks = gen_tokens[f"request-{idx}"]
+
+        assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}"
+        assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}"
+
+    assert detokenizer.get_num_unfinished_requests() == 0
+    assert not detokenizer.has_unfinished_requests()
+
+
+@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
+def test_stop_string(include_stop_str_in_output: bool):
+    detokenizer = Detokenizer(TOKENIZER_NAME)
+    engine_core = MockEngineCore(GENERATION_TOKENS)
+
+    # Make N requests.
+    requests = [
+        DetokenizerRequest(
+            request_id=f"request-{idx}",
+            prompt=prompt,
+            prompt_token_ids=prompt_tokens,
+            skip_special_tokens=False,
+            spaces_between_special_tokens=False,
+            output_kind=RequestOutputKind.DELTA,
+            stop=STOP_STRINGS,
+            include_stop_str_in_output=include_stop_str_in_output,
+        ) for idx, (
+            prompt,
+            prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS))
+    ]
+
+    # Add requests to the detokenizer.
+    for request in requests:
+        detokenizer.add_request(request)
+
+    gen_strings = {}
+    aborted = []
+    while True:
+        # Mock output from the EngineCore.
+        outputs = engine_core.get_outputs()
+        if len(outputs) == 0:
+            break
+
+        # Step the Detokenizer.
+        request_outputs, requests_to_abort = detokenizer.step(outputs)
+        for request_output in request_outputs:
+            # If aborted, we should not get a request output.
+            assert request_output.request_id not in aborted
+        aborted.extend(requests_to_abort)
+
+        # Update tracking.
+        for request_output in request_outputs:
+            if request_output.finished:
+                assert request_output.outputs[0].finish_reason == "stop"
+
+            request_id = request_output.request_id
+            new_text = request_output.outputs[0].text
+            if request_id not in gen_strings:
+                gen_strings[request_id] = new_text
+            else:
+                gen_strings[request_id] += new_text
+
+    # Confirm that the tracked values match what we expected.
+    for idx, (ref_gen_str,
+              stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)):
+
+        # The request should have been aborted.
+        request_id = f"request-{idx}"
+        assert request_id in aborted
+
+        # Collect the values that were generated.
+        gen_str = gen_strings[request_id]
+
+        # Construct reference strings.
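(Aside: a worked example for request-0, where the generation text ends
with "... Neural Magic and I love working on vLLM so much!" and the stop
string is "I love working on":

    stop_str_idx = ref_gen_str.find(stop_str)
    ref_gen_str[:stop_str_idx]              # ends with "... Neural Magic and "
    ref_gen_str[:stop_str_idx] + stop_str   # ends with "and I love working on"

This assumes the tokenizer round-trips the text exactly, which holds for
these plain ASCII prompts.)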
+ stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py new file mode 100644 index 0000000000000..b3692b594326a --- /dev/null +++ b/tests/v1/engine/test_engine_core.py @@ -0,0 +1,140 @@ +import time +import uuid + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core import EngineCore + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request() -> EngineCoreRequest: + return EngineCoreRequest( + request_id=uuid.uuid4(), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, + sampling_params=SamplingParams(), + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def test_engine_core(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + usage_context=UsageContext.UNKNOWN_CONTEXT) + """Test basic request lifecycle.""" + + # First request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + # Second request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 1 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Add two requests in a row. + engine_core.add_request(make_request()) + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 4 + + # Loop through until they are all done. + while len(engine_core.step()) > 0: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + """Test abort cycle.""" + + # Basic abort. 
+ req = make_request() + request_id = req.request_id + + engine_core.add_request(req) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + engine_core.abort_requests([request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + # Add, step, abort 1 of the 3. + req0 = make_request() + req1 = make_request() + req2 = make_request() + + engine_core.add_request(req0) + engine_core.add_request(req1) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + engine_core.add_request(req2) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 3 + + # Abort just one. + engine_core.abort_requests([req1.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Abort the other requests at the same time. + engine_core.abort_requests([req2.request_id, req0.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py new file mode 100644 index 0000000000000..7b241bf836a0e --- /dev/null +++ b/tests/v1/engine/test_engine_core_client.py @@ -0,0 +1,205 @@ +import asyncio +import time +import uuid +from typing import Dict, List + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core_client import EngineCoreClient + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request(params: SamplingParams) -> EngineCoreRequest: + return EngineCoreRequest( + request_id=str(uuid.uuid4()), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, + sampling_params=params, + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def loop_until_done(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = client.get_output() + + if len(engine_core_outputs) == 0: + break + + all_finished = True + for out in engine_core_outputs: + outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = await client.get_output_async() + + if len(engine_core_outputs) == 0: + break + + all_finished = 
True + for out in engine_core_outputs: + outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +@pytest.mark.parametrize("multiprocessing_mode", [True, False]) +def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + client.add_request(request) + time.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + # Add requests to the engine. + for idx, request in enumerate(requests): + client.add_request(request) + time.sleep(0.01) + if idx % 2 == 0: + client.abort_requests([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + """Abort after request is finished.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + request = requests[0] + client.add_request(request) + time.sleep(10.) + + client.abort_requests([request.request_id]) + + # Shutdown the client. + client.shutdown() + + +@pytest.mark.asyncio +async def test_engine_core_client_asyncio(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=True, + asyncio_mode=True, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + await client.add_request_async(request) + await asyncio.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Add requests to the engine. 
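(Aside: the abort below races the running engine, so an even-indexed
request may already have streamed a few tokens before the abort lands.
That is why the check afterwards is strictly-less-than rather than
equal-to-zero, sketched as:

    assert len(outputs[req_id]) < MAX_TOKENS  # aborted, possibly after partial output
)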
+ for idx, request in enumerate(requests): + await client.add_request_async(request) + await asyncio.sleep(0.01) + if idx % 2 == 0: + await client.abort_requests_async([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + + # Shutdown the client. + client.shutdown() diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index f985f70728a60..563178d3ab60d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -158,7 +158,8 @@ def get_seq_lens( * Appropriate sequence lengths tensor for key & value ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): seq_lens_q = self.seq_lens seq_lens_kv = self.seq_lens elif attn_type == AttentionType.ENCODER: @@ -189,7 +190,8 @@ def get_attn_bias( * Appropriate attention bias value given the attention type ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): return self.attn_bias elif attn_type == AttentionType.ENCODER: return self.encoder_attn_bias @@ -215,7 +217,8 @@ def set_attn_bias( encoder/decoder cross-attention ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): self.attn_bias = attn_bias elif attn_type == AttentionType.ENCODER: self.encoder_attn_bias = attn_bias @@ -252,7 +255,8 @@ def get_seq_len_block_table_args( * Appropriate block tables (or None) ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): # Decoder self-attention # Choose max_seq_len based on whether we are in prompt_run return (self.seq_lens_tensor, self.max_decode_seq_len, @@ -420,6 +424,8 @@ def forward( "Torch SDPA backend doesn't support prefix decoding.") if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") # Decoding run. 
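(Aside: with this patch, ENCODER_ONLY shares the decoder self-attention
bookkeeping in the helpers above (get_seq_lens, get_attn_bias,
set_attn_bias), but encoder-only models are prefill-only, so the assert
guarantees they never reach this decode branch. A minimal sketch of the
shared lookup, mirroring get_seq_lens:

    if attn_type in (AttentionType.DECODER, AttentionType.ENCODER_ONLY):
        seq_lens_q = seq_lens_kv = self.seq_lens
)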
( seq_lens_arg, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index c3c670422defa..5682faa158069 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -389,6 +389,8 @@ class VllmBackend: returned_callable: Callable # Inductor passes to run on the graph pre-defunctionalization post_grad_passes: Sequence[Callable] + sym_tensor_indices: List[int] + input_buffers: List[torch.Tensor] def __init__(self, post_grad_passes: Sequence[Callable] = ()): global global_graph_pool @@ -401,6 +403,9 @@ def __init__(self, post_grad_passes: Sequence[Callable] = ()): self.graph_pool = global_graph_pool self.post_grad_passes = post_grad_passes + self.sym_tensor_indices = [] + self.input_buffers = [] + # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -461,7 +466,46 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self._called = True - return self.split_gm + if not self.compilation_configs.use_cudagraph or \ + not self.compilation_configs.cudagraph_copy_inputs: + return self.split_gm + + # if we need to copy input buffers for cudagraph + from torch._guards import detect_fake_mode + fake_mode = detect_fake_mode() + fake_args = [ + fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in example_inputs + ] + + # index of tensors that have symbolic shapes (batch size) + self.sym_tensor_indices = [ + i for i, x in enumerate(fake_args) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + ] + + # compiler managed cudagraph input buffers + # we assume the first run with symbolic shapes + # has the maximum size among all the tensors + self.input_buffers = [ + example_inputs[x].clone() for x in self.sym_tensor_indices + ] + + def copy_and_call(*args): + list_args = list(args) + for i, index in enumerate(self.sym_tensor_indices): + runtime_tensor = list_args[index] + runtime_shape = runtime_tensor.shape[0] + static_tensor = self.input_buffers[i][:runtime_shape] + + # copy the tensor to the static buffer + static_tensor.copy_(runtime_tensor) + + # replace the tensor in the list_args to the static buffer + list_args[index] = static_tensor + return self.split_gm(*list_args) + + return copy_and_call @dataclasses.dataclass diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py index 72377533140b5..3e663505c627d 100644 --- a/vllm/compilation/config.py +++ b/vllm/compilation/config.py @@ -32,6 +32,11 @@ class CompilationConfig(BaseModel): It means the first several runs will be treated as warmup runs. Only after that, the execution will be recorded, and the recorded cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. - Inductor compilation: - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. 
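The cudagraph_copy_inputs option documented above pairs with the
copy_and_call wrapper added to backends.py: CUDA graph replay always
reads from the tensor storage captured at record time, so when a caller
cannot guarantee stable input buffers, runtime tensors must first be
copied into compiler-managed static buffers. A minimal standalone sketch
of the same pattern (the names and shapes here are illustrative
assumptions, not from the diff):

    import torch

    # Static buffer sized for the largest batch seen at capture time.
    static_x = torch.zeros(64, 128, device="cuda")

    def run_copied(graph_fn, x: torch.Tensor) -> torch.Tensor:
        # Slice to the runtime batch size and copy in place, so the
        # captured graph keeps reading from the same storage.
        view = static_x[:x.shape[0]]
        view.copy_(x)
        return graph_fn(view)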
@@ -78,6 +83,7 @@ class CompilationConfig(BaseModel): non_cudagraph_ops: List[str] = Field(default_factory=list) cudagraph_num_of_warmups: int = 0 cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False dump_graph_stages: List[str] = Field(default_factory=list) dump_graph_dir: Path = Field(default=Path(".")) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 3053e57e0b63b..ca1e96a33c014 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -6,6 +6,7 @@ import vllm.envs as envs from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo @@ -110,26 +111,26 @@ def _support_torch_compile(cls: type, """ A decorator to add support for compiling the forward method of a class. """ - - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner - # will handle the compilation, so we don't need to do anything here. - if envs.VLLM_TORCH_COMPILE_LEVEL in [ - CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS - ] or not supports_dynamo(): + if TorchCompileWrapperWithCustomDispatcher in cls.__bases__: + # support decorating multiple times return cls # take care of method resolution order # make sure super().__init__ is called on the base class # other than TorchCompileWrapperWithCustomDispatcher - if TorchCompileWrapperWithCustomDispatcher not in cls.__bases__: - # support decorating multiple times - cls.__bases__ = cls.__bases__ + ( - TorchCompileWrapperWithCustomDispatcher, ) + cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) old_init = cls.__init__ # type: ignore - def __init__(self, *args, **kwargs): - old_init(self, *args, **kwargs) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): + old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # will handle the compilation, so we don't need to do anything here. + self.do_not_compile = envs.VLLM_TORCH_COMPILE_LEVEL in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS + ] or not supports_dynamo() + if self.do_not_compile: + return TorchCompileWrapperWithCustomDispatcher.__init__(self) cls.__init__ = __init__ # type: ignore @@ -138,7 +139,7 @@ def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. 
- if torch.compiler.is_compiling(): + if self.do_not_compile or torch.compiler.is_compiling(): return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked diff --git a/vllm/config.py b/vllm/config.py index d358d0b2deddd..f61665eb2214f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -19,9 +19,9 @@ ConfigFormat, get_config, get_hf_image_processor_config, get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) +from vllm.transformers_utils.s3_utils import S3Model, is_s3 from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, print_warning_once) -from vllm.transformers_utils.s3_utils import is_s3, S3Model if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -108,7 +108,7 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data instances per modality + limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. override_neuron_config: Initialize non default neuron config or override default neuron config that are specific to Neuron devices, @@ -195,16 +195,17 @@ def __init__( msg = ("`--rope-theta` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) - + if is_s3(model): self.s3_model = S3Model() self.s3_model.pull_files(model, allow_pattern=["*config.json"]) self.model_weights = self.model self.model = self.s3_model.dir - + if is_s3(tokenizer): self.s3_tokenizer = S3Model() - self.s3_tokenizer.pull_files(model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) + self.s3_tokenizer.pull_files( + model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"]) self.tokenizer = self.s3_tokenizer.dir # The tokenizer version is consistent with the model version by default. @@ -964,9 +965,12 @@ class ParallelConfig: https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. placement_group: ray distributed model workers placement group. distributed_executor_backend: Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). If either - pipeline_parallel_size or tensor_parallel_size is greater than 1, - will default to "ray" if Ray is installed or "mp" otherwise. + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + and hpu only support Ray for distributed inference. """ def __init__( @@ -1697,6 +1701,7 @@ class LoRAConfig: # This is a constant. lora_vocab_padding_size: ClassVar[int] = 256 long_lora_scaling_factors: Optional[Tuple[float]] = None + bias_enabled: bool = False def __post_init__(self): # Setting the maximum rank to 256 should be able to satisfy the vast @@ -2051,12 +2056,15 @@ class VllmConfig: simplifies passing around the distinct configurations in the codebase. 
""" - model_config: ModelConfig - cache_config: CacheConfig - parallel_config: ParallelConfig - scheduler_config: SchedulerConfig - device_config: DeviceConfig - load_config: LoadConfig + model_config: ModelConfig = field(default=None, init=True) # type: ignore + cache_config: CacheConfig = field(default=None, init=True) # type: ignore + parallel_config: ParallelConfig = field(default=None, + init=True) # type: ignore + scheduler_config: SchedulerConfig = field(default=None, + init=True) # type: ignore + device_config: DeviceConfig = field(default=None, + init=True) # type: ignore + load_config: LoadConfig = field(default=None, init=True) # type: ignore lora_config: Optional[LoRAConfig] = None speculative_config: Optional[SpeculativeConfig] = None decoding_config: Optional[DecodingConfig] = None @@ -2101,11 +2109,14 @@ def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig": def __post_init__(self): """Verify configs are valid & consistent with each other. """ - self.model_config.verify_async_output_proc(self.parallel_config, - self.speculative_config, - self.device_config) - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) + + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: self.lora_config.verify_with_model_config(self.model_config) @@ -2119,3 +2130,44 @@ def __post_init__(self): self.model_config is not None and self.load_config is not None: self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + + def __str__(self): + return ("model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " + "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " + "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " + "pipeline_parallel_size=%d, " + "disable_custom_all_reduce=%s, quantization=%s, " + "enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s, " + "num_scheduler_steps=%d, enable_prefix_caching=%s, " + "use_async_output_proc=%s, mm_processor_kwargs=%s") % \ + (self.model_config.model, self.speculative_config, + self.model_config.tokenizer, + self.model_config.skip_tokenizer_init, + self.model_config.tokenizer_mode, + self.model_config.revision, + self.model_config.override_neuron_config, + self.model_config.tokenizer_revision, + self.model_config.trust_remote_code, + self.model_config.dtype, + self.model_config.max_model_len, + self.load_config.download_dir, + self.load_config.load_format, + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size, + self.parallel_config.disable_custom_all_reduce, + self.model_config.quantization, + self.model_config.enforce_eager, + self.cache_config.cache_dtype, + self.model_config.quantization_param_path, + self.device_config.device, self.decoding_config, + self.observability_config, self.model_config.seed, + self.model_config.served_model_name, + self.scheduler_config.num_scheduler_steps, + self.cache_config.enable_prefix_caching, + 
self.model_config.use_async_output_proc, + self.model_config.mm_processor_kwargs) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 61ed7afba12ed..21f4c63b6572d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -393,7 +393,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - seq_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap out. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -409,7 +409,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -459,7 +459,7 @@ def _can_swap(self, on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap in/out. device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. RUNNING for swap out and SWAPPED for swap in diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7319566545678..7c6f48e88637b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -9,6 +9,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum, ncclRedOpTypeEnum, ncclUniqueId) +from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +19,7 @@ class PyNcclCommunicator: def __init__( self, - group: ProcessGroup, + group: Union[ProcessGroup, StatelessProcessGroup], device: Union[int, str, torch.device], library_path: Optional[str] = None, ): @@ -33,13 +34,18 @@ def __init__( It is the caller's responsibility to make sure each communicator is bind to a unique device. 
""" - assert dist.is_initialized() - assert dist.get_backend(group) != dist.Backend.NCCL, ( - "PyNcclCommunicator should be attached to a non-NCCL group.") + if not isinstance(group, StatelessProcessGroup): + assert dist.is_initialized() + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "PyNcclCommunicator should be attached to a non-NCCL group.") + # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + self.group = group - # note: this rank is the rank in the group - self.rank = dist.get_rank(group) - self.world_size = dist.get_world_size(group) # if world_size == 1, no need to create communicator if self.world_size == 1: @@ -68,13 +74,17 @@ def __init__( else: # construct an empty unique id self.unique_id = ncclUniqueId() - tensor = torch.ByteTensor(list(self.unique_id.internal)) - ranks = dist.get_process_group_ranks(group) - # arg `src` in `broadcast` is the global rank - dist.broadcast(tensor, src=ranks[0], group=group) - byte_list = tensor.tolist() - for i, byte in enumerate(byte_list): - self.unique_id.internal[i] = byte + + if not isinstance(group, StatelessProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) if isinstance(device, int): device = torch.device(f"cuda:{device}") elif isinstance(device, str): diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index d24ce898707fc..dcfcb848cbe06 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -2,14 +2,14 @@ # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -from typing import Sequence, Tuple +import dataclasses +import pickle +import time +from collections import deque +from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) -from torch.distributed.rendezvous import rendezvous +from torch.distributed import TCPStore import vllm.envs as envs from vllm.logger import init_logger @@ -91,69 +91,137 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, return (start_layer, end_layer) -def stateless_init_process_group(init_method: str, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. - - If we have process A and process B called `torch.distributed.init_process_group` - to form a group, and then we want to form another group with process A, B, C, - D, it is not possible in PyTorch, because process A and process B have already - formed a group, and process C and process D cannot join that group. This - function is a workaround for this issue. - - `torch.distributed.init_process_group` is a global call, while this function - is a stateless call. It will return a `ProcessGroup` object that can be used - for collective communication. 
With this function, process A and process B - can call `stateless_init_process_group` to form a group, and then process A, B, - C, and D can call `stateless_init_process_group` to form another group. - """ # noqa - - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) - - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - pg_options, - ) - - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg +@dataclasses.dataclass +class StatelessProcessGroup: + """A dataclass to hold a metadata store, and the rank, world_size of the + group. Only use it to communicate metadata between processes. + For data-plane communication, create NCCL-related objects. + """ + rank: int + world_size: int + store: torch._C._distributed_c10d.Store + data_expiration_seconds: int = 3600 # 1 hour + + # dst rank -> counter + send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + # src rank -> counter + recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + broadcast_send_counter: int = 0 + broadcast_recv_src_counter: Dict[int, int] = dataclasses.field( + default_factory=dict) + + # A deque to store the data entries, with key and timestamp. 
+ entries: Deque[Tuple[str, + float]] = dataclasses.field(default_factory=deque) + + def __post_init__(self): + assert self.rank < self.world_size + self.send_dst_counter = {i: 0 for i in range(self.world_size)} + self.recv_src_counter = {i: 0 for i in range(self.world_size)} + self.broadcast_recv_src_counter = { + i: 0 + for i in range(self.world_size) + } + + def send_obj(self, obj: Any, dst: int): + """Send an object to a destination rank.""" + self.expire_data() + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" + self.store.set(key, pickle.dumps(obj)) + self.send_dst_counter[dst] += 1 + self.entries.append((key, time.time())) + + def expire_data(self): + """Expire data that is older than `data_expiration_seconds` seconds.""" + while self.entries: + # check the oldest entry + key, timestamp = self.entries[0] + if time.time() - timestamp > self.data_expiration_seconds: + self.store.delete_key(key) + self.entries.popleft() + else: + break + + def recv_obj(self, src: int) -> Any: + """Receive an object from a source rank.""" + obj = pickle.loads( + self.store.get( + f"send_to/{self.rank}/{self.recv_src_counter[src]}")) + self.recv_src_counter[src] += 1 + return obj + + def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: + """Broadcast an object from a source rank to all other ranks. + It does not clean up after all ranks have received the object. + Use it for limited times, e.g., for initialization. + """ + if self.rank == src: + self.expire_data() + key = (f"broadcast_from/{src}/" + f"{self.broadcast_send_counter}") + self.store.set(key, pickle.dumps(obj)) + self.broadcast_send_counter += 1 + self.entries.append((key, time.time())) + return obj + else: + key = (f"broadcast_from/{src}/" + f"{self.broadcast_recv_src_counter[src]}") + recv_obj = pickle.loads(self.store.get(key)) + self.broadcast_recv_src_counter[src] += 1 + return recv_obj + + def all_gather_obj(self, obj: Any) -> list[Any]: + """All gather an object from all ranks.""" + gathered_objs = [] + for i in range(self.world_size): + if i == self.rank: + gathered_objs.append(obj) + self.broadcast_obj(obj, src=self.rank) + else: + recv_obj = self.broadcast_obj(None, src=i) + gathered_objs.append(recv_obj) + return gathered_objs + + def barrier(self): + """A barrier to synchronize all ranks.""" + for i in range(self.world_size): + if i == self.rank: + self.broadcast_obj(None, src=self.rank) + else: + self.broadcast_obj(None, src=i) + + @staticmethod + def create( + host: str, + port: int, + rank: int, + world_size: int, + data_expiration_seconds: int = 3600, + ) -> "StatelessProcessGroup": + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. + + If we have process A and process B called `torch.distributed.init_process_group` + to form a group, and then we want to form another group with process A, B, C, + D, it is not possible in PyTorch, because process A and process B have already + formed a group, and process C and process D cannot join that group. This + function is a workaround for this issue. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It will return a `StatelessProcessGroup` object that can be + used for exchanging metadata. With this function, process A and process B + can call `StatelessProcessGroup.create` to form a group, and then process A, B, + C, and D can call `StatelessProcessGroup.create` to form another group. 
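A usage sketch for the metadata-plane API above (host/port are illustrative; rank and world_size are assumed to come from the launcher):

    # Sketch: metadata-only exchange over the TCPStore-backed group.
    group = StatelessProcessGroup.create(
        host="10.0.0.1", port=29500, rank=rank, world_size=world_size)

    # Rank 0 publishes a small object; every rank gets the same value back.
    cfg = group.broadcast_obj({"kv_blocks": 1024} if rank == 0 else None,
                              src=0)
    all_ranks = group.all_gather_obj(rank)  # e.g. [0, 1, 2, 3]
    group.barrier()                         # built on broadcast_obj rounds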
+ """ # noqa + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=(rank == 0), + ) + + return StatelessProcessGroup( + rank=rank, + world_size=world_size, + store=store, + data_expiration_seconds=data_expiration_seconds) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index acfcb6456f878..cb178dafdecb0 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -19,8 +19,6 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -145,6 +143,7 @@ class EngineArgs: limit_mm_per_prompt: Optional[Mapping[str, int]] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None enable_lora: bool = False + enable_lora_bias: bool = False max_loras: int = 1 max_lora_rank: int = 16 enable_prompt_adapter: bool = False @@ -373,9 +372,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--distributed-executor-backend', choices=['ray', 'mp'], default=EngineArgs.distributed_executor_backend, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') + help='Backend to use for distributed model ' + 'workers, either "ray" or "mp" (multiprocessing). If the product ' + 'of pipeline_parallel_size and tensor_parallel_size is less than ' + 'or equal to the number of GPUs available, "mp" will be used to ' + 'keep processing on a single host. Otherwise, this will default ' + 'to "ray" if Ray is installed and fail otherwise. Note that TPU ' + 'and HPU only support Ray for distributed inference.') + parser.add_argument( '--worker-use-ray', action='store_true', @@ -583,6 +587,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--enable-lora', action='store_true', help='If True, enable handling of LoRA adapters.') + parser.add_argument('--enable-lora-bias', + action='store_true', + help='If True, enable bias for LoRA adapters.') parser.add_argument('--max-loras', type=int, default=EngineArgs.max_loras, @@ -621,8 +628,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) + 'Must be >= max_loras. 
' + 'Defaults to max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', @@ -1015,8 +1022,6 @@ def create_engine_config(self) -> VllmConfig: "supported for multimodal models and has been disabled.") self.enable_prefix_caching = False - maybe_register_config_serialize_by_value(self.trust_remote_code) - cache_config = CacheConfig( # neuron needs block_size = max_model_len block_size=self.block_size if self.device != "neuron" else @@ -1149,6 +1154,7 @@ def create_engine_config(self) -> VllmConfig: and parallel_config.use_ray), policy=self.scheduling_policy) lora_config = LoRAConfig( + bias_enabled=self.enable_lora_bias, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, fully_sharded_loras=self.fully_sharded_loras, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1a371b52bb64b..5a5388708b1c6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -19,6 +19,7 @@ from vllm.executor.gpu_executor import GPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( @@ -729,6 +730,9 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.engine.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d550b1d244af8..f5299746d845d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,7 +30,7 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType) + PromptType, SingletonInputsAdapter) from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -39,6 +39,7 @@ from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams @@ -226,6 +227,7 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: @@ -335,7 +337,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( @@ -851,13 +854,6 @@ def add_request( ) processed_inputs = self.input_processor(preprocessed_inputs) - # This is a bit of a hack - copy the mm_processor_kwargs that were - # used in the input processor to the processed output, since these - # kwargs are presumed to be immutable and the values should be aligned - # between the input processor (here) and the 
input mapper. - processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( - "mm_processor_kwargs") - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -1672,6 +1668,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 + num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else @@ -1680,6 +1677,10 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] time_in_queue_requests: List[float] = [] model_forward_time_requests: List[float] = [] model_execute_time_requests: List[float] = [] @@ -1687,6 +1688,7 @@ def _get_stats(self, num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] n_requests: List[int] = [] + max_num_generation_tokens_requests: List[int] = [] max_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] @@ -1777,6 +1779,18 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + if (seq_group.metrics.first_scheduled_time is not None and + seq_group.metrics.first_token_time is not None): + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) + time_decode_requests.append( + now - seq_group.metrics.first_token_time) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) if seq_group.metrics.time_in_queue is not None: time_in_queue_requests.append( seq_group.metrics.time_in_queue) @@ -1793,6 +1807,9 @@ def _get_stats(self, seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) + max_num_generation_tokens_requests.append( + max(seq.get_output_len() + for seq in seq_group.get_seqs())) if seq_group.sampling_params is not None: n_requests.append(seq_group.sampling_params.n) max_tokens_requests.append( @@ -1811,7 +1828,8 @@ def _get_stats(self, num_generation_tokens_iter = ( actual_num_batched_tokens - num_prompt_tokens_iter + num_generation_tokens_from_prefill_groups) - + num_tokens_iter = (num_generation_tokens_iter + + num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. 
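As an aside, the four per-request phase timings appended above decompose as follows (a restatement of the code, not new behavior):

    # queue     = first_scheduled_time - arrival_time           (WAITING)
    # prefill   = first_token_time     - first_scheduled_time   (PREFILL)
    # decode    = now                  - first_token_time       (DECODE)
    # inference = now                  - first_scheduled_time   (RUNNING)
    # so inference == prefill + decode, and e2e == queue + inference.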
if model_output and (model_output[0].spec_decode_worker_metrics @@ -1837,6 +1855,7 @@ # Iteration stats num_prompt_tokens_iter=num_prompt_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter, + num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, @@ -1845,12 +1864,18 @@ # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_queue_requests=time_queue_requests, + time_inference_requests=time_inference_requests, + time_prefill_requests=time_prefill_requests, + time_decode_requests=time_decode_requests, time_in_queue_requests=time_in_queue_requests, model_forward_time_requests=model_forward_time_requests, model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, n_requests=n_requests, max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, @@ -1990,7 +2015,7 @@ def _validate_model_inputs(self, inputs: ProcessorInputs, else: prompt_inputs = inputs - prompt_ids = prompt_inputs.get("prompt_token_ids") + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 3e3357ed74633..e896bcdded2d1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -111,6 +111,15 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) + self.counter_tokens = self._counter_cls( + name="vllm:tokens_total", + documentation="Number of prefill plus generation tokens processed.", + labelnames=labelnames) + self.histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + labelnames=labelnames, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]) self.histogram_time_to_first_token = self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -130,23 +139,45 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency + request_latency_buckets = [ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) + self.histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_prefill_time_request = self._histogram_cls( + 
name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) self.histogram_time_in_queue_request = self._histogram_cls( name="vllm:time_in_queue_requests", documentation= "Histogram of time the request spent in the queue in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) self.histogram_model_forward_time_request = self._histogram_cls( name="vllm:model_forward_time_milliseconds", documentation= @@ -173,6 +204,12 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) + self.histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len)) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -526,6 +563,8 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_prompt_tokens_iter) self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter) + self._log_histogram(self.metrics.histogram_iteration_tokens, + [stats.num_tokens_iter]) self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, @@ -535,6 +574,14 @@ def _log_prometheus(self, stats: Stats) -> None: # Latency self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) + self._log_histogram(self.metrics.histogram_queue_time_request, + stats.time_queue_requests) + self._log_histogram(self.metrics.histogram_inference_time_request, + stats.time_inference_requests) + self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_prefill_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, + stats.time_decode_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request, stats.time_in_queue_requests) self._log_histogram(self.metrics.histogram_model_forward_time_request, @@ -553,6 +600,9 @@ def _log_prometheus(self, stats: Stats) -> None: self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) + self._log_histogram( + self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 19dcbfe57d112..5f7ec3bbcb269 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -39,6 +39,7 @@ class Stats: # Iteration stats (should have _iter suffix) num_prompt_tokens_iter: int num_generation_tokens_iter: int + num_tokens_iter: int time_to_first_tokens_iter: List[float] time_per_output_tokens_iter: List[float] num_preemption_iter: int @@ -46,6 +47,10 @@ class Stats: # Request stats 
(should have _requests suffix) # Latency time_e2e_requests: List[float] + time_queue_requests: List[float] + time_inference_requests: List[float] + time_prefill_requests: List[float] + time_decode_requests: List[float] time_in_queue_requests: List[float] model_forward_time_requests: List[float] model_execute_time_requests: List[float] @@ -53,6 +58,7 @@ class Stats: num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] n_requests: List[int] + max_num_generation_tokens_requests: List[int] max_tokens_requests: List[int] finished_reason_requests: List[str] waiting_lora_adapters: List[str] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 882742c2fc61b..fe21c58c775fe 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -31,6 +31,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -94,6 +95,8 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, parallel_config=engine_config.parallel_config, enable_lora=bool(engine_config.lora_config), ) + self.input_preprocessor = InputPreprocessor(self.model_config, + self.tokenizer) # Send RPCGenerateRequest to the MQLLMEngine. self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) @@ -345,6 +348,9 @@ async def _check_success(error_message: str, socket: Socket): or response != VLLM_RPC_SUCCESS_STR): raise ValueError(error_message) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.input_preprocessor + async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): return await self.tokenizer.get_lora_tokenizer_async(lora_request) diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 889845ee67312..7de23643a2e1c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -6,7 +6,6 @@ import cloudpickle import zmq -import vllm.envs from vllm import AsyncEngineArgs, SamplingParams from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block @@ -113,17 +112,9 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, load_general_plugins() engine_config = engine_args.create_engine_config() - if vllm.envs.VLLM_USE_V1: - # Lazy import: the v1 package isn't distributed - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - engine_class = V1LLMEngine - else: - engine_class = LLMEngine - - executor_class = engine_class._get_executor_cls(engine_config) + executor_class = LLMEngine._get_executor_cls(engine_config) - use_async_sockets = (engine_config.model_config.use_async_output_proc - and not vllm.envs.VLLM_USE_V1) + use_async_sockets = engine_config.model_config.use_async_output_proc return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index a71ad493d9920..4b701f81504bb 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional +from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -67,9 +67,13 @@ def maybe_stop_sequence( return # 
Check if any stop strings are matched. - stop_str = self._check_stop_strings(seq, new_char_count, - sampling_params) - if stop_str is not None: + stop = self.check_stop_strings( + seq.output_text, new_char_count, sampling_params.stop, + sampling_params.include_stop_str_in_output) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + seq.output_text = seq.output_text[:truncate_to] seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = stop_str return @@ -85,33 +89,40 @@ def maybe_stop_sequence( return @staticmethod - def _check_stop_strings(seq: Sequence, new_char_count: int, - sampling_params: SamplingParams) -> Optional[str]: + def check_stop_strings( + output_text: str, + new_char_count: int, + stop: List[str], + include_in_output: bool, + ) -> Optional[Tuple[str, int]]: """Check if any stop strings are matched and truncate sequence output text accordingly. - Returns the stop string if matched or else None. + Returns tuple (stop_string, offset) if matched or else None. + + Where stop_string is the matched stop string and offset is the + length to which output_text should be truncated, or -1 for no + truncation. """ - if not new_char_count or not sampling_params.stop: + if not new_char_count or not stop: return None - for stop_str in sampling_params.stop: + for stop_str in stop: stop_string_len = len(stop_str) # Avoid searching already-searched text. - stop_index = seq.output_text.find( - stop_str, -new_char_count - stop_string_len) + stop_index = output_text.find(stop_str, + -new_char_count - stop_string_len) if stop_index == -1: continue - if sampling_params.include_stop_str_in_output: + if include_in_output: # Truncate to end of stop string. stop_index += stop_string_len - if stop_index >= len(seq.output_text): + if stop_index >= len(output_text): # No truncation required. - return stop_str + return stop_str, -1 # Truncate the output text to either the beginning # or end of the stop string. - seq.output_text = seq.output_text[:stop_index] - return stop_str + return stop_str, stop_index return None diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e0b59d94cfdc3..e15395d75c91f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -62,7 +62,6 @@ def generate( async def beam_search( self, prompt: PromptType, - model_config: ModelConfig, request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -74,13 +73,14 @@ async def beam_search( length_penalty = params.length_penalty include_stop_str_in_output = params.include_stop_str_in_output - tokenizer = await self.get_tokenizer() - input_preprocessor = InputPreprocessor(model_config, tokenizer) + preprocessor = await self.get_input_preprocessor() + tokenizer_group = preprocessor.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async() if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError else: - processed_inputs = input_preprocessor._prompt_to_llm_inputs( + processed_inputs = preprocessor._prompt_to_llm_inputs( prompt, request_id=request_id, ) @@ -220,6 +220,7 @@ async def abort(self, request_id: str) -> None: Args: request_id: The unique id of the request. """ + ... @abstractmethod async def get_model_config(self) -> ModelConfig: @@ -228,8 +229,13 @@ async def get_model_config(self) -> ModelConfig: @abstractmethod async def get_decoding_config(self) -> DecodingConfig: - ... """Get the decoding configuration of the vLLM engine.""" + ... 
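Because the stop-string check above is now a pure static helper, its truncation contract is easy to demonstrate (a sketch; values are illustrative):

    from vllm.engine.output_processor.stop_checker import StopChecker

    # "END" starts at index 13 of the 19-character text; with
    # include_in_output=False the caller truncates to output_text[:13].
    match = StopChecker.check_stop_strings(
        output_text="Hello world! END of",
        new_char_count=7,
        stop=["END"],
        include_in_output=False,
    )
    assert match == ("END", 13)
    # If the stop string ends the text and include_in_output=True, the
    # returned offset is -1, meaning no truncation is required.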
+ + @abstractmethod + async def get_input_preprocessor(self) -> InputPreprocessor: + """Get the input processor of the vLLM engine.""" + ... @abstractmethod async def get_tokenizer( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f830839776364..a15dbd1c45119 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -210,8 +210,11 @@ def __init__( # Logic to switch between engines is done at runtime instead of import # to avoid import order issues self.engine_class = self.get_engine_class() + + # TODO(rob): enable mp by default (issue with fork vs spawn) self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) + self.request_counter = Counter() @staticmethod diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b8b7912742d45..6a24cdbc6a18f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -26,7 +26,6 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient @@ -61,6 +60,11 @@ from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION +if envs.VLLM_USE_V1: + from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore +else: + from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore + TIMEOUT_KEEP_ALIVE = 5 # seconds prometheus_multiproc_dir: tempfile.TemporaryDirectory @@ -126,7 +130,8 @@ async def build_async_engine_client_from_engine_args( # Fall back # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) - or disable_frontend_multiprocessing): + or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + engine_config = engine_args.create_engine_config() uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) @@ -143,6 +148,8 @@ async def build_async_engine_client_from_engine_args( None, build_engine) yield engine_client + if hasattr(engine_client, "shutdown"): + engine_client.shutdown() return # Otherwise, use the multiprocessing AsyncLLMEngine. @@ -533,6 +540,7 @@ def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a089985ac9758..eb08a89293370 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -190,7 +190,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help= "Enable auto tool choice for supported models. 
Use --tool-call-parser" - "to specify which parser to use") + " to specify which parser to use") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -228,6 +228,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1335e51bd152c..820aefd8800d9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -99,10 +99,15 @@ class ModelList(OpenAIBaseModel): data: List[ModelCard] = Field(default_factory=list) +class PromptTokenUsageInfo(OpenAIBaseModel): + cached_tokens: Optional[int] = None + + class UsageInfo(OpenAIBaseModel): prompt_tokens: int = 0 total_tokens: int = 0 completion_tokens: Optional[int] = 0 + prompt_tokens_details: Optional[PromptTokenUsageInfo] = None class RequestResponseMetadata(BaseModel): @@ -454,6 +459,12 @@ def check_tool_usage(cls, data): if "tool_choice" not in data and data.get("tools"): data["tool_choice"] = "auto" + # if "tool_choice" is "none" -- ignore tools if present + if "tool_choice" in data and data["tool_choice"] == "none": + # ensure that no tools are present + data.pop("tools", None) + return data + # if "tool_choice" is specified -- validation if "tool_choice" in data: @@ -467,8 +478,8 @@ def check_tool_usage(cls, data): if data["tool_choice"] != "auto" and not isinstance( data["tool_choice"], dict): raise ValueError( - "`tool_choice` must either be a named tool or \"auto\". 
" - "`tool_choice=\"none\" is not supported.") + "`tool_choice` must either be a named tool, \"auto\", " + "or \"none\".") # ensure that if "tool_choice" is specified as an object, # it matches a valid tool diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 0d016d949d22b..1b422a93263b2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -78,6 +78,11 @@ def parse_args(): help="Port number for the Prometheus metrics server " "(only needed if enable-metrics is set).", ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser.parse_args() @@ -217,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9551b4f2091dd..5178481c737b4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -18,8 +18,8 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, - DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata, - ToolCall, UsageInfo) + DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, + RequestResponseMetadata, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, @@ -49,7 +49,8 @@ def __init__(self, chat_template: Optional[str], return_tokens_as_token_ids: bool = False, enable_auto_tools: bool = False, - tool_parser: Optional[str] = None): + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False): super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -73,6 +74,11 @@ def __init__(self, self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None if self.enable_auto_tools: try: + if (tool_parser == "pythonic" and + model_config.model.startswith("meta-llama/Llama-3.2")): + logger.warning( + "Llama3.2 models may struggle to emit valid pythonic" + " tool calls") self.tool_parser = ToolParserManager.get_tool_parser( tool_parser) except Exception as e: @@ -80,6 +86,8 @@ def __init__(self, f"tool_parser:'{tool_parser}' which has not " "been registered") from e + self.enable_prompt_tokens_details = enable_prompt_tokens_details + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -119,6 +127,42 @@ async def create_chat_completion( return self.create_error_response( "tool_choice = \"required\" is not supported!") + # NOTE: There is currently a bug in pydantic where attributes + # declared as iterables are replaced in in the instances by + # pydantic-core ValidatorIterator instance. In particular, this + # affects tool_calls defined in ChatCompletionAssistantMessageParam + # model: + # see: + # - https://github.com/pydantic/pydantic/issues/9467 + # As a result, tool_calls from assistant messages are never + # deserialized in the request object if the tool_calls iterator is + # not consumed. 
This affects messages passed to the MistralTokenizer + # since no chat template is applied and therefore the tool_calls + # iterator is not directly consumed. + # Issue is tracked on Pydantic side, with resolution planned for + # v2.11 release. In the meantime, the official workaround is to + # consume the iterator so the tool_calls are correctly deserialized + # in the OpenAI ChatCompletionAssistantMessageParam object + # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 + # Official Pydantic Issues: + # - https://github.com/pydantic/pydantic/issues/9541 + # TODO: remove when pydantic v2.11 is released + if isinstance(tokenizer, MistralTokenizer): + for i, message in enumerate(request.messages): + if message.get("role") == 'assistant': + tool_calls_validator = message.get( + "tool_calls", ().__iter__()) + validated_tool_calls = [] + while True: + try: + tool_call = next( + tool_calls_validator) # type: ignore + validated_tool_calls.append(tool_call) + except StopIteration: + break + request.messages[i][ + "tool_calls"] = validated_tool_calls + if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) and not isinstance(tokenizer, MistralTokenizer)): @@ -187,7 +231,6 @@ async def create_chat_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) @@ -252,6 +295,7 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 + num_cached_tokens = None if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -305,6 +349,7 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). 
if first_iteration: + num_cached_tokens = res.num_cached_tokens # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) @@ -530,11 +575,13 @@ async def chat_completion_stream_generator( # is sent, send the usage if include_usage: completion_tokens = sum(previous_num_tokens) - final_usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) + final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + + completion_tokens) + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens) final_usage_chunk = ChatCompletionStreamResponse( id=request_id, @@ -702,11 +749,13 @@ async def chat_completion_full_generator( num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) request_metadata.final_usage_info = usage diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index db31b1153d97e..936aae8f1c267 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -140,7 +140,6 @@ async def create_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e31dc2ced61fb..fa315fa516632 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -469,12 +469,19 @@ async def _preprocess_chat( mm_data = await mm_data_future - if tool_parser is not None: + # tool parsing is done only if a tool_parser has been set and if + # tool_choice is not "none" (if tool_choice is "none" but a tool_parser + # is set, we want to prevent parsing a tool_call hallucinated by the LLM + should_parse_tools = tool_parser is not None and (hasattr( + request, "tool_choice") and request.tool_choice != "none") + + if should_parse_tools: if not isinstance(request, ChatCompletionRequest): msg = "Tool usage is only supported for Chat Completions API" raise NotImplementedError(msg) - request = tool_parser(tokenizer).adjust_request(request=request) + request = tool_parser(tokenizer).adjust_request( # type: ignore + request=request) if isinstance(request_prompt, str): prompt_inputs = self._tokenize_prompt_input( diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 2187862e8380b..2850349a44835 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -6,9 +6,11 @@ from .jamba_tool_parser import JambaToolParser from .llama_tool_parser import 
Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser +from .pythonic_tool_parser import PythonicToolParser __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", - "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser" + "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", + "PythonicToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py new file mode 100644 index 0000000000000..26da4d689fb8b --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -0,0 +1,289 @@ +import ast +import json +import re +from typing import Any, Sequence, Tuple, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class _UnexpectedAstError(Exception): + pass + + +@ToolParserManager.register_module("pythonic") +class PythonicToolParser(ToolParser): + """ + Tool call parser for models that produce tool calls in a pythonic style, + such as Llama 3.2 models. + + Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set + """ + # TODO(mdepinet): Possible future improvements: + # 1. Support text + tools separated by either <|python_tag|> or \n\n + # 2. Support tools outside of a list (or separated by a semicolon). + # This depends on item 1 for consistent streaming. + # Neither of these are necessary for e.g. ToolACE, but both would help make + # Llama3.2 models more reliable. + + TOOL_CALL_REGEX = re.compile( + r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]", + re.DOTALL) + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + + # Rename for readability. This is NOT a tool id. + @property + def current_tool_index(self) -> int: + return self.current_tool_id + + @current_tool_index.setter + def current_tool_index(self, value: int) -> None: + self.current_tool_id = value + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. 
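To make the expected input format concrete, here is a sketch of what extract_tool_calls does with a typical pythonic completion (function name and arguments are invented for illustration):

    # Typical pythonic completion from the model:
    model_output = "[get_weather(city='Paris', unit='celsius')]"
    # extract_tool_calls() parses this with ast.parse and returns
    # tools_called=True, content=None, and a single ToolCall whose
    # arguments are re-encoded as JSON:
    #   FunctionCall(name="get_weather",
    #                arguments='{"city": "Paris", "unit": "celsius"}')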
+ """ + + if not (self.TOOL_CALL_REGEX.match(model_output)): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + module = ast.parse(model_output) + parsed = getattr(module.body[0], "value", None) + if isinstance(parsed, ast.List) and all( + isinstance(e, ast.Call) for e in parsed.elts): + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=[ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ], + content=None) + else: + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + except Exception: + logger.exception("Error in extracting tool call from response.") + # Treat as regular text + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + if not current_text.startswith("["): + return DeltaMessage(content=delta_text) + + try: + valid_and_added_text = _make_valid_python(current_text) + if valid_and_added_text is None: + return None + valid_text, added_text = valid_and_added_text + + module = ast.parse(valid_text) + parsed = getattr(module.body[0], "value", None) + if not isinstance(parsed, ast.List) or not all( + isinstance(e, ast.Call) for e in parsed.elts): + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + tool_calls = [ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ] + + tool_deltas = [] + for index, new_call in enumerate(tool_calls): + if index < self.current_tool_index: + continue + + self.current_tool_index = index + if len(self.streamed_args_for_tool) == index: + self.streamed_args_for_tool.append("") + + new_call_complete = index < len( + tool_calls) - 1 or ")]" not in added_text + if new_call_complete: + self.current_tool_index += 1 + + withheld_suffix = (added_text[:-2] + if not new_call_complete else "") + if not new_call_complete and added_text[-2] == ")": + # Function call is incomplete. Withhold the closing bracket. + withheld_suffix = withheld_suffix + "}" + # Strings get single quotes in the model-produced string. + # JSON requires double quotes. + withheld_suffix = withheld_suffix.replace("'", '"') + delta = _compute_tool_delta(self.streamed_args_for_tool[index], + new_call, index, withheld_suffix) + + if delta is not None: + tool_deltas.append(delta) + if (delta.function is not None + and delta.function.arguments is not None): + self.streamed_args_for_tool[ + index] += delta.function.arguments + + # HACK: serving_chat.py inspects the internal state of tool parsers + # when determining it's final streaming delta, automatically + # adding autocompleted JSON. + # These two lines avoid that nonsense while ensuring finish_reason + # is set to tool_calls when at least one tool is called. + if tool_deltas and not self.prev_tool_call_arr: + self.prev_tool_call_arr = [{"arguments": {}}] + + if tool_deltas: + return DeltaMessage(tool_calls=tool_deltas) + elif not added_text and self.current_tool_id > 0: + # Return an empty DeltaMessage once the tool calls are all done + # so that finish_reason gets set. 
+ return DeltaMessage(content='') + else: + return None + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None + + +def _get_parameter_value(val: ast.expr) -> Any: + if isinstance(val, ast.Constant): + return val.value + elif isinstance(val, ast.Dict): + if not all(isinstance(k, ast.Constant) for k in val.keys): + raise _UnexpectedAstError( + "Dict tool call arguments must have literal keys") + return { + k.value: _get_parameter_value(v) # type: ignore + for k, v in zip(val.keys, val.values) + } + elif isinstance(val, ast.List): + return [_get_parameter_value(v) for v in val.elts] + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords: + arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall(type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments))) + + +def _make_valid_python(text: str) -> Union[Tuple[str, str], None]: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. 
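+            # Illustration (not part of the original patch): "[f(x=" ends
+            # with "=" and returns None here, since no parameter value can
+            # be invented; "[f(x='ab" instead completes to
+            # ("[f(x='ab')]", "')]") via the bracket-stack unwinding below.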
+ return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[:text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[:text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if bracket_stack and bracket_stack[-1] == "[" and not text.endswith( + "[") and not text.endswith(")"): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall, + index: int, + withheld_suffix: str) -> Union[DeltaToolCall, None]: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[:-len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall(id=new_call.id, + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + )) + + arg_diff = new_call_args[len(previously_sent_args):] + return DeltaToolCall( + id="", index=index, function=DeltaFunctionCall( + arguments=arg_diff)) if arg_diff else None diff --git a/vllm/envs.py b/vllm/envs.py index 154246c69f165..f320e35971f94 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,6 +72,7 @@ VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False + VLLM_ENABLE_V1_MULTIPROCESSING: bool = False def get_default_cache_root(): @@ -473,6 +474,10 @@ def get_default_config_root(): # If set, use the V1 code path. "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + + # If set, enable multiprocessing in LLM for the V1 code path. 
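+    # For example, running with VLLM_USE_V1=1 VLLM_ENABLE_V1_MULTIPROCESSING=1
+    # opts an LLM instance into the V1 engine with multiprocessing
+    # (both flags default to 0).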
+ "VLLM_ENABLE_V1_MULTIPROCESSING": + lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), } # end-env-vars-definition diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 68ac50a2c5a16..54fbd7a321a6f 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,9 +1,11 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, - to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import DummyData, InputContext, InputRegistry + SingletonInputs, SingletonInputsAdapter, SingletonPrompt, + TextPrompt, TokenInputs, TokensPrompt, + build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, + token_inputs, zip_enc_dec_prompts) +from .registry import (DummyData, InputContext, InputProcessingContext, + InputRegistry) INPUT_REGISTRY = InputRegistry() """ @@ -26,12 +28,14 @@ "EncoderDecoderInputs", "ProcessorInputs", "SingletonInputs", + "SingletonInputsAdapter", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", "INPUT_REGISTRY", "DummyData", "InputContext", + "InputProcessingContext", "InputRegistry", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 46b41f431bec7..07ff9faa50f13 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,10 +1,14 @@ +from dataclasses import dataclass +from functools import cached_property from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) -from typing_extensions import NotRequired, TypedDict, TypeVar +import torch +from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal.inputs import MultiModalInputsV2 class TextPrompt(TypedDict): @@ -36,13 +40,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - Optional multi-modal data to pass to the model, + DEPRECATED: Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - Optional multi-modal processor kwargs to be forwarded to the + DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. @@ -176,7 +180,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = TokenInputs +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -191,19 +195,91 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: TokenInputs + encoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the encoder portion.""" - decoder: TokenInputs + decoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the decoder portion.""" -SingletonInputs = TokenInputs +SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. """ + +@dataclass +class SingletonInputsAdapter: + """ + Unified interface to access the components of :class:`SingletonInputs`. 
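A short sketch of the intended use (illustrative values):

    from vllm.inputs import SingletonInputsAdapter, token_inputs

    adapter = SingletonInputsAdapter(token_inputs(prompt_token_ids=[1, 2, 3]))
    adapter.prompt_token_ids   # [1, 2, 3]
    adapter.multi_modal_data   # {} for "token" inputs without mm data
    adapter.prompt_embeds      # None for both input types today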
+ """ + inputs: SingletonInputs + + @cached_property + def prompt(self) -> Optional[str]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt") + + assert_never(inputs) + + @cached_property + def prompt_token_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt_token_ids", []) + + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return None + + assert_never(inputs) + + @cached_property + def multi_modal_data(self) -> "MultiModalDataDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_placeholders", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_placeholders", {}) + + assert_never(inputs) + + @cached_property + def mm_processor_kwargs(self) -> Dict[str, Any]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + if inputs["type"] == "multimodal": + return {} + + assert_never(inputs) + + ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ The inputs to :data:`vllm.inputs.InputProcessor`. @@ -234,10 +310,11 @@ def zip_enc_dec_prompts( ) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs - may also be provided; if a dict is passed, the same dictionary will be - used for every encoder/decoder prompt. If an iterable is provided, it will - be zipped with the encoder/decoder prompts. + :class:`ExplicitEncoderDecoderPrompt` instances. + + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. 
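For instance (a sketch; "num_crops" is only an example kwarg, not something this patch introduces):

    from vllm.inputs import zip_enc_dec_prompts

    pairs = zip_enc_dec_prompts(
        ["enc 1", "enc 2"],
        ["dec 1", "dec 2"],
        mm_processor_kwargs={"num_crops": 4},  # same dict reused per pair
    )
    # pairs[0] == {"encoder_prompt": "enc 1", "decoder_prompt": "dec 1",
    #              "mm_processor_kwargs": {"num_crops": 4}}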
""" if mm_processor_kwargs is None: mm_processor_kwargs = cast(Dict[str, Any], {}) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 509b0448b9e51..fdf28615fda10 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,11 +1,13 @@ import asyncio -from typing import List, Optional +from typing import List, Mapping, Optional, Union from typing_extensions import assert_never from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once @@ -23,11 +25,13 @@ def __init__( self, model_config: ModelConfig, tokenizer: Optional[BaseTokenizerGroup], + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer + self.mm_registry = mm_registry def get_tokenizer_group(self) -> BaseTokenizerGroup: if self.tokenizer is None: @@ -198,14 +202,79 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) + def _can_process_multimodal(self) -> bool: + model_config = self.model_config + + if not model_config.is_multimodal_model: + raise ValueError("Your model does not support multi-modal inputs") + + # Interim measure so we can handle models that have yet to be + # updated to use the new multi-modal processor + can_process_multimodal = self.mm_registry.has_processor(model_config) + if not can_process_multimodal: + logger.info( + "Your model uses the legacy input pipeline instead of the new " + "multi-modal processor. Please note that the legacy pipeline " + "will be removed in a future release. For more details, see: " + "https://github.com/vllm-project/vllm/issues/10114") + + return can_process_multimodal + + def _process_multimodal( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """ + Apply the model's multi-modal processor to a multi-modal prompt, + returning the corresponding token IDs and metadata. 
+ """ + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + async def _process_multimodal_async( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """Async version of :meth:`_process_multimodal`.""" + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request + ) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + if isinstance(prompt, list): + logger.warning("Passing `multi_modal_data` in TokensPrompt is" + "deprecated and will be removed in a future update") + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, ) -> SingletonInputs: - ''' - Extract the components of any single encoder or decoder input prompt. + """ + Extract the singleton inputs from a prompt. Arguments: @@ -215,12 +284,8 @@ def _prompt_to_llm_inputs( Returns: - * prompt - * prompt_token_ids - * multi_modal_data - * mm_processor_kwargs (request-level input processor/mapper overrides) - ''' - + * :class:`SingletonInputs` instance + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": @@ -243,6 +308,14 @@ def _prompt_to_llm_inputs( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -253,13 +326,22 @@ def _prompt_to_llm_inputs( text_content = parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -299,6 +381,14 @@ async def _prompt_to_llm_inputs_async( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -309,13 +399,22 @@ async def _prompt_to_llm_inputs_async( text_content = 
parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -331,7 +430,8 @@ def _build_enc_dec_llm_inputs( encoder_inputs: SingletonInputs, decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - if encoder_inputs["type"] == "token": + if (encoder_inputs["type"] == "token" + or encoder_inputs["type"] == "multimodal"): pass else: assert_never(encoder_inputs) @@ -340,7 +440,8 @@ def _build_enc_dec_llm_inputs( dec_token_ids = self._prepare_decoder_input_ids_for_generation( None) decoder_inputs = token_inputs(dec_token_ids) - elif decoder_inputs["type"] == "token": + elif (decoder_inputs["type"] == "token" + or decoder_inputs["type"] == "multimodal"): dec_token_ids = self._prepare_decoder_input_ids_for_generation( decoder_inputs["prompt_token_ids"]) decoder_inputs["prompt_token_ids"] = dec_token_ids @@ -361,7 +462,7 @@ def _process_encoder_decoder_prompt( prompt: PromptType, request_id: str, ) -> EncoderDecoderInputs: - ''' + """ For encoder/decoder models only: Process an input prompt into an :class:`EncoderDecoderInputs` instance. @@ -391,8 +492,7 @@ def _process_encoder_decoder_prompt( Returns: * :class:`EncoderDecoderInputs` instance - ''' - + """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -460,7 +560,8 @@ def _build_decoder_only_llm_inputs( prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - if prompt_inputs["type"] == "token": + if (prompt_inputs["type"] == "token" + or prompt_inputs["type"] == "multimodal"): prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( prompt_inputs["prompt_token_ids"], prompt_adapter_request=prompt_adapter_request, @@ -477,7 +578,7 @@ def _process_decoder_only_prompt( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: - ''' + """ For decoder-only models: Process an input prompt into an :class:`DecoderOnlyInputs` instance. 
@@ -491,7 +592,7 @@ def _process_decoder_only_prompt( Returns: * :class:`DecoderOnlyInputs` instance - ''' + """ prompt_comps = self._prompt_to_llm_inputs( prompt, diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 7d7a797be4f60..68b4756331e6d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -5,14 +5,17 @@ Optional, Protocol, Type, cast) from torch import nn -from transformers import PretrainedConfig -from typing_extensions import TypeVar +from transformers import PretrainedConfig, ProcessorMixin +from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger +from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import ProcessorInputs +from .data import ProcessorInputs, SingletonInputs +from .parse import is_encoder_decoder_inputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -61,6 +64,19 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: return self.model_config.hf_image_processor_config +@dataclass(frozen=True) +class InputProcessingContext(InputContext): + tokenizer: AnyTokenizer + """The tokenizer used to tokenize the inputs.""" + + def get_hf_processor(self) -> ProcessorMixin: + return cached_get_processor( + self.model_config.tokenizer, + tokenizer=self.tokenizer, # Override the tokenizer with ours + trust_remote_code=self.model_config.trust_remote_code, + ) + + N = TypeVar("N", bound=Type[nn.Module]) @@ -94,7 +110,7 @@ def __call__( ... -class _MultiModalCounts(UserDict): +class _MultiModalCounts(UserDict[str, int]): """ Wraps `mm_counts` for a more informative error message when attempting to access a plugin that does not exist. 
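[Review note] To make the new code path concrete, here is a minimal usage sketch, not part of the patch, of how the preprocess.py changes above are exercised. The model name is a placeholder; any model whose architecture has a V2 processor registered with the multi-modal registry takes the `_process_multimodal()` branch instead of `token_inputs()` plus the legacy input mapper.

```python
# Illustrative sketch only; assumes a model with a registered V2
# multi-modal processor (the model name below is a placeholder).
from PIL import Image

from vllm import LLM

image = Image.new("RGB", (336, 336))  # stand-in for a real image
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# Because the prompt carries multi_modal_data, InputPreprocessor now calls
# mm_registry.create_processor(...).apply(...) and returns MultiModalInputsV2
# instead of deferring multi-modal handling to the legacy input mapper.
outputs = llm.generate({
    "prompt": "USER: <image>\nDescribe the image. ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```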
@@ -287,6 +303,21 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): return self._input_processors_by_model_type \ .get(model_cls, self._default_input_processor) + def _ensure_mm_kwargs( + self, + inputs: SingletonInputs, + mm_processor_kwargs: Dict[str, Any], + ): + if inputs["type"] == "token": + # In case the input processor for that model fails to set it + if "mm_processor_kwargs" not in inputs: + inputs["mm_processor_kwargs"] = mm_processor_kwargs + elif inputs["type"] == "multimodal": + # Be more strict in V2 + assert "mm_kwargs" in inputs + else: + assert_never(inputs["type"]) + def process_input(self, model_config: "ModelConfig", inputs: ProcessorInputs) -> ProcessorInputs: """ @@ -312,8 +343,21 @@ def process_input(self, model_config: "ModelConfig", processor, ) - return processor(InputContext(model_config), inputs, - **mm_processor_kwargs) + processed_inputs = processor( + InputContext(model_config), + inputs, + **mm_processor_kwargs, + ) + + if is_encoder_decoder_inputs(processed_inputs): + self._ensure_mm_kwargs(processed_inputs["encoder"], + mm_processor_kwargs) + self._ensure_mm_kwargs(processed_inputs["decoder"], + mm_processor_kwargs) + else: + self._ensure_mm_kwargs(processed_inputs, mm_processor_kwargs) + + return processed_inputs def create_input_processor(self, model_config: "ModelConfig"): """ diff --git a/vllm/logger.py b/vllm/logger.py index 80b9fcc59272d..9e16e591315ba 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -117,13 +117,14 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None): last_lineno = 0 last_func_name = "" with open(log_path, 'a') as f: + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") if event == 'call': - f.write(f"{datetime.datetime.now()} Call to" + f.write(f"{ts} Call to" f" {func_name} in {filename}:{lineno}" f" from {last_func_name} in {last_filename}:" f"{last_lineno}\n") else: - f.write(f"{datetime.datetime.now()} Return from" + f.write(f"{ts} Return from" f" {func_name} in {filename}:{lineno}" f" to {last_func_name} in {last_filename}:" f"{last_lineno}\n") diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a7887a048746a..04fc635828d4d 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -70,6 +70,16 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, add_input=True) # now have column partitioned output + + if self.bias_stacked is not None: + # Gather into a local tensor so the stacked bias weights are + # not clobbered across calls, and zero the rows of tokens + # without a LoRA (index -1), matching the other bias paths. + bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) + bias = bias[self.punica_wrapper.token_lora_indices] + bias[self.punica_wrapper.token_lora_indices == -1] = 0 + output += bias + output = output.view(*out_orig_shape) return output @@ -121,6 +129,15 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] + + if layer.bias_stacked is not None: + bias = layer.bias_stacked[idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[layer.punica_wrapper.token_lora_indices] + bias[layer.punica_wrapper.token_lora_indices == -1] = 0 + output[:, left_offset:left_offset + shard_size] += bias + layer.punica_wrapper.add_expand_slice( output, buffers[idx], @@ -295,6 +312,15 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + shard_size = self.bias_stacked.shape[2] + start_idx = self.tp_rank * shard_size +
end_idx = (self.tp_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) @@ -318,6 +344,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size + + if self.bias_stacked is not None: + bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) + bias = bias[self.punica_wrapper.token_lora_indices] + bias[self.punica_wrapper.token_lora_indices == -1] = 0 + output += bias + self.punica_wrapper.add_expand_slice(output, buffer, self.lora_b_stacked, start_idx, shard_size) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6254c67596e65..7429c60e0222d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,6 +67,63 @@ def dec(*args, **kwargs): return dec +def apply_bias( + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, +): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + +def apply_bias_packed_nslice( + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[Optional[torch.Tensor], ...], +): + """Applies bias to output + + Input shapes: + bias_stacked: n element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, sum(output_slices)) + output_slices: n element tuple of slice sizes, + where n is the number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + + @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -105,6 +162,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): """Overwrites lora tensors at index.""" ...
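[Review note] A standalone sketch, with toy shapes, of the gather-and-mask scheme that `apply_bias()` above implements: each token row selects its adapter's bias through `token_lora_indices`, and index -1 (no LoRA) must contribute zero. Advanced indexing returns a copy, so zeroing the gathered rows never touches the stacked bias weights; index -1 wraps around to the last LoRA's row, which is exactly why the mask is required.

```python
import torch

# Toy shapes: 2 LoRAs with output_dim 2, batch of 3 tokens.
bias_stacked = torch.tensor([[1., 1.],   # bias of LoRA 0
                             [2., 2.]])  # bias of LoRA 1
indices = torch.tensor([0, -1, 1])       # per-token LoRA ids; -1 = no LoRA
output = torch.zeros(3, 2)               # base-layer output (zeros for clarity)

gathered = bias_stacked[indices]         # advanced indexing returns a copy
gathered[indices == -1] = 0              # -1 wrapped to the last row; mask it
output += gathered
print(output)  # tensor([[1., 1.], [0., 0.], [2., 2.]])
```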
@@ -203,6 +261,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( @@ -299,10 +358,22 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def set_lora( self, @@ -310,6 +381,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) @@ -319,10 +391,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -401,11 +484,25 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + self.output_dim = self.lora_b_stacked.shape[2] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -418,18 +515,30 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -437,10 +546,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + 
self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -534,6 +654,17 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size // 2, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(n_slices)) + else: + self.bias_stacked = None self.output_dim = self.lora_b_stacked[0].shape[2] @@ -542,6 +673,9 @@ def reset_lora(self, index: int): self.lora_a_stacked[1][index] = 0 self.lora_b_stacked[0][index] = 0 self.lora_b_stacked[1][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -562,18 +696,32 @@ def slice_lora_b( ] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + if bias[0] is None or bias[1] is None: + return bias + shard_size = self.output_dim + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -582,6 +730,10 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) + if bias is not None and bias[0] is not None: + self.bias_stacked[0][index, + 0, :bias[0].shape[0]].copy_(bias[0].T, + non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -589,10 +741,22 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) + if bias is not None and bias[1] is not None: + self.bias_stacked[1][index, + 0, :bias[1].shape[0]].copy_(bias[1].T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + output, + (self.output_dim, self.output_dim), + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -654,17 +818,35 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + bias_q = bias[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + bias_k = bias[k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + bias_v = bias[v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + bias 
= torch.cat([bias_q, bias_k, bias_v], dim=0) + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -672,6 +854,10 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -768,6 +954,32 @@ def create_lora_weights( device=self.device, ), ) + if lora_config.bias_enabled: + self.bias_stacked = ( + torch.zeros( + max_loras, + 1, + self.q_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + ) + else: + self.bias_stacked = None self.output_slices = ( self.q_proj_shard_size, @@ -787,6 +999,10 @@ def reset_lora(self, index: int): self.lora_b_stacked[1][index] = 0 self.lora_a_stacked[2][index] = 0 self.lora_b_stacked[2][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 + self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -812,18 +1028,40 @@ def slice_lora_b( lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + bias_q, bias_k, bias_v = bias + if bias_q is not None: + bias_q = bias_q[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + if bias_k is not None: + bias_k = bias_k[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + if bias_v is not None: + bias_v = bias_v[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + bias = [bias_q, bias_k, bias_v] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -854,9 +1092,28 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) + if bias is not None: + if bias[0] is not None: + self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( + bias[0].T, non_blocking=True) + if bias[1] is not None: + self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( + bias[1].T, non_blocking=True) + if bias[2] is not None: + self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( + bias[2].T, non_blocking=True) + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, +
output, + self.output_slices, + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, @@ -919,9 +1176,27 @@ def create_lora_weights( device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + ( + max_loras, + 1, + self.output_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + # Lazily initialized + self.indices: torch.Tensor + self.indices_len: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -934,18 +1209,24 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.base_layer.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -953,9 +1234,20 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -1132,6 +1424,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, @@ -1199,7 +1492,7 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1], ] = lora_logits + lora_logits.shape[1]] = lora_logits # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, @@ -1276,6 +1569,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): ... 
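[Review note] End-to-end, the bias support added throughout layers.py above is opt-in. A minimal usage sketch under stated assumptions: the model name and adapter path are placeholders, and `enable_lora_bias` is assumed to be the engine argument that the `--enable-lora-bias` flag (referenced by the error message in models.py below) maps to elsewhere in this PR. Without it, `activate_adapter()` rejects adapters whose checkpoints contain `lora_B.bias` tensors.

```python
# Sketch only; the model and adapter path are placeholders, and the
# enable_lora_bias engine arg is assumed to be added elsewhere in this PR.
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    enable_lora_bias=True,  # without this, bias-carrying adapters are rejected
)
outputs = llm.generate(
    "Hello, my name is",
    SamplingParams(max_tokens=16),
    lora_request=LoRARequest("bias-adapter", 1, "/path/to/adapter"),
)
print(outputs[0].outputs[0].text)
```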
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 14081b5ba441c..b648312ba76ec 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -17,6 +17,7 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, + bias: Optional[torch.Tensor] = None, embeddings_tensor: Optional[torch.Tensor] = None, scaling: Optional[float] = None, ) -> None: @@ -25,6 +26,7 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b + self.bias = bias self.embeddings_tensor = embeddings_tensor if scaling is None: @@ -66,7 +68,8 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights": + embeddings_tensor_dim: Optional[int] = None, + bias_enabled: bool = False) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros([input_dim, rank], dtype=dtype, @@ -76,6 +79,14 @@ def create_dummy_lora_weights( dtype=dtype, device=device, pin_memory=pin_memory) + if bias_enabled: + bias = torch.zeros([output_dim], + dtype=dtype, + device=device, + pin_memory=pin_memory) + else: + bias = None + embeddings_tensor = torch.rand( 10, embeddings_tensor_dim, @@ -88,6 +99,7 @@ def create_dummy_lora_weights( lora_alpha=1, lora_a=lora_a, lora_b=lora_b, + bias=bias, embeddings_tensor=embeddings_tensor, ) @@ -102,6 +114,7 @@ def __init__( lora_alphas: List[Optional[int]], lora_a: List[Optional[torch.Tensor]], lora_b: List[Optional[torch.Tensor]], + bias: Optional[List[Optional[torch.Tensor]]] = None, scaling: Optional[List[float]] = None, ) -> None: super().__init__( @@ -110,6 +123,7 @@ def __init__( lora_alpha=0, lora_a=lora_a, lora_b=lora_b, + bias=bias, scaling=scaling, # type: ignore embeddings_tensor=None, ) @@ -141,6 +155,7 @@ def pack( [lora.lora_alpha if lora is not None else None for lora in loras], [lora.lora_a if lora is not None else None for lora in loras], [lora.lora_b if lora is not None else None for lora in loras], + [lora.bias if lora is not None else None for lora in loras], scaling=[ 1 if lora is not None else None # type: ignore for lora in loras diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 81e274612b73b..2ffefe61427e3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type import safetensors.torch import torch @@ -119,7 +119,8 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): - module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) + module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( + tensor_name) if module_name not in loras: lora_embeddings_tensor = None if embeddings: @@ -136,8 +137,14 @@ def from_lora_tensors( lora_embeddings_tensor.pin_memory()) loras[module_name] = LoRALayerWeights(module_name, rank, lora_alpha, None, None, + None, lora_embeddings_tensor) - if is_lora_a: + if is_bias: + bias = tensor.to(device=device, dtype=dtype).t() + if pin_memory: + bias = bias.pin_memory() + loras[module_name].bias = bias + elif is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() if pin_memory: @@ -215,7 +224,7 @@ def
from_local_checkpoint( with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name(lora_module) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) @@ -301,6 +310,7 @@ def __init__( self, model: nn.Module, max_num_seqs: int, max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. @@ -314,6 +324,7 @@ def __init__( lora_config: the LoRA configuration. """ self.lora_config = lora_config + self.device = device self.max_num_seqs = max_num_seqs assert self.capacity >= self.lora_slots self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 @@ -322,7 +333,7 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, - device="cuda") + device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. # Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -384,8 +395,19 @@ def activate_adapter( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() + # Reject adapters with bias tensors unless --enable-lora-bias is set. + bias = module_lora.bias + if ((torch.is_tensor(bias) or + (isinstance(bias, Sequence) and any(b is not None + for b in bias))) + and not self.lora_config.bias_enabled): + module_lora.bias = None + raise ValueError( + f"Adapter bias cannot be used for {module_name}" + " without --enable-lora-bias.") module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module_lora.embeddings_tensor, + module_lora.bias) else: module.reset_lora(index) return True @@ -507,6 +529,7 @@ def create_dummy_lora( """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): + bias_enabled = self.lora_config.bias_enabled if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) @@ -534,7 +557,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -543,6 +567,7 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() else: @@ -557,6 +582,7 @@ def create_dummy_lora( rank, module.lora_a_stacked[i].dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() subloras.append(lora) @@ -653,16 +679,11 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], class LRUCacheLoRAModelManager(LoRAModelManager): """A model manager that manages multiple LoRAs with LRU cache.""" - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - ): + def __init__(self, model: nn.Module, max_num_seqs: int, + max_num_batched_tokens: int, vocab_size: int, + lora_config: LoRAConfig, device: torch.device): super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config) + vocab_size, lora_config, device) self._registered_adapters:
LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter) self._active_adapters: LoRALRUCache = LoRALRUCache( @@ -732,6 +753,7 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" @@ -743,5 +765,6 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, + device=device, **kwargs) return lora_manager diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index adb3ab5b46b87..4910cb4061298 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index efa234520ab87..844f5cec39e93 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c003f3dc0ce9e..b4d893047b06b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5033ce4126929..082041f390750 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -62,6 +62,7 @@ def convert_mapping( max_loras: int, vocab_size: int, extra_vocab_size: int, + device: torch.device, long_lora_context: Optional["LongContextLoRAContext"] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], List[int]]: @@ -104,7 +105,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=device, dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -131,10 +132,10 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) + dtype=torch.long, + device=device) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, indices[2] * (vocab_size + extra_vocab_size), @@ -145,7 +146,7 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None @@ -183,7 +184,7 @@ class PunicaWrapper: """ def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: str): + device: Union[torch.device, str]): 
self._token_lora_indices = torch.empty(max_num_batched_tokens, dtype=torch.long, device=device) @@ -215,6 +216,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self._lora_indices_per_batch = torch.empty(max_batches, dtype=torch.long, device=device) + self.device: torch.device = device self.max_length: int = 0 self.token_nums: int = 0 self.batch_size: int = -1 @@ -263,6 +265,7 @@ def _update_base_metadata( max_loras, vocab_size, extra_vocab_size, + self.device, long_lora_context, ) self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index a780429f413d3..5876494ce2824 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -91,7 +91,7 @@ def replace_submodule(model: nn.Module, module_name: str, return new_module -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: +def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -101,15 +101,18 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: - Tuple(module_name, is_lora_a): + Tuple(module_name, is_lora_a, is_bias): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. + is_bias whether the tensor is lora bias. """ parts = name.split(".") + if parts[-1] == "weight" and (parts[-2] == "lora_A" + or parts[-2] == "lora_B"): + return ".".join(parts[2:-2]), parts[-2] == "lora_A", False - if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model": - if parts[-1] == "weight": - if parts[-2] == "lora_A" or parts[-2] == "lora_B": - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False + + if parts[-1] == "bias": + return ".".join(parts[2:-2]), False, True raise ValueError(f"{name} is unsupported LoRA weight") diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 724c308a07a27..93a5e27621912 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -73,6 +73,7 @@ def create_lora_manager( max_num_batched_tokens=self.max_num_batched_tokens, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, lora_manager_cls=self._manager_cls, ) self._adapter_manager = lora_manager @@ -176,6 +177,7 @@ def create_lora_manager( max_num_seqs=self.max_num_seqs, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, max_num_batched_tokens=self.max_num_batched_tokens, ) self._adapter_manager = lora_manager diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 68a66095f2daf..69a5ecdd6c6fb 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -4,6 +4,7 @@ import dataclasses import fnmatch import glob +import inspect import json import math import os @@ -41,7 +42,8 @@ runai_safetensors_weights_iterator, safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.transformers_utils.s3_utils import is_s3, glob as s3_glob +from vllm.transformers_utils.s3_utils import glob as s3_glob +from vllm.transformers_utils.s3_utils import is_s3 from vllm.utils import is_pin_memory_available @@ -89,11 +91,23 @@ def device_loading_context(module:
torch.nn.Module, logger = init_logger(__name__) -def _initialize_model(vllm_config: VllmConfig) -> nn.Module: +def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) - return model_class(vllm_config=vllm_config) + signatures = inspect.signature(model_class.__init__) + # collect all kw-only parameters + kw_only_params = [ + param.name for param in signatures.parameters.values() + if param.kind == inspect.Parameter.KEYWORD_ONLY + ] + assert "vllm_config" in kw_only_params and "prefix" in kw_only_params, \ + ("vLLM model class must accept `vllm_config` and `prefix` as kw-only " + "arguments. You may have an out-of-tree model class registered in the " + "old style that is being used with a newer vLLM version. Please check " + "https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "for the design and update the model class accordingly.") + return model_class(vllm_config=vllm_config, prefix=prefix) class BaseModelLoader(ABC): @@ -992,7 +1006,13 @@ def _load_weights(self, model_config: ModelConfig, param_dict = dict(model.named_parameters()) stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + # TODO: Change this lazy import to normal import + # after the checks are updated to run on a new version + from vllm.model_executor.models.utils import is_pp_missing_parameter for quant_param_name in quant_state_dict: + if is_pp_missing_parameter(quant_param_name, model): + continue + non_stacked_param_name = quant_param_name shard_index = 0 @@ -1177,11 +1197,11 @@ def _prepare_weights(self, model_name_or_path: str, )) if is_s3_path: - hf_weights_files = s3_glob(path=hf_folder, - allow_pattern=[safetensors_pattern]) + hf_weights_files = s3_glob(path=hf_folder, + allow_pattern=[safetensors_pattern]) else: hf_weights_files = glob.glob( - os.path.join(hf_folder, safetensors_pattern)) + os.path.join(hf_folder, safetensors_pattern)) if not is_local and not is_s3_path: download_safetensors_index_file_from_hf( diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 997554f7dcccd..9ee2a2cc09a24 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -34,7 +34,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -364,14 +365,13 @@ def forward( @support_torch_compile class ArcticModel(nn.Module): - def __init__( - self, - config: ArcticConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -415,16 +415,13 @@ def forward( class ArcticForCausalLM(nn.Module, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config
self.config = config - self.model = ArcticModel(config, - cache_config, - quant_config, - prefix=prefix) + self.model = ArcticModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.vocab_size, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8e1dab71b1f39..aabbd31192a40 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -253,13 +253,18 @@ def forward( @support_torch_compile class BaiChuanModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - position_embedding: str, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + position_embedding: str = "ROPE", + ) -> None: super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -332,21 +337,22 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, + *, vllm_config: VllmConfig, prefix: str = "", position_embedding: str = "ROPE", ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = BaiChuanModel(config, position_embedding, cache_config, - quant_config) + self.model = BaiChuanModel(vllm_config=vllm_config, + prefix=prefix, + position_embedding=position_embedding) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -438,16 +444,16 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has a lower case 'c'. """ - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if config.hidden_size == 4096: # baichuan2 7b - super().__init__(vllm_config, prefix, "ROPE") + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") else: # baichuan 13b, baichuan2 13b - super().__init__(vllm_config, prefix, "ALIBI") + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ALIBI") class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): @@ -455,9 +461,7 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has an upper case 'C'. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config, prefix, "ROPE") + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index c6da6a590cf5a..a50a5a5b018e1 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -41,6 +41,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .utils import maybe_prefix + logger = logging.get_logger(__name__) @@ -739,13 +741,14 @@ class BartModel(nn.Module): "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" ] - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id @@ -810,20 +813,16 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class BartForConditionalGeneration(nn.Module): base_model_prefix = "model" - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config # currently all existing BART models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.config = config - self.model = BartModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = BartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2b0f45c5603f5..7dbc7fa0aaba4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,7 +5,6 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.backends.xformers import XFormersImpl from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -21,6 +20,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .utils import maybe_prefix + class BertEmbedding(nn.Module): @@ -216,11 +217,6 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn") - if not isinstance(self.attn.impl, XFormersImpl): - raise ValueError( - "Encoder-only models currently require XFORMERS attention " - "backend. 
Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.") - def forward( self, hidden_states: torch.Tensor, @@ -309,12 +305,13 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embeddings = BertEmbedding(config) self.encoder = BertEncoder(config, cache_config, @@ -382,17 +379,11 @@ class BertEmbeddingModel(nn.Module): _pooler: An instance of Pooler used for pooling operations. """ - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config pooler_config = vllm_config.model_config.pooler_config - self.model = BertModel(config, cache_config, quant_config) + self.model = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.CLS, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index cdc30eda2ab3c..03dc1d15ab697 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -23,7 +23,7 @@ get_max_blip_image_tokens) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo @@ -483,11 +483,7 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -517,7 +513,7 @@ def __init__( self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 7540bc23efd88..84adf574af5e2 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -42,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -221,14 +222,13 @@ def forward( @support_torch_compile class BloomModel(nn.Module): - def __init__( - self, - config: BloomConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 7540bc23efd88..84adf574af5e2 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -42,7 +42,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -221,14 +222,13 @@ def forward(
 @support_torch_compile
 class BloomModel(nn.Module):

-    def __init__(
-        self,
-        config: BloomConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.embed_dim = config.hidden_size

         # Embedding + LN Embedding
@@ -281,18 +281,15 @@ def forward(
 class BloomForCausalLM(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = BloomModel(config, cache_config, quant_config)
+        self.transformer = BloomModel(vllm_config=vllm_config,
+                                      prefix=maybe_prefix(
+                                          prefix, "transformer"))
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.word_embeddings
         else:
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index f79bad6190708..7b59c818e0b60 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -37,7 +37,8 @@

 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)

 # These configs are not part of the model config but the preprocessor
 # and processor files, so we hardcode them in the model file for now.
@@ -831,14 +832,13 @@ def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor:
 class ChameleonModel(nn.Module):

-    def __init__(
-        self,
-        config: ChameleonConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -924,19 +924,14 @@ def forward(
 class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multimodal_config = multimodal_config
-        self.model = ChameleonModel(config, cache_config, quant_config)
+        self.model = ChameleonModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
         self.unpadded_vocab_size = config.vocab_size
         self.lm_head = ParallelLMHead(
             self.unpadded_vocab_size,
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index c14f2fcb15063..70e9b607b0642 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -30,8 +30,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
-from vllm.multimodal.base import MultiModalData
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
@@ -39,7 +39,8 @@

 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)

 logger = init_logger(__name__)

@@ -481,14 +482,13 @@ def forward(
 class ChatGLMModel(nn.Module):

-    def __init__(
-        self,
-        config: ChatGLMConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()

+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config

         self.embedding = VocabParallelEmbedding(config.padded_vocab_size,
@@ -593,14 +593,9 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP,
     embedding_modules = {}
     embedding_padding_modules = []

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         multimodal_config = vllm_config.model_config.multimodal_config
@@ -611,7 +606,9 @@ def __init__(
         self.quant_config = quant_config
         self.max_position_embeddings = getattr(config, "max_sequence_length",
                                                8192)
-        self.transformer = ChatGLMModel(config, cache_config, quant_config)
+        self.transformer = ChatGLMModel(vllm_config=vllm_config,
+                                        prefix=maybe_prefix(
+                                            prefix, "transformer"))
         if self.config.tie_word_embeddings:
             self.transformer.output_layer.weight = (
                 self.transformer.embedding.weight)
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index e921fa50b099e..cd5c1d6844716 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -28,7 +28,7 @@

 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -49,7 +49,8 @@

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 @torch.compile
@@ -253,15 +254,14 @@ def forward(
 @support_torch_compile
 class CohereModel(nn.Module):

-    def __init__(
-        self,
-        config: CohereConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
@@ -332,14 +332,9 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {"embed_tokens": "input_embeddings"}
     embedding_padding_modules = []

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         self.config = config
@@ -353,10 +348,8 @@ def __init__(
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size,
                                                 scale=config.logit_scale)
-        self.model = CohereModel(config,
-                                 cache_config,
-                                 quant_config,
-                                 lora_config=lora_config)
+        self.model = CohereModel(vllm_config=vllm_config,
+                                 prefix=maybe_prefix(prefix, "model"))
         self.sampler = get_sampler()
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index e3b3164cacde3..fff8710f6b475 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -25,7 +25,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class DbrxRouter(nn.Module):
@@ -294,14 +295,13 @@ def forward(
 class DbrxModel(nn.Module):

-    def __init__(
-        self,
-        config: DbrxConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.wte = VocabParallelEmbedding(
             config.vocab_size,
             config.d_model,
@@ -350,14 +350,9 @@ def forward(
 class DbrxForCausalLM(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         if config.tie_word_embeddings:
@@ -365,7 +360,9 @@ def __init__(
                 "tie_word_embeddings is not supported for Dbrx models.")
         self.quant_config = quant_config
         self.unpadded_vocab_size = config.vocab_size
-        self.transformer = DbrxModel(config, cache_config, quant_config)
+        self.transformer = DbrxModel(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(
+                                         prefix, "transformer"))
         self.lm_head = ParallelLMHead(
             config.vocab_size,
             config.d_model,
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config config.num_key_value_heads = max(config.num_key_value_heads_per_layer) delattr(config, "num_key_value_heads_per_layer") diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c90d3d250e4c5..a9bf1440c4d60 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -50,7 +50,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekMLP(nn.Module): @@ -326,14 +327,13 @@ class DeepseekModel(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -383,18 +383,14 @@ def forward( class DeepseekForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = DeepseekModel(config, cache_config, quant_config) + self.model = DeepseekModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0f391d8329a8e..4fb1eed15a2e7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -51,7 +51,8 @@ from .interfaces import SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekV2MLP(nn.Module): @@ -408,14 +409,13 @@ class DeepseekV2Model(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -479,21 +479,14 @@ def forward( class DeepseekV2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config 
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 0f391d8329a8e..4fb1eed15a2e7 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -51,7 +51,8 @@

 from .interfaces import SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class DeepseekV2MLP(nn.Module):
@@ -408,14 +409,13 @@ class DeepseekV2Model(nn.Module):

     fall_back_to_pt_during_load = False

-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size

@@ -479,21 +479,14 @@ def forward(
 class DeepseekV2ForCausalLM(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = DeepseekV2Model(config,
-                                     cache_config,
-                                     quant_config,
-                                     prefix="model")
+        self.model = DeepseekV2Model(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config)
diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py
index 6bd73d20d340d..85c51e8404584 100644
--- a/vllm/model_executor/models/eagle.py
+++ b/vllm/model_executor/models/eagle.py
@@ -14,6 +14,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

+from .utils import maybe_prefix
+

 class EAGLE(nn.Module):
     """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077
@@ -34,7 +36,7 @@ class EAGLE(nn.Module):
     in the draft checkpoint (using key token_map). Also, the draft config
     needs to have truncated_vocab_size (=k) as an attribute."""

-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         self.config = config
@@ -42,7 +44,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         architectures = getattr(self.config.model, "architectures", [])
         model_cls, _ = ModelRegistry.resolve_model_cls(architectures)

-        self.model = model_cls(vllm_config, prefix)
+        self.model = model_cls(vllm_config=vllm_config,
+                               prefix=maybe_prefix(prefix, "model"))
         self.fc = nn.Linear(config.model.hidden_size * 2,
                             config.model.hidden_size,
                             bias=getattr(self.config, "eagle_fc_bias", False))
diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py
index fa6dbfe35b3ad..cd3e7da657e0e 100644
--- a/vllm/model_executor/models/exaone.py
+++ b/vllm/model_executor/models/exaone.py
@@ -29,7 +29,7 @@

 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -54,7 +54,8 @@

 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class ExaoneGatedMLP(nn.Module):
@@ -314,15 +315,14 @@ def forward(
 @support_torch_compile
 class ExaoneModel(nn.Module):

-    def __init__(
-        self,
-        config: ExaoneConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
@@ -438,14 +438,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "c_fc_1": ("gate_up_proj", 1),
     }

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config

@@ -453,11 +448,8 @@ def __init__(
         self.lora_config = lora_config

         self.transformer = ExaoneModel(
-            config,
-            cache_config,
-            quant_config,
-            lora_config=lora_config,
-            prefix="model",
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "model"),
         )
         if get_pp_group().is_last_rank:
             self.unpadded_vocab_size = config.vocab_size
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 96ae119042277..dcfcb6694feb5 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -48,7 +48,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)

 FalconConfig = Union[HF_FalconConfig, RWConfig]

@@ -332,14 +333,13 @@ def forward(
 @support_torch_compile
 class FalconModel(nn.Module):

-    def __init__(
-        self,
-        config: FalconConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -401,18 +401,15 @@ class FalconForCausalLM(nn.Module, SupportsPP):
         ".dense_4h_to_h.",
     ]

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = FalconModel(config, cache_config, quant_config)
+        self.transformer = FalconModel(vllm_config=vllm_config,
+                                       prefix=maybe_prefix(
+                                           prefix, "transformer"))
         # only Falcon-11B doesn't share lm_head weight with word embeddings
         # and previous Falcon model doesn't have tie_word_embeddings config
         # so we set tie_word_embeddings to True by default
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
index b0d970d9fb572..971a71180164b 100644
--- a/vllm/model_executor/models/florence2.py
+++ b/vllm/model_executor/models/florence2.py
@@ -3,13 +3,10 @@

 import torch
 import torch.nn as nn
-from transformers import PretrainedConfig

 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
@@ -23,11 +20,13 @@

 class Florence2LanguageModel(nn.Module):

-    def __init__(self,
-                 config: PretrainedConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config

         self.padding_idx = config.pad_token_id
@@ -93,15 +92,14 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,

 class Florence2LanguageForConditionalGeneration(nn.Module):

-    def __init__(self,
-                 config: PretrainedConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+
         self.config = config
-        self.model = Florence2LanguageModel(config,
-                                            cache_config=cache_config,
-                                            quant_config=quant_config)
+        self.model = Florence2LanguageModel(vllm_config=vllm_config,
+                                            prefix=prefix)
         embed_scale = math.sqrt(
             config.d_model) if config.scale_embedding else 1.0

@@ -189,17 +187,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):

 class Florence2ForConditionalGeneration(nn.Module):

-    def __init__(self, vllm_config: VllmConfig, prefix: str = ""):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config

         # TODO(Isotr0py): Add vision backbone
         self.language_model = Florence2LanguageForConditionalGeneration(
-            config=config.text_config,
-            cache_config=cache_config,
-            quant_config=quant_config)
+            vllm_config=vllm_config.with_hf_config(config.text_config),
+            prefix=prefix,
+        )

     @property
     def sampler(self):
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 37f38d4d76671..50701793b7b83 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -32,8 +32,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.model_executor.models.persimmon import PersimmonForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import (cached_get_tokenizer,
                                    consecutive_placeholder_ranges)
@@ -226,7 +225,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu)
 class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):

-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
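Composite models like Florence-2 above no longer hand `config.text_config` to their backbone directly; they re-target the shared `VllmConfig` with `with_hf_config`, assumed here to return a copy whose `model_config.hf_config` is replaced by the given sub-config. A sketch of the pattern under that assumption:

    from vllm.config import VllmConfig
    from vllm.model_executor.models.florence2 import (
        Florence2LanguageForConditionalGeneration)


    def build_language_backbone(vllm_config: VllmConfig, prefix: str = ""):
        hf_config = vllm_config.model_config.hf_config
        # The backbone sees a VllmConfig whose hf_config is the text
        # sub-config, so the uniform constructor works one level down too.
        return Florence2LanguageForConditionalGeneration(
            vllm_config=vllm_config.with_hf_config(hf_config.text_config),
            prefix=prefix,
        )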
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 4e0cbfb9cbf58..55baba809e58f 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -258,14 +258,13 @@ def forward(
 @support_torch_compile
 class GemmaModel(nn.Module):

-    def __init__(
-        self,
-        config: GemmaConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config

         self.embed_tokens = VocabParallelEmbedding(
@@ -372,14 +371,9 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config

@@ -389,9 +383,7 @@ def __init__(
         self.lora_config = lora_config

         self.quant_config = quant_config
-        self.model = GemmaModel(config,
-                                cache_config,
-                                quant_config,
+        self.model = GemmaModel(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.model = Gemma2Model(vllm_config, prefix) + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( vllm_config.model_config.pooler_config, pooling_type=PoolingType.LAST, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index c3fc47db79986..cc85693f99526 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -42,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPT2Attention(nn.Module): @@ -184,14 +185,13 @@ def forward( @support_torch_compile class GPT2Model(nn.Module): - def __init__( - self, - config: GPT2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx @@ -216,9 +216,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -240,21 +242,15 @@ def forward( class GPT2LMHeadModel(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, - cache_config, - quant_config, - prefix="transformer") + self.transformer = GPT2Model(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: @@ -265,6 +261,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -272,9 +271,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ea1614d966365..ab25c66c3a887 100644 --- 
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index ea1614d966365..ab25c66c3a887 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -25,7 +25,7 @@

 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -189,15 +189,14 @@ def forward(
 @support_torch_compile
 class GPTBigCodeModel(nn.Module):

-    def __init__(
-        self,
-        config: GPTBigCodeConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         assert not config.add_cross_attention

@@ -258,14 +257,9 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):

     embedding_padding_modules = []

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config

@@ -273,8 +267,8 @@ def __init__(
         self.lora_config = lora_config

         self.quant_config = quant_config
-        self.transformer = GPTBigCodeModel(config, cache_config, quant_config,
-                                           lora_config)
+        self.transformer = GPTBigCodeModel(vllm_config=vllm_config,
+                                           prefix=prefix)
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.wte
         else:
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 58cff67c69051..a83d03480dde1 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -42,7 +42,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class GPTJAttention(nn.Module):
@@ -177,14 +178,13 @@ def forward(
 @support_torch_compile
 class GPTJModel(nn.Module):

-    def __init__(
-        self,
-        config: GPTJConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.embed_dim = config.n_embd
         self.wte = VocabParallelEmbedding(
@@ -229,19 +229,16 @@ def forward(

 class GPTJForCausalLM(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
         assert not config.tie_word_embeddings
-        self.transformer = GPTJModel(config, cache_config, quant_config)
+        self.transformer = GPTJModel(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(
+                                         prefix, "transformer"))
         self.lm_head = ParallelLMHead(
             config.vocab_size,
             config.n_embd,
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 27b2577a8cdca..794b141bfa4aa 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -41,7 +41,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class GPTNeoXAttention(nn.Module):
@@ -189,14 +190,13 @@ def forward(
 @support_torch_compile
 class GPTNeoXModel(nn.Module):

-    def __init__(
-        self,
-        config: GPTNeoXConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config

         self.embed_in = VocabParallelEmbedding(
@@ -242,18 +242,14 @@ def forward(

 class GPTNeoXForCausalLM(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.gpt_neox = GPTNeoXModel(config, cache_config, quant_config)
+        self.gpt_neox = GPTNeoXModel(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(prefix, "gpt_neox"))
         self.embed_out = ParallelLMHead(
             config.vocab_size,
             config.hidden_size,
SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 73f7c106e3d39..2ed115c56af45 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -47,7 +47,7 @@ from . import mixtral from .interfaces import SupportsLoRA, SupportsPP -from .utils import make_layers +from .utils import make_layers, maybe_prefix class GraniteMoeMoE(nn.Module): @@ -247,15 +247,14 @@ def forward( @support_torch_compile class GraniteMoeModel(nn.Module): - def __init__( - self, - config: GraniteMoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -333,25 +332,17 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteMoeModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 767171dad7c7b..df7e768fe14d3 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -15,8 +15,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) from vllm.model_executor.layers.quantization import 
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 767171dad7c7b..df7e768fe14d3 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -15,8 +15,7 @@
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
                          token_inputs)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.utils import is_list_of
diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py
index b676171b556a7..0cecc754e916f 100644
--- a/vllm/model_executor/models/idefics3.py
+++ b/vllm/model_executor/models/idefics3.py
@@ -26,7 +26,7 @@
 from transformers import ProcessorMixin as Idefics3ImageProcessor

 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
 from vllm.logger import init_logger
@@ -35,6 +35,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
@@ -46,9 +47,10 @@
 from .idefics2_vision_model import (
     Idefics2VisionTransformer as Idefics3VisionTransformer)
 # yapf: enable
-from .interfaces import SupportsMultiModal
+from .interfaces import SupportsLoRA, SupportsMultiModal
 from .llama import LlamaModel
-from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings
+from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix,
+                    merge_multimodal_embeddings)

 logger = init_logger(__name__)

@@ -59,8 +61,6 @@ class Idefics3ImagePixelInputs(TypedDict):
     """
     Shape: `(batch_size * num_images, num_channels, height, width)`
     """
-    rows: List[int]
-    cols: List[int]
     pixel_attention_mask: Optional[torch.BoolTensor]

@@ -357,8 +357,15 @@ def dummy_data_for_idefics3(
     image_seq_len = processor.image_seq_len
     max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images

+    if seq_len - max_llm_image_tokens < 0:
+        raise RuntimeError(
+            f"Idefics3 cannot process {num_images} images in a prompt, "
+            "please increase max_model_len or reduce image limit by "
+            "--limit-mm-per-prompt.")
+
     seq_data = SequenceData.from_prompt_token_counts(
-        (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len))
+        (hf_config.image_token_id, max_llm_image_tokens),
+        (0, seq_len - max_llm_image_tokens))

     width = height = hf_config.vision_config.image_size
     image = Image.new("RGB", (width, height), color=0)
@@ -369,12 +376,23 @@ def dummy_data_for_idefics3(

 class Idefics3SimpleMLP(nn.Module):

-    def __init__(self, config):
+    def __init__(
+        self,
+        config: Idefics3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
         super().__init__()
         input_size = config.vision_config.hidden_size * (config.scale_factor**
                                                          2)
         output_size = config.text_config.hidden_size
-        self.proj = ReplicatedLinear(input_size, output_size, bias=False)
+        self.proj = ReplicatedLinear(
+            input_size,
+            output_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "proj"),
+        )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         out, _ = self.proj(x)
@@ -383,10 +401,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

 class Idefics3Connector(nn.Module):

-    def __init__(self, config):
+    def __init__(
+        self,
+        config: Idefics3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
         super().__init__()
         self.scale_factor = config.scale_factor
-        self.modality_projection = Idefics3SimpleMLP(config)
+        self.modality_projection = Idefics3SimpleMLP(
+            config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "modality_projection"),
+        )

     def pixel_shuffle(self,
                       x: torch.Tensor,
@@ -417,22 +444,28 @@ def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor:

 class Idefics3Model(nn.Module):

-    def __init__(
-        self,
-        config: Idefics3Config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.padding_idx = self.config.text_config.pad_token_id
         self.vocab_size = self.config.text_config.vocab_size
-
-        self.vision_model = Idefics3VisionTransformer(config.vision_config,
-                                                      quant_config)
-        self.connector = Idefics3Connector(config)
-        self.text_model = LlamaModel(config.text_config, cache_config,
-                                     quant_config)
+        self.vision_model = Idefics3VisionTransformer(
+            config.vision_config,
+            quant_config=quant_config,
+            prefix=maybe_prefix(prefix, "vision_model"))
+        self.connector = Idefics3Connector(
+            config,
+            quant_config,
+            prefix=maybe_prefix(prefix, "connector"),
+        )
+        self.text_model = LlamaModel(
+            vllm_config=vllm_config.with_hf_config(config.text_config),
+            prefix=maybe_prefix(prefix, "text_model"),
+        )

         self.image_seq_len = int(
             ((config.vision_config.image_size //
@@ -464,8 +497,6 @@ def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[ImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_embeds = kwargs.pop("image_embeds", None)
-        rows = kwargs.pop("rows", None)
-        cols = kwargs.pop("cols", None)
         pixel_attention_mask = kwargs.pop("pixel_attention_mask", None)

         if pixel_values is None and image_embeds is None:
@@ -490,8 +521,6 @@ def _parse_and_validate_image_input(
                 data=self._validate_pixel_values(
                     flatten_bn(pixel_values, concat=True)),
-                rows=rows,
-                cols=cols,
                 pixel_attention_mask=flatten_bn(
                     pixel_attention_mask, concat=True))

@@ -611,24 +640,72 @@ def forward(
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3)
-class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal):
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                       SupportsLoRA):
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        # vision_model
+        "fc1",
+        "fc2",
+        "out_proj",
+        # text_model
+        "qkv_proj",  # same name with vision encoder
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+        # vision_model
+        ".fc1.",
+        ".fc2.",
+        ".out_proj.",
+        # connector
+        ".proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    embedding_modules = {}
+    embedding_padding_modules = []
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config

         self.config = config
         self.multimodal_config = multimodal_config

-        self.model = Idefics3Model(config, cache_config, quant_config)
+        self.model = Idefics3Model(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
         self.image_token_id = self.config.image_token_id

         self.lm_head = ParallelLMHead(
@@ -677,3 +754,12 @@ def sample(
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(self)
         loader.load_weights(weights)
+
+    def get_mm_mapping(self) -> MultiModelKeys:
+        """
+        Get the module prefix in multimodal models
+        """
+        return MultiModelKeys.from_string_field(
+            language_model="model.text_model",
+            connector="model.connector",
+            tower_model="model.vision_model")
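The idefics3.py dummy-data change above fixes an off-by-`max_llm_image_tokens` overshoot: it fails fast when the image tokens alone cannot fit in `seq_len`, and it pads with `seq_len - max_llm_image_tokens` filler tokens instead of `seq_len`, so the dummy prompt totals exactly `seq_len`. Worked through with made-up numbers:

    seq_len = 4096
    image_seq_len = 169         # hypothetical value from the processor
    max_num_image_patches = 13  # hypothetical
    num_images = 1
    max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images  # 2197

    assert seq_len - max_llm_image_tokens >= 0  # else RuntimeError, as above
    # before: 2197 image tokens + 4096 filler = 6293 tokens  (> seq_len)
    # after:  2197 image tokens + 1899 filler = 4096 tokens (== seq_len)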
("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + + embedding_modules = {} + embedding_padding_modules = [] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config - self.model = Idefics3Model(config, cache_config, quant_config) + self.model = Idefics3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.image_token_id = self.config.image_token_id self.lm_head = ParallelLMHead( @@ -677,3 +754,12 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model.text_model", + connector="model.connector", + tower_model="model.vision_model") diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index cbedd0c8a0130..21fa6983063b8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -250,14 +250,13 @@ def forward( @support_torch_compile class InternLM2Model(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -317,20 +316,13 @@ def forward( class InternLM2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = InternLM2Model(config, - cache_config, - quant_config, + self.model = InternLM2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 51e2c64d5552d..34889d691a934 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -104,14 +104,13 @@ def forward( class InternLM2VEModel(InternLM2Model): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, cache_config, quant_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: 
diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py
index 51e2c64d5552d..34889d691a934 100644
--- a/vllm/model_executor/models/internlm2_ve.py
+++ b/vllm/model_executor/models/internlm2_ve.py
@@ -104,14 +104,13 @@ def forward(

 class InternLM2VEModel(InternLM2Model):

-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        super().__init__(config, cache_config, quant_config)
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
             lambda prefix: InternLM2VEDecoderLayer(
@@ -159,12 +158,8 @@ def forward(

 class InternLM2VEForCausalLM(InternLM2ForCausalLM):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
-        super().__init__(vllm_config, prefix=prefix)
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__(vllm_config=vllm_config, prefix=prefix)

         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 42bccf71273b3..92579e3aae949 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -25,8 +25,7 @@
 from vllm.model_executor.models.intern_vit import (InternVisionModel,
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
@@ -35,7 +34,7 @@
                     get_clip_num_patches)
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    merge_multimodal_embeddings)
+                    maybe_prefix, merge_multimodal_embeddings)

 IMG_START = '<img>'
 IMG_END = '</img>'
@@ -410,7 +409,7 @@ def dummy_data(
 @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor)
 class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):

-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()

         config = vllm_config.model_config.hf_config
@@ -435,13 +434,13 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
             config,
             quant_config=quant_config,
             is_mono=self.is_mono,
-            prefix="vision_model",
+            prefix=maybe_prefix(prefix, "vision_model"),
         )

         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))

         self.mlp1 = self._init_mlp1(config)
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index ae3f5b01d5cce..65800c44e5a93 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -44,7 +44,8 @@

 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)


 class SwiGLUActivation(nn.Module):
@@ -215,14 +216,13 @@ def forward(
 @support_torch_compile
 class JAISModel(nn.Module):

-    def __init__(
-        self,
-        config: JAISConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         assert not config.add_cross_attention
         assert not config.scale_attn_by_inverse_layer_idx
@@ -286,18 +286,15 @@ def forward(

 class JAISLMHeadModel(nn.Module, SupportsPP):

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.transformer = JAISModel(config, cache_config, quant_config)
+        self.transformer = JAISModel(vllm_config=vllm_config,
+                                     prefix=maybe_prefix(
+                                         prefix, "transformer"))
         if self.config.tie_word_embeddings:
             self.lm_head = self.transformer.wte
         else:
diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py
index 72eb1017c2868..88fb8d5cf555a 100644
--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -7,7 +7,7 @@

 from vllm.attention.backends.abstract import AttentionMetadata
 from vllm.attention.layer import Attention
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -29,6 +29,7 @@
                     _get_graph_batch_size)
 from .interfaces import HasInnerState, SupportsLoRA
+from .utils import maybe_prefix

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@@ -258,14 +259,14 @@ def forward(

 class JambaModel(nn.Module):

-    def __init__(
-        self,
-        config: JambaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
@@ -348,14 +349,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA):
     }
     embedding_padding_modules = ["lm_head"]

-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config
         assert not cache_config.enable_prefix_caching, \
@@ -364,10 +360,8 @@ def __init__(
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.model = JambaModel(config,
-                                cache_config=cache_config,
-                                quant_config=quant_config,
-                                lora_config=lora_config)
+        self.model = JambaModel(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
         self.unpadded_vocab_size = config.vocab_size
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
 @support_torch_compile
 class LlamaModel(nn.Module):
 
-    def __init__(
-        self,
-        config: LlamaConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
@@ -492,24 +491,16 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "norm": "model.norm"
     }
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         pooler_config = vllm_config.model_config.pooler_config
         self.config = config
         self.lora_config = lora_config
 
-        self.model = LlamaModel(config,
-                                cache_config,
-                                quant_config,
-                                lora_config=lora_config,
+        self.model = LlamaModel(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
         if get_pp_group().is_last_rank:
             self.unpadded_vocab_size = config.vocab_size
@@ -547,6 +538,9 @@ def __init__(
                                                normalize=False,
                                                softmax=False)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -554,9 +548,11 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         model_output = self.model(input_ids, positions, kv_caches,
-                                  attn_metadata, intermediate_tensors)
+                                  attn_metadata, intermediate_tensors,
+                                  inputs_embeds)
         return model_output
 
     def compute_logits(
@@ -652,23 +648,12 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = []
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
 
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
         pooler_config = vllm_config.model_config.pooler_config
 
-        self.model = LlamaModel(config,
-                                cache_config,
-                                quant_config,
-                                lora_config,
+        self.model = LlamaModel(vllm_config=vllm_config,
                                 prefix=maybe_prefix(prefix, "model"))
         self._pooler = Pooler.from_config_with_defaults(
             pooler_config,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c98462537728a..b13bcfa676811 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -17,6 +17,7 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import NestedTensors
 from vllm.sequence import IntermediateTensors
 from vllm.utils import is_list_of
 
@@ -32,7 +33,7 @@
                      dummy_seq_data_for_siglip, get_max_siglip_image_tokens,
                      input_processor_for_siglip)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    merge_multimodal_embeddings)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 
 class LlavaImagePixelInputs(TypedDict):
@@ -258,7 +259,7 @@ def init_vision_tower_for_llava(
 @INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
 class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -282,7 +283,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
             config,
             quant_config,
             require_post_norm=False,
-            prefix="vision_tower")
+            prefix=maybe_prefix(prefix, "vision_tower"))
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             text_hidden_size=config.text_config.hidden_size,
@@ -291,7 +292,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))
 
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
@@ -448,6 +449,25 @@ def _process_image_input(
         image_features = self._process_image_pixels(image_input)
 
         return self.multi_modal_projector(image_features)
 
+    def process_mm_inputs(self, **kwargs):
+        image_input = self._parse_and_validate_image_input(**kwargs)
+        if image_input is None:
+            return None
+        vision_embeddings = self._process_image_input(image_input)
+        return vision_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        vision_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if vision_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, vision_embeddings,
+                self.config.image_token_index)
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -455,6 +475,7 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         """Run forward pass for LLaVA-1.5.
@@ -494,24 +515,13 @@ def forward(
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
-        else:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is not None:
-                vision_embeddings = self._process_image_input(image_input)
-                inputs_embeds = self.language_model.model.get_input_embeddings(
-                    input_ids)
-
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids, inputs_embeds, vision_embeddings,
-                    self.config.image_token_index)
-            else:
-                inputs_embeds = self.language_model.model.get_input_embeddings(
-                    input_ids)
-
-            # always pass the input via `inputs_embeds`
-            # to make sure the computation graph is consistent
-            # for `torch.compile` integration
-            input_ids = None
+        elif inputs_embeds is None:
+            vision_embeddings = self.process_mm_inputs(**kwargs)
+            # always pass the input via `inputs_embeds`
+            # to make sure the computation graph is consistent
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      vision_embeddings)
+            input_ids = None
 
         hidden_states = self.language_model.model(input_ids,
                                                   positions,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index f187f8105b96a..dd2fa6cac969f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -31,7 +31,7 @@
                      dummy_seq_data_for_siglip, get_siglip_image_feature_size,
                      get_siglip_patch_grid_length, input_processor_for_siglip)
 from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn,
-                    init_vllm_registered_model)
+                    init_vllm_registered_model, maybe_prefix)
 
 
 class LlavaNextImagePixelInputs(TypedDict):
@@ -281,7 +281,7 @@ def input_processor_for_llava_next(ctx: InputContext,
 class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -296,7 +296,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
             config,
             quant_config,
             require_post_norm=False,
-            prefix="vision_tower")
+            prefix=maybe_prefix(prefix, "vision_tower"))
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
         self.multi_modal_projector = LlavaMultiModalProjector(
@@ -307,7 +307,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))
 
         # The same model class supports both language generation and embedding
         # because the architecture name is the same
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index eceb0c0ab52df..5d5598d07bfde 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -29,7 +29,7 @@
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                      dummy_seq_data_for_siglip)
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
-                    merge_multimodal_embeddings)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 32
@@ -253,7 +253,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -267,7 +267,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
             config,
             quant_config,
             require_post_norm=False,
-            prefix="vision_tower")
+            prefix=maybe_prefix(prefix, "vision_tower"))
         self.vision_resampler = LlavaNextVideoPooler(config)
         self.multi_modal_projector = LlavaNextMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
@@ -276,7 +276,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))
 
         self.make_empty_intermediate_tensors = (
             self.language_model.model.make_empty_intermediate_tensors)
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 64d373ce91509..a5b2108177830 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -35,7 +35,7 @@
                      dummy_video_for_siglip, get_siglip_image_feature_size,
                      get_siglip_patch_grid_length, input_processor_for_siglip)
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
-                    merge_multimodal_embeddings)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 # Result in the max possible feature size (2x2 grid of 336x336px tiles)
 MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
@@ -404,7 +404,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
                                              SupportsPP):
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -418,12 +418,12 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
             config,
             quant_config,
             require_post_norm=False,
-            prefix="vision_tower")
+            prefix=maybe_prefix(prefix, "vision_tower"))
         self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config)
         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py
index 49e43f8cc683c..55c575e22a0f6 100644
--- a/vllm/model_executor/models/mamba.py
+++ b/vllm/model_executor/models/mamba.py
@@ -6,7 +6,7 @@
 from transformers import MambaConfig
 
 from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -26,6 +26,8 @@
 from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE,
                                       _get_graph_batch_size)
 
+from .utils import maybe_prefix
+
 KVCache = Tuple[torch.Tensor, torch.Tensor]
@@ -73,14 +75,14 @@ def forward(
 class MambaModel(nn.Module):
 
-    def __init__(
-        self,
-        config: MambaConfig,
-        quant_config: Optional[QuantizationConfig] = None,
-        cache_config: Optional[CacheConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         lora_vocab = ((lora_config.lora_extra_vocab_size *
@@ -130,14 +132,9 @@ def forward(
 class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         scheduler_config = vllm_config.scheduler_config
         assert not cache_config.enable_prefix_caching, \
@@ -146,10 +143,8 @@ def __init__(
         super().__init__()
         self.config = config
         self.scheduler_config = scheduler_config
-        self.backbone = MambaModel(config,
-                                   cache_config=cache_config,
-                                   quant_config=quant_config,
-                                   lora_config=lora_config)
+        self.backbone = MambaModel(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "backbone"))
         self.unpadded_vocab_size = config.vocab_size
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 4cb1b4a929b9f..de5b2d89c0962 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -44,7 +44,7 @@ class Medusa(nn.Module):
     in the draft checkpoint (using key token_map). Also, the draft config
     needs to have truncated_vocab_size (=k) as an attribute."""
 
-    def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         config = vllm_config.model_config.hf_config
         super().__init__()
         self.config = config
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 559d9c4dd35bf..2db953329fd91 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -29,7 +29,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_reduce)
@@ -53,7 +53,8 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class MiniCPMMoE(nn.Module):
@@ -351,15 +352,14 @@ def forward(
 @support_torch_compile
 class MiniCPMModel(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.cache_config = cache_config
         self.quant_config = quant_config
@@ -461,24 +461,22 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = ["lm_head"]
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
+        self.prefix = prefix
+        self.vllm_config = vllm_config
         self.config = config
         self.lora_config = lora_config
         self.cache_config = cache_config
         self.quant_config = quant_config
 
         self.num_experts = getattr(self.config, "num_experts", 0)
-        self._init_model()
+        self._init_model(vllm_config=vllm_config, prefix=prefix)
         unpadded_vocab_size = config.vocab_size
         if lora_config:
             unpadded_vocab_size += lora_config.lora_extra_vocab_size
@@ -502,11 +500,9 @@ def __init__(
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
-    def _init_model(self):
-        self.model = MiniCPMModel(config=self.config,
-                                  cache_config=self.cache_config,
-                                  quant_config=self.quant_config,
-                                  lora_config=self.lora_config)
+    def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        self.model = MiniCPMModel(vllm_config=vllm_config,
+                                  prefix=maybe_prefix(prefix, "model"))
 
     def forward(
         self,
diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py
index eeedf55cf3e57..278c4bbe6e563 100644
--- a/vllm/model_executor/models/minicpm3.py
+++ b/vllm/model_executor/models/minicpm3.py
@@ -28,7 +28,7 @@
 from transformers import PretrainedConfig
 
 from vllm.attention import Attention, AttentionMetadata
-from vllm.config import CacheConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -40,7 +40,7 @@
                                                MiniCPMForCausalLM,
                                                MiniCPMModel)
 
-from .utils import make_layers
+from .utils import make_layers, maybe_prefix
 
 
 class MiniCPM3Attention(nn.Module):
@@ -238,8 +238,6 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM):
     # `embedding_modules` and `embedding_padding_modules`
     # are inherited from MiniCPMForCausalLM
 
-    def _init_model(self):
-        self.model = MiniCPM3Model(config=self.config,
-                                   cache_config=self.cache_config,
-                                   quant_config=self.quant_config,
-                                   lora_config=self.lora_config)
+    def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        self.model = MiniCPM3Model(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py
index 9458204c5a038..999739ccd98bf 100644
--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -34,7 +34,7 @@
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
                          InputContext, token_inputs)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -51,15 +51,14 @@
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.models.utils import LLMWrapper
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.base import MultiModalKwargs
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.multimodal.image import cached_get_image_processor
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .idefics2_vision_model import Idefics2VisionTransformer
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
-from .utils import is_pp_missing_parameter
+from .utils import is_pp_missing_parameter, maybe_prefix
 
 _KEYS_TO_MODIFY_MAPPING = {
     "llm.lm_head": "lm_head",
@@ -390,7 +389,6 @@ def __init__(
     ):
         config = vllm_config.model_config.hf_config
         multimodal_config = vllm_config.model_config.multimodal_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         super().__init__()
         # All MiniCPM-V models disable `tie_word_embeddings` but
@@ -401,11 +399,11 @@ def __init__(
         self.multimodal_config = multimodal_config
 
         self.version = get_version_by_config(self.config)
-        self.llm = self.init_llm(config,
-                                 cache_config,
-                                 quant_config,
-                                 prefix="llm")
-        self.vpm = self.init_vision_module(config, quant_config, prefix="vpm")
+        self.llm = self.init_llm(vllm_config=vllm_config,
+                                 prefix=maybe_prefix(prefix, "llm"))
+        self.vpm = self.init_vision_module(config,
+                                           quant_config,
+                                           prefix=maybe_prefix(prefix, "vpm"))
         param_dtype = torch.get_default_dtype()
         self.vpm.to(dtype=param_dtype)
         self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else
@@ -414,13 +412,15 @@ def __init__(
         self.resampler = self.init_resampler(self.embed_dim,
                                              self.vision_dim,
                                              quant_config=quant_config,
-                                             prefix="resampler")
+                                             prefix=maybe_prefix(
+                                                 prefix, "resampler"))
         self.resampler.to(device="cuda", dtype=param_dtype)
         # TODO: why is there _KEYS_TO_MODIFY_MAPPING? lm_head should be in llm
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config,
-                                      prefix="llm.lm_head")
+                                      prefix=maybe_prefix(
+                                          prefix, "llm.lm_head"))
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
@@ -661,9 +661,7 @@ def get_mm_mapping(self) -> MultiModelKeys:
 
     def init_llm(
         self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        vllm_config: VllmConfig,
         prefix: str = "",
     ) -> nn.Module:
         raise NotImplementedError
@@ -711,16 +709,10 @@ def __init__(
 
     def init_llm(
         self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        vllm_config: VllmConfig,
         prefix: str = "",
     ) -> nn.Module:
-
-        return LLMWrapper(MiniCPMModel(config,
-                                       cache_config=cache_config,
-                                       quant_config=quant_config,
-                                       prefix=prefix),
+        return LLMWrapper(MiniCPMModel(vllm_config=vllm_config, prefix=prefix),
                           name="model")
 
     def init_vision_module(
@@ -875,15 +867,10 @@ def __init__(
 
     def init_llm(
        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        vllm_config: VllmConfig,
         prefix: str = "",
     ) -> nn.Module:
-        return LLMWrapper(LlamaModel(config,
-                                     cache_config=cache_config,
-                                     quant_config=quant_config,
-                                     prefix=prefix),
+        return LLMWrapper(LlamaModel(vllm_config=vllm_config, prefix=prefix),
                           name="model")
 
     def init_vision_module(
@@ -1022,16 +1009,10 @@ def __init__(
 
     def init_llm(
         self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
+        vllm_config: VllmConfig,
         prefix: str = "",
     ) -> nn.Module:
-
-        return LLMWrapper(Qwen2Model(config,
-                                     cache_config=cache_config,
-                                     quant_config=quant_config,
-                                     prefix=prefix),
+        return LLMWrapper(Qwen2Model(vllm_config=vllm_config, prefix=prefix),
                           name="model")
 
     def init_vision_module(
@@ -1151,4 +1132,4 @@ def __new__(cls, vllm_config: VllmConfig, prefix: str = ""):
         if instance_class is None:
             raise ValueError(
                 "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6")
-        return instance_class(vllm_config, prefix=prefix)
+        return instance_class(vllm_config=vllm_config, prefix=prefix)
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 91ec3228c0d48..3eb2f60fd4fc7 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -28,7 +28,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -48,7 +48,8 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class MixtralMoE(nn.Module):
@@ -248,15 +249,14 @@ def forward(
 @support_torch_compile
 class MixtralModel(nn.Module):
 
-    def __init__(
-        self,
-        config: MixtralConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
                       (lora_config.max_loras or 1)) if lora_config else 0
@@ -332,24 +332,16 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     }
     embedding_padding_modules = ["lm_head"]
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         self.config = config
         self.lora_config = lora_config
 
-        self.model = MixtralModel(config,
-                                  cache_config,
-                                  quant_config,
-                                  lora_config=lora_config,
-                                  prefix="model")
+        self.model = MixtralModel(vllm_config=vllm_config,
+                                  prefix=maybe_prefix(prefix, "model"))
         self.unpadded_vocab_size = config.vocab_size
         if lora_config:
             self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index aeac326776392..95cfb6f54dc10 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -49,7 +49,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class MixtralMLP(nn.Module):
@@ -293,14 +294,13 @@ def forward(
 class MixtralModel(nn.Module):
 
-    def __init__(
-        self,
-        config: MixtralConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
@@ -350,18 +350,14 @@ def forward(
 class MixtralForCausalLM(nn.Module, SupportsPP):
     fall_back_to_pt_during_load = False
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = MixtralModel(config, cache_config, quant_config)
+        self.model = MixtralModel(vllm_config=vllm_config,
+                                  prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config)
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py
index 14aa515570f38..db7ee7b2d8537 100644
--- a/vllm/model_executor/models/mllama.py
+++ b/vllm/model_executor/models/mllama.py
@@ -32,8 +32,10 @@
 
 import vllm.distributed.parallel_state as ps
 from vllm.attention import Attention, AttentionMetadata, AttentionType
+from vllm.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.attention.backends.xformers import XFormersMetadata
 from vllm.attention.ops.paged_attn import PagedAttention
-from vllm.config import CacheConfig, VllmConfig
+from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs,
                          InputContext, TokenInputs, token_inputs)
@@ -56,6 +58,7 @@
 from .clip import CLIPMLP
 from .interfaces import SupportsMultiModal
 from .llama import LlamaDecoderLayer, LlamaMLP
+from .utils import maybe_prefix
 
 logger = init_logger(__name__)
 MLLAMA_IMAGE_TOKEN_ID = 128256
@@ -798,12 +801,13 @@ def forward(
             q = self.q_norm(q)
 
         if attention_mask is not None:
-            output = self.attention_with_mask(q, k, v, kv_cache,
-                                              attention_mask,
-                                              kv_range_for_decode,
-                                              attn_metadata)
+            output = self._attention_with_mask(q, k, v, kv_cache,
+                                               attention_mask,
+                                               kv_range_for_decode,
+                                               attn_metadata)
         else:
-            output = self.attn(q,
+            output = self.attn(q.view(-1,
+                                      self.num_local_heads * self.head_dim),
                                k,
                                v,
                               kv_cache,
@@ -812,7 +816,7 @@ def forward(
         out, _ = self.o_proj(output)
         return out
 
-    def attention_with_mask(
+    def _attention_with_mask(
         self,
         q: torch.Tensor,
         k: torch.Tensor,
@@ -823,14 +827,35 @@
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         # Skip writing kv-cache for the initial profiling run.
-        if len(kv_cache.shape) == 3:
-            key_cache, value_cache = PagedAttention.split_kv_cache(
-                kv_cache, self.num_local_key_value_heads, self.head_dim)
-            cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
-            cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
-            PagedAttention.write_to_paged_cache(
-                cached_k, cached_v, key_cache, value_cache,
-                attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0)
+        if len(kv_cache.shape) > 1:
+            if isinstance(attn_metadata, FlashAttentionMetadata):
+                cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
+                cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
+                torch.ops._C_cache_ops.reshape_and_cache_flash(
+                    cached_k,
+                    cached_v,
+                    kv_cache[0],
+                    kv_cache[1],
+                    attn_metadata.
+                    cross_slot_mapping,  # type: ignore[union-attr]
+                    "auto",
+                    1.0,
+                    1.0,
+                )
+            elif isinstance(attn_metadata, XFormersMetadata):
+                key_cache, value_cache = PagedAttention.split_kv_cache(
+                    kv_cache, self.num_local_key_value_heads, self.head_dim)
+                cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode])
+                cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode])
+                PagedAttention.write_to_paged_cache(
+                    cached_k, cached_v, key_cache, value_cache,
+                    attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0)
+            else:
+                raise ValueError(
+                    f"Unsupported AttentionMetadata {type(attn_metadata)} "
+                    f"class found. Expected the AttentionMetadata to "
+                    f"be either XFormersMetadata or FlashAttentionMetadata.")
+
         # We have to call torch.sdpa for prefill when using a
         # custom cross-attention mask. Because the mask is not a
         # standard causal mask, neither a block diagonal mask which
@@ -939,15 +964,13 @@ class MllamaTextModel(nn.Module):
     config_class = config_mllama.MllamaTextConfig
     base_model_prefix = "model"
 
-    def __init__(
-        self,
-        config: config_mllama.MllamaTextConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config.text_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8,
@@ -1029,18 +1052,14 @@ class MllamaForCausalLM(nn.Module):
         "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer"
     ]
 
-    def __init__(
-        self,
-        config: config_mllama.MllamaTextConfig,
-        cache_config: Optional[CacheConfig],
-        quant_config: Optional[QuantizationConfig],
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config.text_config
+        quant_config = vllm_config.quant_config
+
         self.vocab_size = config.vocab_size
-        self.model = MllamaTextModel(config,
-                                     cache_config,
-                                     quant_config,
+        self.model = MllamaTextModel(vllm_config=vllm_config,
                                      prefix=f"{prefix}.model")
         self.lm_head = ParallelLMHead(
             config.vocab_size,
@@ -1108,14 +1127,9 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal):
         "up_proj": ("gate_up_proj", 1),
     }
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.vocab_size = config.text_config.vocab_size
         self.hidden_size = config.text_config.hidden_size
@@ -1127,12 +1141,11 @@ def __init__(
 
         self.vision_model = MllamaVisionModel(config.vision_config,
                                               quant_config,
-                                              prefix="vision_model")
+                                              prefix=maybe_prefix(
+                                                  prefix, "vision_model"))
         self.language_model = MllamaForCausalLM(
-            config.text_config,
-            cache_config=cache_config,
-            quant_config=quant_config,
-            prefix="language_model",
+            vllm_config=vllm_config,
+            prefix=maybe_prefix(prefix, "language_model"),
         )
         self.multi_modal_projector = ColumnParallelLinear(
             config.vision_config.vision_output_dim,
@@ -1140,7 +1153,7 @@ def __init__(
             bias=True,
             quant_config=quant_config,
             gather_output=True,
-            prefix="multi_modal_projector",
+            prefix=maybe_prefix(prefix, "multi_modal_projector"),
         )
         self.logits_processor = LogitsProcessor(config.output_hidden_states,
                                                 config.text_config.vocab_size)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index cd462c4d0495e..035a1e2ab7b02 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -44,7 +44,8 @@
 
 from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (get_vit_attn_backend,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 # TODO: hard-coded for now. Consider making it configurable.
 VIT_LAYERS = [-2, -9]
@@ -716,14 +717,13 @@ def forward(
 @support_torch_compile
 class MolmoModel(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
 
         self.embedding_size = config.embedding_size or config.vocab_size
@@ -1024,14 +1024,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
 @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo)
 class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
@@ -1040,7 +1035,8 @@ def __init__(
         vision_config = VisionBackboneConfig()
         self.vision_backbone = MolmoVisionBackbone(config, vision_config,
                                                    quant_config)
-        self.model = MolmoModel(config, cache_config, quant_config)
+        self.model = MolmoModel(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
 
         if self.config.weight_tying:
             self.lm_head = self.model.transformer.wte
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index 672c8e9c22260..e15c0fe8db060 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -26,7 +26,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 def _get_alibi_slopes(
@@ -207,14 +208,13 @@ def forward(
 @support_torch_compile
 class MPTModel(nn.Module):
 
-    def __init__(
-        self,
-        config: MPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         assert config.embedding_fraction == 1.0
         assert config.norm_type == "low_precision_layernorm"
 
@@ -267,20 +267,16 @@ def forward(
 class MPTForCausalLM(nn.Module, SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         assert config.tie_word_embeddings
         self.quant_config = quant_config
 
-        self.transformer = MPTModel(config, cache_config, quant_config)
+        self.transformer = MPTModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "transformer"))
         self.lm_head = self.transformer.wte
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = get_sampler()
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index 5991cce642981..e09d7088a69ce 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -27,7 +27,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.compilation.decorators import support_torch_compile
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -47,7 +47,8 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 # The architecture is pretty similar to Llama, with these changes:
 # - There is no gate_proj, just up_proj
@@ -293,15 +294,14 @@ def forward(
 @support_torch_compile
 class NemotronModel(nn.Module):
 
-    def __init__(
-        self,
-        config: NemotronConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         lora_vocab = (lora_config.lora_extra_vocab_size *
@@ -401,14 +401,9 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
         "v_proj": ("qkv_proj", 2),
     }
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         assert isinstance(config, NemotronConfig)
@@ -416,11 +411,8 @@ def __init__(
         self.config = config
         self.lora_config = lora_config
 
-        self.model = NemotronModel(config,
-                                   cache_config,
-                                   quant_config,
-                                   lora_config=lora_config,
-                                   prefix="model")
+        self.model = NemotronModel(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
         if get_pp_group().is_last_rank:
             self.unpadded_vocab_size = config.vocab_size
             if lora_config:
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 6905f8521a8c3..3467ae5896494 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -46,7 +46,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class OlmoAttention(nn.Module):
@@ -224,12 +225,13 @@ def forward(
 @support_torch_compile
 class OlmoModel(nn.Module):
 
-    def __init__(self,
-                 config: OlmoConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
 
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
@@ -291,17 +293,13 @@ class OlmoForCausalLM(nn.Module, SupportsPP):
     Extremely barebones HF model wrapper.
     """
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
-        self.model = OlmoModel(config, cache_config, quant_config)
+        self.model = OlmoModel(vllm_config=vllm_config,
+                               prefix=maybe_prefix(prefix, "model"))
         if config.tie_word_embeddings:
             self.lm_head = self.model.embed_tokens
         else:
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 8fa90d17003af..3d31919edd862 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -38,7 +38,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class OlmoeMoE(nn.Module):
@@ -243,14 +244,13 @@ def forward(
 @support_torch_compile
 class OlmoeModel(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 
@@ -309,18 +309,14 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
 
     fall_back_to_pt_during_load = False
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = OlmoeModel(config, cache_config, quant_config)
+        self.model = OlmoeModel(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config)
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index d378956b68cfc..997fe642439e6 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -293,14 +293,13 @@ def forward(
 @support_torch_compile
 class OPTModel(nn.Module):
 
-    def __init__(
-        self,
-        config: OPTConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.decoder = OPTDecoder(config,
                                   cache_config,
                                   quant_config,
@@ -342,21 +341,14 @@ class OPTForCausalLM(nn.Module, SupportsPP):
         ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2."
     ]
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = OPTModel(config,
-                              cache_config,
-                              quant_config,
+        self.model = OPTModel(vllm_config=vllm_config,
                               prefix=maybe_prefix(prefix, "model"))
         if self.config.tie_word_embeddings:
             self.lm_head = self.model.decoder.embed_tokens
@@ -368,6 +360,9 @@ def __init__(
         self.make_empty_intermediate_tensors = (
             self.model.make_empty_intermediate_tensors)
 
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -375,9 +370,11 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors)
+                                   attn_metadata, intermediate_tensors,
+                                   inputs_embeds)
         return hidden_states
 
     def compute_logits(
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index b400d4e3f5228..38821c8288347 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -29,7 +29,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class OrionMLP(nn.Module):
@@ -208,14 +209,13 @@ def forward(
 @support_torch_compile
 class OrionModel(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -268,18 +268,14 @@ def forward(
 class OrionForCausalLM(nn.Module, SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = OrionModel(config, cache_config, quant_config)
+        self.model = OrionModel(vllm_config=vllm_config,
+                                prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       quant_config=quant_config)
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 69b7fe9d56847..eea229359255e 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -20,7 +20,7 @@
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                      dummy_seq_data_for_siglip, get_max_siglip_image_tokens)
 from .utils import (AutoWeightsLoader, init_vllm_registered_model,
-                    merge_multimodal_embeddings)
+                    maybe_prefix, merge_multimodal_embeddings)
 
 logger = init_logger(__name__)
 
@@ -131,11 +131,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
 class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal,
                                         SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -145,7 +141,8 @@ def __init__(
 
         self.vision_tower = SiglipVisionModel(config.vision_config,
                                               quant_config,
-                                              prefix="vision_tower")
+                                              prefix=maybe_prefix(
+                                                  prefix, "vision_tower"))
         self.multi_modal_projector = PaliGemmaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
             projection_dim=config.vision_config.projection_dim)
@@ -155,7 +152,7 @@ def __init__(
         self.language_model = init_vllm_registered_model(
             config.text_config,
             vllm_config=vllm_config,
-            prefix="language_model")
+            prefix=maybe_prefix(prefix, "language_model"))
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.language_model.logits_processor.scale *= logit_scale
diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py
index a86e2c1b4e4a1..2e34a7cc30873 100644
--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -45,7 +45,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class PersimmonMLP(nn.Module):
@@ -212,12 +213,13 @@ def forward(
 @support_torch_compile
 class PersimmonModel(nn.Module):
 
-    def __init__(self,
-                 config: PersimmonConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
@@ -265,20 +267,13 @@ def forward(
 class PersimmonForCausalLM(nn.Module, SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
         self.config = config
         self.vocab_size = config.vocab_size
-        self.model = PersimmonModel(config,
-                                    cache_config=cache_config,
-                                    quant_config=quant_config)
+        self.model = PersimmonModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
                                       bias=False)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index fef921528b042..262f6996fc374 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -60,7 +60,8 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 class PhiAttention(nn.Module):
@@ -196,12 +197,13 @@ def forward(
 @support_torch_compile
 class PhiModel(nn.Module):
 
-    def __init__(self,
-                 config: PhiConfig,
-                 cache_config: Optional[CacheConfig] = None,
-                 quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.quant_config = quant_config
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
@@ -277,14 +279,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     embedding_modules = {}
     embedding_padding_modules = []
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         lora_config = vllm_config.lora_config
         self.config = config
@@ -294,7 +291,8 @@ def __init__(
 
         self.quant_config = quant_config
 
-        self.model = PhiModel(config, cache_config, quant_config)
+        self.model = PhiModel(vllm_config=vllm_config,
+                              prefix=maybe_prefix(prefix, "model"))
 
         self.lm_head = ParallelLMHead(config.vocab_size,
                                       config.hidden_size,
diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py
index de1b09eba6c6d..8a5fb6d303e60 100644
--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -24,7 +24,8 @@
 
 from .interfaces import SupportsPP
 from .utils import (is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory, make_layers)
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
 
 
 def load_column_parallel_weight(param: torch.nn.Parameter,
@@ -299,14 +300,13 @@ def forward(
 class Phi3SmallModel(nn.Module):
 
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        prefix: str = "",
-    ):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
+
+        config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+
         self.config = config
         self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
                                                    config.hidden_size)
@@ -363,18 +363,14 @@ def forward(
 class Phi3SmallForCausalLM(nn.Module, SupportsPP):
     _tied_weights_keys = ["lm_head.weight"]
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
-        self.model = Phi3SmallModel(config, cache_config, quant_config)
+        self.model = Phi3SmallModel(vllm_config=vllm_config,
+                                    prefix=maybe_prefix(prefix, "model"))
         self.vocab_size = config.vocab_size
         self.mup_width_multiplier = config.mup_width_multiplier
         self.lm_head = ParallelLMHead(
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 65131d61673a3..4db65edc174f1 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -39,13 +39,14 @@
 from vllm.model_executor.pooling_metadata import PoolingMetadata
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import NestedTensors, PlaceholderRange
 from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token
 from vllm.sequence import IntermediateTensors, PoolerOutput
 from vllm.utils import is_list_of
 
 from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsMultiModal, SupportsPP
-from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn,
+from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
                     merge_multimodal_embeddings)
 
 logger = init_logger(__name__)
@@ -500,15 +501,20 @@ def input_processor_for_phi3v(ctx: InputContext,
 
     # TODO: Move this to utils or integrate with clip.
     new_token_ids: List[int] = []
+    placeholder_ranges: List[PlaceholderRange] = []
     placeholder_idx = 0
     while merged_token_ids:
         token_id = merged_token_ids.pop(0)
         if token_id == _IMAGE_TOKEN_ID:
-            new_token_ids.extend(
-                repeat_and_pad_token(
-                    _IMAGE_TOKEN_ID,
-                    repeat_count=image_feature_size[placeholder_idx],
-                ))
+            replacement_ids = repeat_and_pad_token(
+                _IMAGE_TOKEN_ID,
+                repeat_count=image_feature_size[placeholder_idx],
+            )
+            placeholder_ranges.append({
+                "offset": len(new_token_ids),
+                "length": len(replacement_ids)
+            })
+            new_token_ids.extend(replacement_ids)
             placeholder_idx += 1
         else:
             new_token_ids.append(token_id)
@@ -516,7 +522,8 @@ def input_processor_for_phi3v(ctx: InputContext,
     # NOTE: Create a defensive copy of the original inputs
     return token_inputs(prompt_token_ids=new_token_ids,
                         prompt=new_prompt,
-                        multi_modal_data=multi_modal_data)
+                        multi_modal_data=multi_modal_data,
+                        multi_modal_placeholders={"image": placeholder_ranges})
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper()
@@ -525,11 +532,7 @@ def input_processor_for_phi3v(ctx: InputContext,
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
 
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        prefix: str = "",
-    ) -> None:
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
         quant_config = vllm_config.quant_config
@@ -544,12 +547,14 @@ def __init__(
             config.hidden_size,
             org_num_embeddings=config.vocab_size,
             quant_config=quant_config,
-            prefix="model.embed_tokens",
+            prefix=maybe_prefix(prefix, "model.embed_tokens"),
         )
 
         # TODO: Optionally initializes this for supporting input embeddings.
self.vision_embed_tokens = Phi3HDImageEmbedding( - config, quant_config, prefix="model.vision_embed_tokens") + config, + quant_config, + prefix=maybe_prefix(prefix, "model.vision_embed_tokens")) # The prefix is empty intentionally because default prefix of # LlamaForCausalLM is "model" @@ -671,32 +676,42 @@ def _process_image_input( return image_embeds + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 17d00c0ede2b2..6d71a8949111b 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -48,7 +48,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PhiMoEConfig(PretrainedConfig): @@ -432,15 +433,14 @@ def forward( @support_torch_compile class PhiMoEModel(nn.Module): - def __init__( - self, - config: PhiMoEConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = 
vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0) @@ -529,23 +529,15 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = PhiMoEModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = PhiMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 93919c9c051c0..a3e30ea2dd299 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -29,8 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -38,7 +37,7 @@ from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import init_vllm_registered_model +from .utils import init_vllm_registered_model, maybe_prefix try: from xformers import ops as xops @@ -152,11 +151,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config @@ -176,7 +171,7 @@ def __init__( self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d3f10ee7c85ca..3d26ede722dd1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -42,15 +42,15 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from 
vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -552,14 +552,13 @@ def forward( @support_torch_compile class QWenModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.vocab_size = config.vocab_size @@ -865,20 +864,17 @@ def dummy_data_for_qwen( class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config - self.transformer = QWenModel(config, cache_config, quant_config) + self.transformer = QWenModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -1071,7 +1067,7 @@ def __new__( config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(vllm_config) + return QWenVL(vllm_config=vllm_config) # Initialize LLM else: - return QWenLLM(vllm_config) + return QWenLLM(vllm_config=vllm_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b0156a25ca5cf..b623c576bb673 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -240,14 +240,13 @@ def forward( @support_torch_compile class Qwen2Model(nn.Module): - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -403,11 +402,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -429,9 +424,7 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, - cache_config, - quant_config, + self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if 
config.tie_word_embeddings: @@ -448,6 +441,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -455,9 +451,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 1057720e8c308..d30950361ad89 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -264,14 +264,9 @@ def input_mapper_for_qwen2_audio( class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -283,8 +278,9 @@ def __init__( self.quant_config = quant_config - self.language_model = Qwen2Model(config.text_config, cache_config, - quant_config) + self.language_model = Qwen2Model( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=prefix) self.unpadded_vocab_size = config.text_config.vocab_size if config.text_config.tie_word_embeddings: self.lm_head = self.language_model.embed_tokens diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 25ecf76e35f22..27eb7e8a93975 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -17,7 +17,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class Qwen2ForSequenceClassification(nn.Module): @@ -43,11 +43,7 @@ class Qwen2ForSequenceClassification(nn.Module): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -70,11 +66,17 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + # hidden_states from Qwen2Model has been reduced, + # the input of score layer is not parallelized. 
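+        # (Hence input_is_parallel=False below: every rank already holds the
+        # full hidden states, so RowParallelLinear splits the input itself
+        # rather than assuming it is pre-sharded.)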
self.score = RowParallelLinear(config.hidden_size, config.num_labels, - quant_config=quant_config) + quant_config=quant_config, + input_is_parallel=False, + bias=False, + prefix=maybe_prefix(prefix, "score")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.LAST, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index b1177f9c59063..51c0cd5664fd2 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -54,7 +54,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Qwen2MoeMLP(nn.Module): @@ -315,14 +316,13 @@ def forward( @support_torch_compile class Qwen2MoeModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -377,18 +377,14 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen2MoeModel(config, cache_config, quant_config) + self.model = Qwen2MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 1f9411241bdd6..89768ec9dff37 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -18,7 +18,7 @@ from .interfaces import SupportsPP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class ReLU(nn.Module): @@ -55,11 +55,7 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -82,7 +78,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.score = nn.Sequential( ColumnParallelLinear(config.hidden_size, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ab80c1494d067..2335baf459771 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,6 +51,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from 
vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.pooler import Pooler, PoolingType
 from vllm.model_executor.layers.quantization import (GPTQConfig,
                                                      GPTQMarlinConfig,
                                                      QuantizationConfig)
@@ -58,19 +59,20 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.qwen2 import Qwen2Model
-from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
-                             MultiModalKwargs)
-from vllm.multimodal.base import MultiModalData
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_image_processor
+from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict,
+                                    MultiModalKwargs)
 from vllm.multimodal.utils import cached_get_tokenizer
-from vllm.sequence import IntermediateTensors, SequenceData
+from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData
 from vllm.transformers_utils.config import uses_mrope
 from vllm.transformers_utils.processor import cached_get_processor
 
 from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP
 from .utils import (PPMissingLayer, get_vit_attn_backend,
                     is_pp_missing_parameter,
-                    make_empty_intermediate_tensors_factory)
+                    make_empty_intermediate_tensors_factory, maybe_prefix)
 
 logger = init_logger(__name__)
 
@@ -79,7 +81,7 @@ class Qwen2VLImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: torch.Tensor
+    pixel_values: torch.Tensor
     """Shape:
     `(num_patches, num_channels * patch_size * patch_size)`
     """
 
@@ -92,9 +94,22 @@ class Qwen2VLImagePixelInputs(TypedDict):
 
 class Qwen2VLImageEmbeddingInputs(TypedDict):
     type: Literal["image_embeds"]
-    data: torch.Tensor
-    """Shape: `(batch_size * num_images, image_feature_size, hidden_size)`
-    `hidden_size` must match the hidden size of language model backbone.
+    image_embeds: torch.Tensor
+    """Supported types:
+    - List[`torch.Tensor`]: A list of tensors holding all images' features.
+      Each tensor holds an image's features.
+    - `torch.Tensor`: A tensor holding all images' features
+      (concatenation of all images' feature tensors).
+
+    Tensor shape: `(num_image_features, hidden_size)`
+    - `num_image_features` varies based on
+      the number and resolution of the images.
+    - `hidden_size` must match the hidden size of language model backbone.
+    """
+
+    image_grid_thw: torch.Tensor
+    """Shape: `(num_images, 3)`
+    This should be in `(grid_t, grid_h, grid_w)` format.
     """
 
@@ -102,7 +117,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict):
                            Qwen2VLImageEmbeddingInputs]
 
-class Qwen2VLVideoInputs(TypedDict):
+class Qwen2VLVideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
     pixel_values_videos: torch.Tensor
     """Shape:
    `(num_patches,
      num_channels * temporal_patch_size * patch_size * patch_size)`
     """
 
@@ -116,6 +132,30 @@ class Qwen2VLVideoInputs(TypedDict):
     """
 
+class Qwen2VLVideoEmbeddingInputs(TypedDict):
+    type: Literal["video_embeds"]
+    video_embeds: torch.Tensor
+    """Supported types:
+    - List[`torch.Tensor`]: A list of tensors holding all videos' features.
+      Each tensor holds a video's features.
+    - `torch.Tensor`: A tensor holding all videos' features
+      (concatenation of all videos' feature tensors).
+
+    Tensor shape: `(num_video_features, hidden_size)`
+    - `num_video_features` varies based on
+      the number and resolution of the videos.
+    - `hidden_size` must match the hidden size of language model backbone.
+ """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, + Qwen2VLVideoEmbeddingInputs] + # === Vision Encoder === # @@ -585,6 +625,12 @@ def mm_input_mapper_for_qwen2_vl( "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) + if data_type_key == "video" and isinstance(data, dict): + return MultiModalKwargs({ + "video_embeds": data.get("video_embeds"), + "video_grid_thw": data.get("video_grid_thw"), + }) + model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them @@ -890,16 +936,33 @@ def input_processor_for_qwen2_vl( idx for idx, token in enumerate(prompt_token_ids) if token == hf_config.image_token_id ] - image_cnt = len(image_indices) - embed_dim = image_inputs.get('image_embeds').size(0) - assert embed_dim % image_cnt == 0 - num_pad_tokens = embed_dim // image_cnt + + # ensure all image tokens have grid_thw + assert \ + len(image_indices) == image_inputs["image_grid_thw"].size(0), \ + "image token num does not match image_grid_thw.shape" + + image_counter = 0 + pad_token_counter = 0 for idx, token in enumerate(prompt_token_ids): if idx in image_indices: + grid_thw = image_inputs["image_grid_thw"][image_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) prompt_token_ids_with_image.extend([token] * num_pad_tokens) + image_counter += 1 + pad_token_counter += num_pad_tokens else: prompt_token_ids_with_image.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == image_inputs["image_embeds"].size(0), \ + "image_embeds.shape does not match image_grid_thw" + prompt_token_ids = prompt_token_ids_with_image else: prompt_token_ids = _expand_pad_tokens(image_inputs, @@ -912,14 +975,49 @@ def input_processor_for_qwen2_vl( max_pixels=max_pixels) if video_inputs is not None: - prompt_token_ids = _expand_pad_tokens(video_inputs, - hf_config.video_token_id, - make_batched_videos, - "video", - image_processor, - prompt_token_ids, - min_pixels=min_pixels, - max_pixels=max_pixels) + if isinstance(video_inputs, dict): + prompt_token_ids_with_video = [] + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + + # ensure all video tokens have grid_thw + assert \ + len(video_indices) == video_inputs["video_grid_thw"].size(0), \ + "video token num does not match video_grid_thw.shape" + + video_counter = 0 + pad_token_counter = 0 + for idx, token in enumerate(prompt_token_ids): + if idx in video_indices: + grid_thw = video_inputs["video_grid_thw"][video_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) + prompt_token_ids_with_video.extend([token] * + num_pad_tokens) + video_counter += 1 + pad_token_counter += num_pad_tokens + else: + prompt_token_ids_with_video.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == video_inputs["video_embeds"].size(0), \ + "video_embeds.shape does not match video_grid_thw" + + prompt_token_ids = prompt_token_ids_with_video + else: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, + "video", + image_processor, + prompt_token_ids, + 
min_pixels=min_pixels, + max_pixels=max_pixels) prompt = inputs.get("prompt") if prompt is None: @@ -966,15 +1064,12 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -986,13 +1081,11 @@ def __init__( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), - prefix="visual", + prefix=maybe_prefix(prefix, "visual"), ) - self.model = Qwen2Model(config, - cache_config, - quant_config, - prefix="model") + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: if config.tie_word_embeddings: @@ -1001,12 +1094,18 @@ def __init__( self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix="lm_head") + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1056,49 +1155,71 @@ def _parse_and_validate_image_input( f"Got type: {type(pixel_values)}") return Qwen2VLImagePixelInputs(type="pixel_values", - data=pixel_values, + pixel_values=pixel_values, image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. 
" f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", - data=image_embeds) + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) - if pixel_values_videos is None: + if pixel_values_videos is None and video_embeds is None: return None - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Qwen2VLVideoInputs( - pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. " + f"Got type: {type(video_embeds)}") + return Qwen2VLVideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: if image_input["type"] == "image_embeds": - return image_input["data"].type(self.visual.dtype) + return image_input["image_embeds"].type(self.visual.dtype) - pixel_values = image_input["data"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].type(self.visual.dtype) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) video_embeds = self.visual(pixel_values_videos, @@ -1205,6 +1326,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..f172c06c4a26a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -109,6 +109,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } _MULTIMODAL_MODELS = { diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 
ffabac8292dbd..4f03ca501fb68 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -53,7 +53,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class SolarMLP(nn.Module): @@ -266,15 +267,14 @@ def forward( @support_torch_compile class SolarModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -409,25 +409,17 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.model = SolarModel( - config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 975d316977c37..1125f9e9f9617 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -43,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class StablelmMLP(nn.Module): @@ -193,12 +194,13 @@ def forward( class StableLMEpochModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '') -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -245,18 +247,14 @@ def forward( class StablelmForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = StableLMEpochModel(config, cache_config, quant_config) + self.model = StableLMEpochModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index ae61aa4e248a5..ce7a7957f52c4 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -43,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Starcoder2Attention(nn.Module): @@ -195,12 +196,13 @@ def forward( @support_torch_compile class Starcoder2Model(nn.Module): - def __init__(self, - config: Starcoder2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -245,19 +247,13 @@ def forward( class Starcoder2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config - self.model = Starcoder2Model(config, - cache_config, - quant_config=quant_config) + self.model = Starcoder2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.unpadded_vocab_size = config.vocab_size if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d47f0091e0f9f..9fde22c016de0 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -34,7 +34,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, + init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings_from_map) _AUDIO_PLACEHOLDER_TOKEN = 128002 @@ -339,11 +339,7 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_ultravox) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config @@ -354,6 +350,8 @@ def __init__( self.secondary_weights = [] self.audio_tower = ModifiedWhisperEncoder(config.audio_config) if config.audio_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source( model_or_path=config.audio_model_id, @@ -362,8 
+360,12 @@ def __init__( )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config, prefix="language_model") + config.text_config, + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) if config.text_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source(model_or_path=config.text_model_id, revision=None, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ca4fc8ec952bf..1d51885f9094a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -14,8 +14,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models import ModelRegistry -from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors +from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available @@ -240,12 +239,9 @@ def init_vllm_registered_model( Helper function to initialize an inner model registered to vLLM, based on the arguments passed to the outer vLLM model. """ - model_class, _ = ModelRegistry.resolve_model_cls(hf_config.architectures) - - return model_class( - vllm_config=vllm_config.with_hf_config(hf_config), - prefix=prefix, - ) + from vllm.model_executor.model_loader.loader import _initialize_model + vllm_config = vllm_config.with_hf_config(hf_config) + return _initialize_model(vllm_config, prefix) @overload diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 7afb99176077b..153527da20d75 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -46,7 +46,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class XverseMLP(nn.Module): @@ -223,11 +224,7 @@ def forward( @support_torch_compile class XverseModel(nn.Module): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -315,15 +312,10 @@ class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -331,7 +323,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = XverseModel(config, cache_config, quant_config) + self.model = XverseModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 14911853abc73..03a5f3a91f7a1 100644 --- 
a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,7 +1,8 @@ -from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalKwargs, - MultiModalPlaceholderDict, MultiModalPlaceholderMap, - MultiModalPlugin, NestedTensors) +from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .inputs import (BatchedTensorInputs, MultiModalData, + MultiModalDataBuiltins, MultiModalDataDict, + MultiModalKwargs, MultiModalPlaceholderDict, + NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -15,6 +16,7 @@ __all__ = [ "BatchedTensorInputs", + "MultiModalData", "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalKwargs", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index e71ae5feec1c6..1a230602966d4 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,7 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin + +from .base import MultiModalPlugin +from .inputs import AudioItem, MultiModalData, MultiModalKwargs class AudioPlugin(MultiModalPlugin): @@ -8,8 +10,12 @@ class AudioPlugin(MultiModalPlugin): def get_data_key(self) -> str: return "audio" - def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalKwargs: + def _default_input_mapper( + self, + ctx: InputContext, + data: MultiModalData[AudioItem], + **mm_processor_kwargs, + ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fa514d3fcb3b7..6eec660e42ac4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,180 +1,23 @@ from abc import ABC, abstractmethod -from collections import UserDict, defaultdict -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, - NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar, - Union, cast, final) - -import numpy as np -import torch -import torch.types -from PIL import Image +from collections import defaultdict +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Sequence, Tuple, Type, TypeVar, Union) + from torch import nn -from typing_extensions import TypeAlias from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of, - json_map_leaves, resolve_mm_processor_kwargs) +from vllm.utils import (get_allowed_kwarg_only_overrides, + resolve_mm_processor_kwargs) if TYPE_CHECKING: from vllm.config import ModelConfig from vllm.sequence import SequenceGroupMetadata -logger = init_logger(__name__) - -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] -""" -Uses a list instead of a tensor if the dimensions of each element do not match. -""" - -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] -""" -A dictionary containing nested tensors which have been batched via -:meth:`MultiModalKwargs.batch`. -""" - - -class _MultiModalKwargsBase(UserDict[str, NestedTensors]): - pass - - -class MultiModalKwargs(_MultiModalKwargsBase): - """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - """ - - @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: - """ - Recursively stacks lists of tensors when they all have the same shape. 
- """ - if isinstance(nested_tensors, torch.Tensor): - return nested_tensors - - if isinstance(nested_tensors, np.ndarray): - return torch.from_numpy(nested_tensors) - - if isinstance(nested_tensors, (int, float)): - return torch.tensor(nested_tensors) +from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, + PlaceholderRange) - stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] - if not is_list_of(stacked, torch.Tensor, check="all"): - # Only tensors (not lists) can be stacked. - return stacked - - tensors_ = cast(List[torch.Tensor], stacked) - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ - - return torch.stack(tensors_) - - @staticmethod - def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: - """ - Batch multiple inputs together into a dictionary. - - The resulting dictionary has the same keys as the inputs. - If the corresponding value from each input is a tensor and they all - share the same shape, the output value is a single batched tensor; - otherwise, the output value is a list containing the original value - from each input. - """ - if len(inputs_list) == 0: - return {} - - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) - - for inputs in inputs_list: - # For models that supports multiple modalities (e.g. Qwen2-VL), - # different modalities will return different data keys, - # so batch() should skip the same key check. - - for k, v in inputs.items(): - item_lists[k].append(v) - - return { - k: MultiModalKwargs._try_stack(item_list) - for k, item_list in item_lists.items() - } - - @staticmethod - def as_kwargs( - batched_inputs: BatchedTensorInputs, - *, - device: torch.types.Device, - ) -> BatchedTensorInputs: - json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - - json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), - json_inputs, - ) - - return cast(BatchedTensorInputs, json_mapped) - - -_T = TypeVar("_T") - -MultiModalData: TypeAlias = Union[_T, List[_T]] -""" -Either a single data instance, or a list of data instances. - -The number of data instances allowed per modality is restricted by -`--limit-mm-per-prompt`. -""" - - -@final -class MultiModalDataBuiltins(TypedDict, total=False): - """Modality types that are predefined by vLLM.""" - - image: MultiModalData[Image.Image] - """The input image(s).""" - - audio: MultiModalData[Tuple[np.ndarray, Union[int, float]]] - """The input audio item(s) and corresponding sampling rate(s).""" - - video: MultiModalData[Tuple[np.ndarray]] - """The input video(s).""" - - -MultiModalDataDict = Union[MultiModalDataBuiltins, - Mapping[str, MultiModalData[object]]] -""" -A dictionary containing an item for each modality type to input. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin is registered - through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" - - -class PlaceholderRange(TypedDict): - """ - Placeholder location information for multi-modal data. - - For example: - Prompt: AAAA BBBB What is in these images? 
- Images A and B will have: - A: { "offset": 0, "length": 4 } - B: { "offset": 5, "length": 4 } - """ - - offset: int - """The start index of the placeholder in the prompt.""" - - length: int - """The length of the placeholder.""" - - -MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]] -""" -A dictionary containing placeholder ranges. -""" +logger = init_logger(__name__) MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], MultiModalKwargs] @@ -192,6 +35,7 @@ class PlaceholderRange(TypedDict): model. This does not include tokens that correspond to the input text. """ +_T = TypeVar("_T") N = TypeVar("N", bound=Type[nn.Module]) @@ -224,7 +68,7 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[Any], **mm_processor_kwargs, ) -> MultiModalKwargs: """ @@ -273,8 +117,8 @@ def wrapper(model_cls: N) -> N: def map_input( self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any], + data: MultiModalData[Any], + mm_processor_kwargs: Optional[Dict[str, Any]], ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the @@ -289,6 +133,7 @@ def map_input( - :ref:`input_processing_pipeline` - :ref:`enabling_multimodal_inputs` """ + # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture @@ -300,6 +145,9 @@ def map_input( raise KeyError(f"No input mapper in {self} is registered for " f"model class {model_cls.__name__}.") + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + # In the case of the default mapper, we have to get resource # processor through its HuggingFace autoclass; since this goes # through **kwargs, we can't inspect it the same way, so we allow @@ -508,7 +356,7 @@ def append_items_from_seq_group( self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange], + multi_modal_placeholders: Sequence[PlaceholderRange], ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 589b46266b08d..97bbce1ce1570 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,14 +3,14 @@ import torch from PIL import Image -from transformers.image_processing_base import BatchFeature from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin +from .base import MultiModalPlugin +from .inputs import ImageItem, MultiModalData, MultiModalKwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -41,15 +41,11 @@ def _get_hf_image_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[ImageItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config - # Processed by input processor - if isinstance(data, BatchFeature): - return MultiModalKwargs(data.data) - # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): image_processor = self._get_hf_image_processor( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py new file mode 100644 index 0000000000000..64a4c58d5509c --- /dev/null +++ b/vllm/multimodal/inputs.py @@ -0,0 +1,225 @@ +from collections import UserDict, defaultdict +from typing import (Any, Dict, 
List, Literal, Mapping, Sequence, Tuple, + TypedDict, TypeVar, Union, cast, final) + +import numpy as np +import torch +import torch.types +from PIL.Image import Image +from typing_extensions import TypeAlias + +from vllm.utils import JSONTree, is_list_of, json_map_leaves + +_T = TypeVar("_T") + +# yapf: disable +ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image, +which can be passed to a HuggingFace :code:`ImageProcessor`. +""" + +VideoItem: TypeAlias = Union[ + List[Image], + np.ndarray, + torch.Tensor, + List[np.ndarray], + List[torch.Tensor], +] +""" + +A :class:`transformers.image_utils.VideoInput` representing a single video, +which can be passed to a HuggingFace :code:`VideoProcessor`. +""" + +AudioItem: TypeAlias = Union[ + np.ndarray, + List[float], + Tuple[np.ndarray, float], # DEPRECATED: Use mm_processor_kwargs instead +] +""" +Represents a single audio that can be inputted to a HuggingFace +:code:`AudioProcessor`. +""" +# yapf: enable + +MultiModalData: TypeAlias = Union[_T, List[_T]] +""" +Either a single data item, or a list of data items. + +The number of data items allowed per modality is restricted by +:code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalData[ImageItem] + """The input image(s).""" + + video: MultiModalData[VideoItem] + """The input video(s).""" + + audio: MultiModalData[AudioItem] + """The input audio(s).""" + + +MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + For example: + Prompt: AAAA BBBB What is in these images? + Images A and B will have: + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] +""" +Uses a list instead of a tensor if the dimensions of each element do not match. +""" + +BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] +""" +A dictionary containing nested tensors which have been batched via +:meth:`MultiModalKwargs.batch`. +""" + + +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. + """ + + @staticmethod + def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + """ + Stack the inner dimensions that have the same shape in + a nested list of tensors. + + Thus, a dimension represented by a list means that the inner + dimensions are different for each element along that dimension. 
+ """ + if isinstance(nested_tensors, torch.Tensor): + return nested_tensors + + # TODO: Remove these once all models have been migrated + if isinstance(nested_tensors, np.ndarray): + return torch.from_numpy(nested_tensors) + if isinstance(nested_tensors, (int, float)): + return torch.tensor(nested_tensors) + + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + if not is_list_of(stacked, torch.Tensor, check="all"): + # Only tensors (not lists) can be stacked. + return stacked + + tensors_ = cast(List[torch.Tensor], stacked) + if any(t.shape != tensors_[0].shape for t in tensors_): + # The tensors have incompatible shapes and can't be stacked. + return tensors_ + + return torch.stack(tensors_) + + @staticmethod + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + """ + Batch multiple inputs together into a dictionary. + + The resulting dictionary has the same keys as the inputs. + If the corresponding value from each input is a tensor and they all + share the same shape, the output value is a single batched tensor; + otherwise, the output value is a list containing the original value + from each input. + """ + if len(inputs_list) == 0: + return {} + + # We need to consider the case where each item in the batch + # contains different modalities (i.e. different keys). + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + + for inputs in inputs_list: + for k, v in inputs.items(): + item_lists[k].append(v) + + return { + k: MultiModalKwargs._try_stack(item_list) + for k, item_list in item_lists.items() + } + + @staticmethod + def as_kwargs( + batched_inputs: BatchedTensorInputs, + *, + device: torch.types.Device, + ) -> BatchedTensorInputs: + json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + + json_mapped = json_map_leaves( + lambda x: x.to(device, non_blocking=True), + json_inputs, + ) + + return cast(BatchedTensorInputs, json_mapped) + + +MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +""" +A dictionary containing placeholder ranges. +""" + + +class MultiModalInputsV2(TypedDict): + """ + Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + ready to be passed to vLLM internals. + """ + + type: Literal["multimodal"] + """The type of inputs.""" + + prompt: str + """ + The original, unprocessed prompt text. + + Note: + Since prompt text is not required by vLLM internals, we leave this + unprocessed to save CPU computation. You can still call + :code:`tokenizer.decode(prompt_token_ids)` to get the processed text. + """ + + prompt_token_ids: List[int] + """The processed token IDs which includes placeholder tokens.""" + + mm_kwargs: MultiModalKwargs + """Keyword arguments to be directly passed to the model after batching.""" + + mm_placeholders: MultiModalPlaceholderDict + """ + For each modality, information about the placeholder tokens in + :code:`prompt_token_ids`. 
+    """
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
new file mode 100644
index 0000000000000..88a924da174a6
--- /dev/null
+++ b/vllm/multimodal/processing.py
@@ -0,0 +1,273 @@
+from dataclasses import dataclass
+from functools import lru_cache, partial
+from typing import (Any, Callable, Collection, Generic, List, Mapping,
+                    Optional, TypedDict, TypeVar, final)
+
+from transformers import BatchFeature
+from typing_extensions import TypeAlias
+
+from vllm.inputs import InputProcessingContext
+from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer
+from vllm.utils import is_list_of
+
+from .inputs import (AudioItem, ImageItem, MultiModalDataDict,
+                     MultiModalInputsV2, MultiModalKwargs, PlaceholderRange,
+                     VideoItem)
+
+_T = TypeVar("_T")
+
+ReplacementFunc: TypeAlias = Callable[[_T, BatchFeature, int], List[int]]
+"""
+Given the original data item, HF-processed data, and index of the processed
+item, output the replacement token IDs to be allocated in vLLM.
+"""
+
+
+@dataclass
+class ModalityProcessingMetadata(Generic[_T]):
+    placeholder_replacements: Mapping[str, ReplacementFunc]
+    """
+    A dictionary where each item represents the original placeholder in the
+    prompt text and the corresponding replacement.
+    """
+
+
+class MultiModalProcessingMetadataBuiltins(TypedDict, total=False):
+    """Type annotations for modality types predefined by vLLM."""
+
+    image: ModalityProcessingMetadata[ImageItem]
+    video: ModalityProcessingMetadata[VideoItem]
+    audio: ModalityProcessingMetadata[AudioItem]
+
+
+MultiModalProcessingMetadata: TypeAlias = \
+    Mapping[str, ModalityProcessingMetadata[Any]]
+"""
+A dictionary containing an entry for each modality type to process.
+
+Note:
+    This dictionary also accepts modality keys defined outside
+    :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin
+    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+    Read more on that :ref:`here `.
+"""
+
+MultiModalMultiData: TypeAlias = List[_T]
+"""
+A list of data items, where the number of data items allowed
+per modality is restricted by :code:`--limit-mm-per-prompt`.
+"""
+
+
+@final
+class MultiModalMultiDataBuiltins(TypedDict, total=False):
+    """Type annotations for modality types predefined by vLLM."""
+
+    image: MultiModalMultiData[ImageItem]
+    """The input images."""
+
+    video: MultiModalMultiData[VideoItem]
+    """The input videos."""
+
+    audio: MultiModalMultiData[AudioItem]
+    """The input audios."""
+
+
+MultiModalMultiDataDict: TypeAlias = Mapping[str, MultiModalMultiData[Any]]
+"""
+A dictionary containing an entry for each modality type to input.
+
+Note:
+    This dictionary also accepts modality keys defined outside
+    :class:`MultiModalMultiDataBuiltins` as long as a customized plugin
+    is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.
+    Read more on that :ref:`here `.
+"""
+
+
+def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict:
+    """
+    Convert a :class:`MultiModalDataDict` containing single data items
+    to a :class:`MultiModalMultiDataDict` containing multiple data items
+    per entry.
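+
+    For example (illustrative), :code:`{"image": img}` becomes
+    :code:`{"image": [img]}`, while an entry whose value is already a
+    list is passed through unchanged.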
+    """
+    multi_data: Mapping[str, MultiModalMultiData[Any]] = {}
+
+    for k, v in data.items():
+        # yapf: disable
+        if k == "video":
+            # Special case since even a single item can be a list
+            multi_data[k] = v if is_list_of(v, list) else [v]  # type: ignore[index]
+        elif k in ("image", "audio"):
+            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+        else:
+            multi_data[k] = v if isinstance(v, list) else [v]  # type: ignore[index]
+        # yapf: enable
+
+    return multi_data
+
+
+def encode_no_special_tokens(
+    tokenizer: AnyTokenizer,
+    text: str,
+) -> List[int]:
+    """
+    Backend-agnostic equivalent of HF's
+    :code:`tokenizer.encode(text, add_special_tokens=False)`.
+    """
+    if isinstance(tokenizer, MistralTokenizer):
+        return tokenizer.tokenizer.encode(text, bos=False, eos=False)
+
+    return tokenizer.encode(text, add_special_tokens=False)
+
+
+@lru_cache
+def candidate_placeholders(
+    tokenizer: AnyTokenizer,
+    placeholder_text: str,
+) -> Collection[List[int]]:
+    """Generate token ID sequences that may represent a placeholder text."""
+    # When the placeholder text is not mapped to a special token ID,
+    # it may be tokenized differently based on whether it is at the start/end
+    # of the string. So, we go through each combination of whether the text
+    # is at the start and end boundaries of the string
+
+    # Matches the placeholder when it is in the middle of the string
+    start_id, = encode_no_special_tokens(tokenizer, "a")
+    end_id, = encode_no_special_tokens(tokenizer, "b")
+
+    candidate_basic = encode_no_special_tokens(tokenizer, placeholder_text)
+
+    start_id_, *candidate_a = encode_no_special_tokens(
+        tokenizer,
+        f"a{placeholder_text}",
+    )
+    assert start_id == start_id_
+
+    start_id_, *candidate_ab, end_id_ = encode_no_special_tokens(
+        tokenizer,
+        f"a{placeholder_text}b",
+    )
+    assert start_id == start_id_ and end_id == end_id_
+
+    *candidate_b, end_id_ = encode_no_special_tokens(
+        tokenizer,
+        f"{placeholder_text}b",
+    )
+    assert end_id == end_id_
+
+    # Remove duplicates (need to convert to tuple to be hashable)
+    unique_candidates = {
+        tuple(c)
+        for c in [candidate_basic, candidate_a, candidate_ab, candidate_b]
+    }
+
+    # Convert back to list
+    return [list(c) for c in unique_candidates]
+
+
+def apply_placeholders(
+    token_ids: List[int],
+    placeholder_ids: List[int],
+    get_replacement_ids: Callable[[], List[int]],
+) -> Optional[PlaceholderRange]:
+    """
+    Find the first occurrence of :code:`placeholder_ids`,
+    and replace it with the output of :code:`get_replacement_ids`.
+
+    This function updates :code:`token_ids` in place.
+    """
+    placeholder_length = len(placeholder_ids)
+
+    for start_idx in range(len(token_ids) - placeholder_length + 1):
+        # Compare (and replace) the full window starting at start_idx.
+        end_idx = start_idx + placeholder_length
+        if token_ids[start_idx:end_idx] == placeholder_ids:
+            token_ids[start_idx:end_idx] = get_replacement_ids()
+
+            return PlaceholderRange(offset=start_idx,
+                                    length=placeholder_length)
+
+    return None
+
+
+class MultiModalProcessor:
+    """
+    Helper class to process multi-modal inputs to be used in vLLM.
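+
+    At a high level, :meth:`apply` runs the HuggingFace processor on the
+    prompt and multi-modal data, then uses the registered
+    :class:`ModalityProcessingMetadata` to replace each placeholder token
+    sequence in the tokenized prompt and record its position.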
+    """
+
+    def __init__(
+        self,
+        ctx: InputProcessingContext,
+        metadata: MultiModalProcessingMetadata,
+    ) -> None:
+        super().__init__()
+
+        self.ctx = ctx
+        self.metadata = metadata
+
+    def __call__(
+        self,
+        prompt: str,
+        mm_data: MultiModalDataDict,
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        return self.apply(prompt, mm_data, mm_processor_kwargs)
+
+    def apply(
+        self,
+        prompt: str,
+        mm_data: MultiModalDataDict,
+        mm_processor_kwargs: Mapping[str, object],
+    ) -> MultiModalInputsV2:
+        tokenizer = self.ctx.tokenizer
+        hf_processor = self.ctx.get_hf_processor()
+
+        processed_inputs = hf_processor(
+            text=prompt,  # type: ignore
+            **mm_data,
+            **mm_processor_kwargs,
+        )
+        new_token_ids, = processed_inputs.pop("input_ids").tolist()
+        mm_kwargs = MultiModalKwargs(processed_inputs)
+
+        mm_placeholders: Mapping[str, List[PlaceholderRange]] = {}
+
+        for modality, orig_inputs in to_multi_format(mm_data).items():
+            assert isinstance(orig_inputs, list)
+
+            metadata = self.metadata[modality]
+            placeholder_replacements = metadata.placeholder_replacements
+
+            modality_placeholders: List[PlaceholderRange] = []
+
+            for item_idx, orig_item in enumerate(orig_inputs):
+                for match_text, replace_fn in placeholder_replacements.items():
+                    candidates = candidate_placeholders(tokenizer, match_text)
+                    get_replacement_ids = partial(
+                        replace_fn,
+                        orig_item,
+                        processed_inputs,
+                        item_idx,
+                    )
+
+                    for match_ids in candidates:
+                        # TODO(youkaichao): Don't update new_token_ids
+                        placeholders = apply_placeholders(
+                            new_token_ids,
+                            match_ids,
+                            get_replacement_ids,
+                        )
+
+                        if placeholders is not None:
+                            modality_placeholders.append(placeholders)
+
+            # yapf: disable
+            mm_placeholders[modality] = modality_placeholders  # type: ignore[index]
+            # yapf: enable
+
+        return MultiModalInputsV2(
+            type="multimodal",
+            prompt=prompt,
+            prompt_token_ids=new_token_ids,
+            mm_kwargs=mm_kwargs,
+            mm_placeholders=mm_placeholders,
+        )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index b844c9e1c2e89..b992442d3b314 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,13 +1,20 @@
 import functools
 from collections import UserDict
-from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional,
+                    Sequence, Type, TypeVar)
 
+import torch.nn as nn
+from typing_extensions import TypeAlias
+
+from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 from .audio import AudioPlugin
-from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs,
-                   MultiModalPlugin, MultiModalTokensCalc, NestedTensors)
+from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc
 from .image import ImagePlugin
+from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors
+from .processing import MultiModalProcessor
 from .video import VideoPlugin
 
 if TYPE_CHECKING:
@@ -15,8 +22,18 @@
 
 logger = init_logger(__name__)
 
+N = TypeVar("N", bound=Type[nn.Module])
+
+MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext],
+                                                 MultiModalProcessor]
+"""
+Constructs a :class:`MultiModalProcessor` instance from the context.
+
+The processing metadata should be derived from the context.
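+
+For example (a sketch; :code:`build_metadata` is a hypothetical helper), a
+model could register
+:code:`lambda ctx: MultiModalProcessor(ctx, build_metadata(ctx))`.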
+"""
+
 
-class _MultiModalLimits(UserDict):
+class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]):
     """
     Wraps `_limits_by_model` for a more informative error message
     when attempting to access a model that does not exist.
@@ -45,6 +62,9 @@ def __init__(
             plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None:
         self._plugins = {p.get_data_key(): p for p in plugins}
 
+        self._processor_factories: Dict[Type[nn.Module],
+                                        MultiModalProcessorFactory] = {}
+
         # This is used for non-multimodal models
         self._disabled_limits_per_plugin = {k: 0 for k in self._plugins}
 
@@ -243,3 +263,59 @@ def get_mm_limits_per_prompt(
         This should be called after :meth:`init_mm_limits_per_prompt`.
         """
         return self._limits_by_model[model_config]
+
+    def register_processor(
+        self,
+        factory: MultiModalProcessorFactory,
+    ):
+        """
+        Register a multi-modal processor to a model class.
+
+        When the model receives multi-modal data, the provided function is
+        invoked to transform the data into a dictionary of model inputs.
+
+        See also:
+            - :ref:`input_processing_pipeline`
+            - :ref:`enabling_multimodal_inputs`
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._processor_factories:
+                logger.warning(
+                    "Model class %s already has a processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._processor_factories[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def has_processor(self, model_config: "ModelConfig") -> bool:
+        """
+        Test whether a multi-modal processor is defined for a specific model.
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+        return model_cls in self._processor_factories
+
+    def create_processor(
+        self,
+        model_config: "ModelConfig",
+        tokenizer: AnyTokenizer,
+    ) -> MultiModalProcessor:
+        """
+        Create a multi-modal processor for a specific model and tokenizer.
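+
+        Note that the factory is looked up with a plain dictionary access,
+        so a :exc:`KeyError` is raised if no processor was registered for
+        the model class via :meth:`register_processor`.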
+        """
+
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+        processor_factory = self._processor_factories[model_cls]
+
+        ctx = InputProcessingContext(model_config, tokenizer)
+        return processor_factory(ctx)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index bee3c25dbd8dd..40194716bbf94 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -11,9 +11,10 @@
 import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
-from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer
 
+from .inputs import MultiModalDataDict, PlaceholderRange
+
 logger = init_logger(__name__)
 
 cached_get_tokenizer = lru_cache(get_tokenizer)
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index a518270974f92..ba9bf58a4a20c 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
 import numpy as np
 
@@ -9,8 +9,9 @@
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import is_list_of
 
-from .base import MultiModalData, MultiModalKwargs
+from .base import MultiModalData
 from .image import ImagePlugin
+from .inputs import MultiModalKwargs, VideoItem
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig
@@ -20,17 +21,6 @@
 cached_get_video_processor = lru_cache(get_video_processor)
 cached_get_tokenizer = lru_cache(get_tokenizer)
 
-VideoInput = Union[
-    "np.ndarray",  # single video input
-    List["np.ndarray"],
-    # TODO: support more types
-    # List[Image.Image], List[List[Image.Image]],
-    # "torch.Tensor",
-    # List["torch.Tensor"],
-    # List[List["np.ndarrray"]],
-    # List[List["torch.Tensor"]],
-]
-
 
 class VideoPlugin(ImagePlugin):
     """Plugin for video data."""
@@ -53,13 +43,13 @@ def _get_hf_video_processor(
     def _default_input_mapper(
         self,
         ctx: InputContext,
-        data: MultiModalData[object],
+        data: MultiModalData[VideoItem],
         **mm_processor_kwargs,
     ) -> MultiModalKwargs:
         model_config = ctx.model_config
 
         if isinstance(data, list) and len(data) == 1:
-            data = data[0]
+            data = data[0]  # type: ignore
 
         if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
             video_processor = self._get_hf_video_processor(
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 951976310e7ae..badf50d0602d6 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -83,10 +83,11 @@ class RequestOutput:
         finished: Whether the whole request is finished.
         metrics: Metrics associated with the request.
         lora_request: The LoRA request that was used to generate the output.
-        encoder_prompt: The encoder prompt string of the request;
-                        None if decoder-only
-        encoder_prompt_token_ids: The token IDs of the encoder prompt;
-                                  None if decoder-only
+        encoder_prompt: The encoder prompt string of the request.
+                        None if decoder-only.
+        encoder_prompt_token_ids: The token IDs of the encoder prompt.
+                                  None if decoder-only.
+        num_cached_tokens: The number of tokens with prefix cache hit.
     """
 
     def __init__(
@@ -101,6 +102,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         encoder_prompt: Optional[str] = None,
         encoder_prompt_token_ids: Optional[List[int]] = None,
+        num_cached_tokens: Optional[int] = None,
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
@@ -112,6 +114,37 @@ def __init__(
         self.lora_request = lora_request
         self.encoder_prompt = encoder_prompt
         self.encoder_prompt_token_ids = encoder_prompt_token_ids
+        self.num_cached_tokens = num_cached_tokens
+
+    @classmethod
+    def new(
+        cls,
+        request_id: str,
+        prompt: Optional[str],
+        prompt_token_ids: Optional[List[int]],
+        text: str,
+        token_ids: List[int],
+        finished: bool = False,
+    ) -> "RequestOutput":
+        """Initialize a new RequestOutput object."""
+
+        # TODO: Support `n` > 1.
+        completion_output = CompletionOutput(
+            index=0,
+            text=text,
+            token_ids=token_ids,
+            cumulative_logprob=None,
+            logprobs=None,  # TODO
+        )
+
+        return RequestOutput(
+            request_id=request_id,
+            prompt=prompt,
+            prompt_token_ids=prompt_token_ids,
+            prompt_logprobs=None,  # TODO
+            outputs=[completion_output],
+            finished=finished,
+        )
 
     @classmethod
     def from_seq_group(
@@ -162,6 +195,8 @@
 
         outputs = []
         include_prompt = True
+        # num_cached_tokens should be the same for all the sequences
+        num_cached_tokens = None
         for i, seq in enumerate(top_n_seqs):
             output_text = seq.get_output_text_to_return(
                 text_buffer_length, delta)
@@ -169,6 +204,7 @@
             output_token_ids = seq.get_output_token_ids_to_return(delta)
             num_output_tokens = 1 if isinstance(output_token_ids,
                                                 int) else len(output_token_ids)
+            num_cached_tokens = seq.data.get_num_cached_tokens()
 
             output_logprobs = seq.output_logprobs if include_logprobs else None
@@ -242,7 +278,7 @@
         init_args = (seq_group.request_id, prompt, prompt_token_ids,
                      prompt_logprobs, outputs, finished, seq_group.metrics,
                      seq_group.lora_request, encoder_prompt,
-                     encoder_prompt_token_ids)
+                     encoder_prompt_token_ids, num_cached_tokens)
 
         if use_cache:
             request_output = seq_group.cached_request_output
@@ -263,7 +299,8 @@ def __repr__(self) -> str:
                 f"outputs={self.outputs}, "
                 f"finished={self.finished}, "
                 f"metrics={self.metrics}, "
-                f"lora_request={self.lora_request})")
+                f"lora_request={self.lora_request}, "
+                f"num_cached_tokens={self.num_cached_tokens})")
 
 
 class EmbeddingRequestOutput:
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 7d7ddc7ec4447..3b41d25a2fe42 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -5,25 +5,21 @@
 from array import array
 from collections import defaultdict
 from dataclasses import dataclass, field
-from functools import cached_property, reduce
-from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List,
-                    Mapping, Optional)
+from functools import reduce
+from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union
 
 import msgspec
 import torch
-from typing_extensions import assert_never
 
+from vllm.inputs import SingletonInputs, SingletonInputsAdapter
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 
-if TYPE_CHECKING:
-    from vllm.inputs import SingletonInputs
-
 VLLM_TOKEN_ID_ARRAY_TYPE = "l"
 
 VLLM_INVALID_TOKEN_ID = -1
@@ -167,6 +163,8 @@ class SequenceData(msgspec.Struct,
                                    ...] = msgspec.field(default_factory=tuple)
     # The number of tokens that are computed (that run against the model).
     _num_computed_tokens: int = 0
+    # The number of tokens with prefix cache hit.
+    _num_cached_tokens: int = 0
     _stage: SequenceStage = SequenceStage.PREFILL
     _cached_all_token_ids: List[int] = msgspec.field(default_factory=list)
 
@@ -323,6 +321,14 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int):
         if self.get_num_uncomputed_tokens() == 0:
             self._stage = SequenceStage.DECODE
 
+    def get_num_cached_tokens(self) -> int:
+        """Return the number of tokens with prefix cache hit."""
+        return self._num_cached_tokens
+
+    def update_num_cached_tokens(self, num_cached_tokens: int):
+        """Update the number of tokens with prefix cache hit."""
+        self._num_cached_tokens = num_cached_tokens
+
     def reset_state_for_recompute(self) -> None:
         """Reset the number of computed tokens from this sequence. It is
         supposed to be called when a sequence needs to be started from
@@ -379,7 +385,7 @@ def __repr__(self) -> str:
 
 class Sequence:
     """Stores the data, status, and block information of a sequence.
-    
+
     The sequence is constructed from the :data:`DecoderOnlyInputs` (for
     decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder)
     instance passed in through the :code:`inputs` constructor argument.
@@ -397,14 +403,14 @@ class Sequence:
     def __init__(
         self,
        seq_id: int,
-        inputs: "SingletonInputs",
+        inputs: SingletonInputs,
         block_size: int,
         eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> None:
         self.seq_id = seq_id
-        self.inputs = inputs
+        self.inputs = SingletonInputsAdapter(inputs)
         self.block_size = block_size
         self.eos_token_id = eos_token_id
         self.lora_request = lora_request
@@ -431,59 +437,29 @@ def __init__(
     def n_blocks(self) -> int:
         return (self.get_len() + self.block_size - 1) // self.block_size
 
-    @cached_property
+    @property
     def prompt(self) -> Optional[str]:
-        inputs = self.inputs
-
-        if inputs["type"] == "token":
-            return inputs.get("prompt")
+        return self.inputs.prompt
 
-        assert_never(inputs)
-
-    @cached_property
+    @property
     def prompt_token_ids(self) -> List[int]:
-        inputs = self.inputs
-
-        if inputs["type"] == "token":
-            return inputs.get("prompt_token_ids", [])
+        return self.inputs.prompt_token_ids
 
-        assert_never(inputs)
-
-    @cached_property
+    @property
     def prompt_embeds(self) -> Optional[torch.Tensor]:
-        inputs = self.inputs
-
-        if inputs["type"] == "token":
-            return None
-
-        assert_never(inputs)
+        return self.inputs.prompt_embeds
 
-    @cached_property
+    @property
     def multi_modal_data(self) -> "MultiModalDataDict":
-        inputs = self.inputs
-
-        if inputs["type"] == "token":
-            return inputs.get("multi_modal_data", {})
-
-        assert_never(inputs)
-
-    @cached_property
-    def mm_processor_kwargs(self) -> Dict[str, Any]:
-        inputs = self.inputs
-
-        if inputs["type"] == "token":
-            return inputs.get("mm_processor_kwargs", {})
-
-        assert_never(inputs)
+        return self.inputs.multi_modal_data
 
     @property
     def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
-        inputs = self.inputs
+        return self.inputs.multi_modal_placeholders
 
-        if inputs["type"] == "token":
-            return inputs.get("multi_modal_placeholders", {})
-
-        assert_never(inputs)
+    @property
+    def mm_processor_kwargs(self) -> Dict[str, Any]:
+        return self.inputs.mm_processor_kwargs
 
     @property
     def lora_int_id(self) -> int:
@@ -906,7 +882,7 @@ class SequenceGroupMetadata(
         multi_modal_data: Multi modal data.
         mm_processor_kwargs: Multimodal input processor / mapper overrides.
         encoder_seq_data: Optional sequence data for encoder prompt
-                          (SequenceGroup.encoder_seq). Should be None 
+                          (SequenceGroup.encoder_seq). Should be None
                           unless you are working with an encoder/decoder
                           model.
         cross_block_table: Optional cross-attention block table associated
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 14d9518364d26..054845584c2ef 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -234,6 +234,9 @@ def get_config(
 
     patch_rope_scaling(config)
 
+    if trust_remote_code:
+        maybe_register_config_serialize_by_value()
+
     return config
 
 
@@ -389,33 +392,39 @@ def get_sentence_transformer_tokenizer_config(model: str,
     return None
 
 
-def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None:
+def maybe_register_config_serialize_by_value() -> None:
     """Try to register HF model configuration class to serialize by value
 
-    With trust_remote_code, the config class is typically an instance of a
-    custom class imported from the HF modules cache. The class will not be
-    importable in spawned workers by default (and won't exist at all on
-    other nodes), which breaks serialization of the config.
+    If trust_remote_code is set, and the model's config file specifies an
+    `AutoConfig` class, then the config class is typically an instance of
+    a custom class imported from the HF modules cache.
+
+    Examples:
+
+    >>> from transformers import AutoConfig
+    >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True)
+    >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig
+    >>> import transformers_modules # error, not initialized
+    >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True)
+    >>> import transformers_modules # success, initialized
+    >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config
+
+    In the DeepSeek example, the config class is an instance of a custom
+    class that is not serializable by default. This class will not be
+    importable in spawned workers, and won't exist at all on
+    other nodes, which breaks serialization of the config.
 
     In this function we tell the cloudpickle serialization library to pass
     instances of these generated classes by value instead of by reference,
     i.e. the class definition is serialized along with its data so that the
-    class module does not need to be importable on the receiving end. This
-    registration only works if the modules cache has already been
-    initialized.
-
+    class module does not need to be importable on the receiving end.
 
     See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs
-    """
-    if not trust_remote_code:
-        return
-
+    """ # noqa
     try:
        import transformers_modules
    except ImportError:
-        logger.debug("Could not import transformers_modules used for remote"
-                     " code. If remote code is not needed remove"
-                     " `--trust-remote-code`.")
+        # the config does not need trust_remote_code
        return
 
    try:
@@ -428,19 +437,19 @@ class module does not need to be importable on the receiving end. This
         ray.cloudpickle.register_pickle_by_value(transformers_modules)
 
         # multiprocessing uses pickle to serialize arguments when using spawn
-        # Here we get pickle to use cloudpickle to serialize ModelConfig objects
+        # Here we get pickle to use cloudpickle to serialize config objects
         # that contain instances of the custom config class to avoid
         # serialization problems if the generated module (and model) has a `.`
         # in its name
         import multiprocessing
         import pickle
 
-        from vllm.config import ModelConfig
+        from vllm.config import VllmConfig
 
-        def _reduce_modelconfig(mc: ModelConfig):
-            return (pickle.loads, (cloudpickle.dumps(mc), ))
+        def _reduce_config(config: VllmConfig):
+            return (pickle.loads, (cloudpickle.dumps(config), ))
 
-        multiprocessing.reducer.register(ModelConfig, _reduce_modelconfig)
+        multiprocessing.reducer.register(VllmConfig, _reduce_config)
 
     except Exception as e:
         logger.warning(
diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py
index 39220992aa23a..390a47a24b691 100644
--- a/vllm/transformers_utils/s3_utils.py
+++ b/vllm/transformers_utils/s3_utils.py
@@ -1,11 +1,11 @@
 from __future__ import annotations
+
 import fnmatch
 import os
 import shutil
+import signal
 import tempfile
 from pathlib import Path
-from typing import List, Optional, Tuple
-import signal
 
 import boto3
@@ -14,35 +14,36 @@ def is_s3(model_or_path: str) -> bool:
     return model_or_path.lower().startswith('s3://')
 
 
-def _filter_allow(paths: List[str], patterns: List[str]) -> List[str]:
+def _filter_allow(paths: list[str], patterns: list[str]) -> list[str]:
     return [
         path for path in paths if any(
             fnmatch.fnmatch(path, pattern) for pattern in patterns)
     ]
 
 
-def _filter_ignore(paths: List[str], patterns: List[str]) -> List[str]:
+def _filter_ignore(paths: list[str], patterns: list[str]) -> list[str]:
     return [
         path for path in paths
         if not any(fnmatch.fnmatch(path, pattern) for pattern in patterns)
     ]
 
 
-def glob(s3 = None,
-         path: str = "",
-         allow_pattern: Optional[List[str]] = None) -> List[str]:
+def glob(s3=None,
+         path: str = "",
+         allow_pattern: list[str] | None = None) -> list[str]:
     if s3 is None:
         s3 = boto3.client("s3")
-    bucket_name, _, paths = list_files(
-        s3, path=path, allow_pattern=allow_pattern)
+    bucket_name, _, paths = list_files(s3,
+                                       path=path,
+                                       allow_pattern=allow_pattern)
     return [f"s3://{bucket_name}/{path}" for path in paths]
 
+
 def list_files(
-        s3,
-        path: str,
-        allow_pattern: Optional[List[str]] = None,
-        ignore_pattern: Optional[List[str]] = None
-) -> Tuple[str, str, List[str]]:
+    s3,
+    path: str,
+    allow_pattern: list[str] | None = None,
+    ignore_pattern: list[str] | None = None) -> tuple[str, str, list[str]]:
     parts = path.removeprefix('s3://').split('/')
     prefix = '/'.join(parts[1:])
     bucket_name = parts[0]
@@ -61,6 +62,7 @@ def list_files(
 
 
 class S3Model:
+
     def __init__(self) -> None:
         self.s3 = boto3.client('s3')
         for sig in (signal.SIGINT, signal.SIGTERM):
@@ -76,21 +78,23 @@ def close(self) -> None:
             shutil.rmtree(self.dir)
 
     def close_by_signal(self, existing_handler=None):
+
         def new_handler(signum, frame):
             self.close()
             if existing_handler:
                 existing_handler(signum, frame)
+
         return new_handler
 
     def pull_files(self,
                    s3_model_path: str = "",
-                   allow_pattern: Optional[List[str]] = None,
-                   ignore_pattern: Optional[List[str]] = None) -> str:
+                   allow_pattern: list[str] | None = None,
+                   ignore_pattern: list[str] | None = None) -> None:
         bucket_name, base_dir, files = list_files(self.s3, s3_model_path,
-                                                   allow_pattern,
-                                                   ignore_pattern)
+                                                  allow_pattern,
+                                                  ignore_pattern)
         if len(files) == 0:
-            return self.dir
+            return
 
         for file in files:
             destination_file = self.dir + file.removeprefix(base_dir)
diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py
index 3f57c22e1f2e4..568185383aa5c 100644
--- a/vllm/triton_utils/__init__.py
+++ b/vllm/triton_utils/__init__.py
@@ -6,6 +6,5 @@
 
     from vllm.triton_utils.custom_cache_manager import (
         maybe_set_triton_cache_manager)
-    from vllm.triton_utils.libentry import libentry
 
-    __all__ += ["maybe_set_triton_cache_manager", "libentry"]
+    __all__ += ["maybe_set_triton_cache_manager"]
diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py
deleted file mode 100644
index 4335c7adfc13b..0000000000000
--- a/vllm/triton_utils/libentry.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Copied From https://github.com/FlagOpen/FlagGems
-
-import inspect
-
-import triton
-
-
-class LibEntry(triton.KernelInterface):
-
-    def __init__(
-        self,
-        fn,
-    ):
-        self.fn = fn
-        self.arg_names = fn.arg_names
-        self.divisibility = 16
-        self.kernel_cache = dict()
-        fn = self.fn
-        while not isinstance(fn, triton.runtime.JITFunction):
-            fn = fn.fn
-        self.jit_function: triton.runtime.JITFunction = fn
-        self.specialize_indices = [
-            p.num for p in self.jit_function.params
-            if not p.is_constexpr and not p.do_not_specialize
-        ]
-        self.do_not_specialize_indices = [
-            p.num for p in self.jit_function.params
-            if not p.is_constexpr and p.do_not_specialize
-        ]
-
-    def key(self, spec_args, dns_args, const_args):
-        spec_key = [(arg.dtype, arg.data_ptr() %
-                     self.divisibility == 0) if hasattr(arg, "data_ptr") else
-                    (type(arg), arg) for arg in spec_args]
-        dns_key = [
-            arg.dtype if hasattr(
-                arg, "data_ptr") else type(arg) if not isinstance(arg, int)
-            else "i32" if arg >= -(2**31) and arg <= 2**31 -
-            1 else "u64" if arg >= 2**63 and arg <= 2**64 - 1 else "i64"
-            for arg in dns_args
-        ]
-        # const args passed by position
-        return tuple(spec_key + dns_key + const_args)
-
-    def run(self, *args, **kwargs):
-        grid = kwargs["grid"]
-        # collect all the arguments
-        spec_args = []  # specialize arguments
-        dns_args = []  # do not specialize arguments
-        const_args = []  # constexpr arguments
-        k_args = []  # kernel arguments
-        for i, arg in enumerate(args):
-            if i in self.specialize_indices:
-                k_args.append(arg)
-                spec_args.append(arg)
-            elif i in self.do_not_specialize_indices:
-                k_args.append(arg)
-                dns_args.append(arg)
-            else:
-                const_args.append(arg)
-        for p in self.jit_function.params[len(args):]:
-            if p.name in kwargs:
-                val = kwargs[p.name]
-            elif p.default is inspect._empty:
-                continue
-            else:
-                val = p.default
-
-            if p.is_constexpr:
-                const_args.append(val)
-            elif p.do_not_specialize:
-                dns_args.append(val)
-                k_args.append(val)
-            else:
-                spec_args.append(val)
-                k_args.append(val)
-
-        entry_key = self.key(spec_args, dns_args, const_args)
-
-        if entry_key not in self.kernel_cache:
-            # compile the kernel also completes the related computations
-            kernel = self.fn.run(*args, **kwargs)
-            fn = self.fn
-            # collect constexpr arguments for grid computation
-            constexprs = {}
-            while not isinstance(fn, triton.runtime.JITFunction):
-                if isinstance(fn, triton.runtime.Autotuner):
-                    config = fn.best_config
-                    constexprs["num_warps"] = config.num_warps
-                    constexprs["num_stages"] = config.num_stages
-                    constexprs["num_ctas"] = config.num_ctas
-                    constexprs = {**constexprs, **config.kwargs}
-                elif isinstance(fn, triton.runtime.Heuristics):
-                    for v, heur in fn.values.items():
-                        constexprs[v] = heur({
-                            **dict(zip(fn.arg_names, args)),
-                            **kwargs,
-                            **constexprs,
-                        })
-                else:
-                    raise RuntimeError("Invalid Runtime Function")
-                fn = fn.fn
-            # In vLLM, certain kernels like fused_moe_kernel get the
-            # best_config(as kwargs) from a configuration json file, rather
-            # than using Autotuner & Heuristics. Therefore, all their constexprs
-            # (tl.constexpr) are assigned values through the following loop.
-            for p in self.jit_function.params:
-                if p.is_constexpr and p.name not in constexprs:
-                    constexprs[p.name] = p.default  #default=inspect._empty
-            self.kernel_cache[entry_key] = (kernel, constexprs)
-        else:
-            # load kernel from cache directly
-            kernel, constexprs = self.kernel_cache[entry_key]
-
-        if callable(grid):
-            # collect all arguments to the grid fn,ie:
-            # 1. args,
-            # 2. kwargs,
-            # 3. all all other captured arguments in CompiledKernel from
-            # Autotunner & Heuristics when kwargs & captured args conflict,
-            # captured args have higher priority
-            # 4. We must filter out captured args with default value firstly
-            constexprs = {
-                k: v
-                for k, v in constexprs.items() if v is not inspect._empty
-            }
-            meta = {
-                **dict(zip(self.arg_names, args)),
-                **kwargs,
-                **constexprs,
-            }
-            grid = grid(meta)
-        if isinstance(grid, tuple):
-            grid = grid + (1, 1)
-        elif isinstance(grid, list):
-            grid = grid + [1, 1]
-        kernel[grid[0:3]](*k_args)
-        # maintaining the same return type as the JITFunction.run
-        return kernel
-
-
-def libentry():
-    """
-    Decorator for triton library entries.
-    Motivation:
-        The runtime overhead of Triton kernels is the reason for the lower
-        performance of small kernels, particularly evident with smaller models.
-        Using this decorator can reduce Triton runtime overhead.
-    How:
-        The `run` function of JITFunction needs to accomplish:
-            - Parameter binding using inspect
-            - KernelArg type wrapping
-            - Cache key calculation
-        When dealing with small size, these steps can become bottlenecks in
-        Triton runtime. Libentry simplifies these steps to reduce runtime
-        overhead, thereby improving the runtime expenses of small kernels.
-    NOTE:
-        When Triton is upgraded to version 3.0.0, libentry can be removed,
-        see: https://github.com/vllm-project/vllm/pull/5036#issuecomment-2243396245
-
-
-    """
-
-    def decorator(fn):
-        return LibEntry(fn)
-
-    return decorator
diff --git a/vllm/utils.py b/vllm/utils.py
index 13d7f6d475346..1b02cbff79f78 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -4,6 +4,7 @@
 import datetime
 import enum
 import gc
+import getpass
 import inspect
 import ipaddress
 import os
@@ -967,6 +968,8 @@ def enable_trace_function_call_for_thread() -> None:
     if envs.VLLM_TRACE_FUNCTION:
         tmp_dir = tempfile.gettempdir()
+        # add username to tmp_dir to avoid permission issues
+        tmp_dir = os.path.join(tmp_dir, getpass.getuser())
         filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
                     f"_thread_{threading.get_ident()}_"
                     f"at_{datetime.datetime.now()}.log").replace(" ", "_")
diff --git a/vllm/v1/__init__.py b/vllm/v1/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
new file mode 100644
index 0000000000000..845bd5ea05e3c
--- /dev/null
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -0,0 +1,48 @@
+from typing import Dict, List, Set, Tuple
+
+from vllm.v1.request import Request
+
+
+class EncoderCacheManager:
+
+    def __init__(self, cache_size: int):
+        self.cache_size = cache_size
+        self.num_free_slots = cache_size
+        # req_id -> cached input ids
+        self.cached: Dict[str, Set[int]] = {}
+        # List of [req_id, input_id]
+        self.freed: List[Tuple[str, int]] = []
+
+    def has_cache(self, request: Request, input_id: int) -> bool:
+        req_id = request.request_id
+        return req_id in self.cached and input_id in self.cached[req_id]
+
+    def can_allocate(self, request: Request, input_id: int) -> bool:
+        num_tokens = request.get_num_encoder_tokens(input_id)
+        return num_tokens <= self.num_free_slots
+
+    def allocate(self, request: Request, input_id: int) -> None:
+        req_id = request.request_id
+        if req_id not in self.cached:
+            self.cached[req_id] = set()
+        self.cached[req_id].add(input_id)
+        self.num_free_slots -= request.get_num_encoder_tokens(input_id)
+
+    def get_cached_input_ids(self, request: Request) -> Set[int]:
+        return self.cached.get(request.request_id, set())
+
+    def free(self, request: Request, input_id: int) -> None:
+        req_id = request.request_id
+        if req_id not in self.cached:
+            return
+
+        self.cached[req_id].discard(input_id)
+        if len(self.cached[req_id]) == 0:
+            del self.cached[req_id]
+        self.num_free_slots += request.get_num_encoder_tokens(input_id)
+        self.freed.append((req_id, input_id))
+
+    def get_freed_ids(self) -> List[Tuple[str, int]]:
+        freed = self.freed
+        self.freed = []
+        return freed
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 82094fb65dd1a..38f1c03a4d3ac 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -70,7 +70,7 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]:
 
         Args:
             request: The request to get the computed blocks.
-        
+
         Returns:
             A list of blocks that are computed for the request.
         """
@@ -105,7 +105,7 @@ def append_slots(
         Args:
             request: The request to append slots.
             num_tokens: The number of tokens to append.
-        
+
         Returns:
             A list of new blocks if new blocks are allocated, or None
             if new blocks are required but cannot be allocated.
@@ -176,7 +176,7 @@ def allocate_slots(
             num_tokens: The number of tokens to allocate. Note that this does
                 not include the tokens that have already been computed.
             computed_blocks: The blocks that have already been computed.
-        
+
         Returns:
             A list of new allocated blocks.
         """
@@ -240,7 +240,8 @@ def free(self, request: Request) -> None:
         Args:
             request: The request to free the blocks.
         """
-        blocks = self.req_to_blocks.pop(request.request_id)
+        # Default to [] in case a request is freed (aborted) before alloc.
+        blocks = self.req_to_blocks.pop(request.request_id, [])
         if self.enable_caching:
             # Free blocks in reverse order so that the tail blocks are
             # freed first.
@@ -259,13 +260,13 @@ def _get_new_blocks(
         """Get new blocks from the free block pool, and add token IDs to
         allocated blocks if caching is enabled.
         Note that we do not check block cache in this function.
-        
+
         Args:
             num_blocks: The number of blocks to allocate.
             token_ids: The token IDs in the blocks. None if caching is
                 disabled.
             parent_block: The parent block. Used to include block chain
                 in the block hash.
-        
+
         Returns:
             A list of new block.
         """
diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index a60f8b8138ecf..ba50a9786d805 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -1,15 +1,21 @@
 from collections import deque
 from dataclasses import dataclass
-from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
+from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
+                    Tuple, Union)
 
 from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalDataDict
 from vllm.sampling_params import SamplingParams
+from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
 from vllm.v1.core.kv_cache_manager import KVCacheManager
+from vllm.v1.engine import EngineCoreOutput
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
 
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalKwargs
+    from vllm.multimodal.base import PlaceholderRange
+
 logger = init_logger(__name__)
 
@@ -60,12 +66,20 @@ def __init__(
         # Request id -> RunningRequestData
         self.running_reqs_data: Dict[str, RunningRequestData] = {}
 
-    def schedule(self) -> "SchedulerOutput":
-        scheduled_new_reqs: List[Request] = []
-        scheduled_resumed_reqs: List[Request] = []
-        scheduled_running_reqs: List[Request] = []
-        preempted_reqs: List[Request] = []
+        # Encoder-related.
+        # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
+        # projector if needed). Currently, we assume that the encoder also
+        # has the Transformer architecture (e.g., ViT).
+        # FIXME(woosuk): Below are placeholder values. We need to calculate the
+        # actual values from the configurations.
+        self.max_num_encoder_input_tokens = 2048
+        # NOTE(woosuk): For the models without encoder (e.g., text-only models),
+        # the encoder cache will not be initialized and used, regardless of
+        # the cache size. This is because the memory space for the encoder cache
+        # is preallocated in the profiling run.
+        self.encoder_cache_manager = EncoderCacheManager(cache_size=2048)
 
+    def schedule(self) -> "SchedulerOutput":
         # NOTE(woosuk) on the scheduling algorithm:
         # There's no "decoding phase" nor "prefill phase" in the scheduler.
         # Each request just has the num_computed_tokens and num_tokens,
         # At each step, the scheduler tries to assign tokens to the requests
         # so that each request's num_computed_tokens can catch up its
         # num_tokens. This is general enough to cover chunked prefills,
-        # prefix caching, and the "jump forward" optimization in the future.
+        # prefix caching, and the "jump decoding" optimization in the future.
+
+        scheduled_new_reqs: List[Request] = []
+        scheduled_resumed_reqs: List[Request] = []
+        scheduled_running_reqs: List[Request] = []
+        preempted_reqs: List[Request] = []
 
         req_to_new_block_ids: Dict[str, List[int]] = {}
         num_scheduled_tokens: Dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
+        # Encoder-related.
+        scheduled_encoder_inputs: Dict[str, List[int]] = {}
+        encoder_budget = self.max_num_encoder_input_tokens
 
         # First, schedule the RUNNING requests.
+        # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be
+        # in the "partial" state, where the request has some tokens computed
+        # but not all. The constraint is due to the persistent batch in the
+        # V1 model runner.
+        # TODO(woosuk): Remove this constraint after refactoring model runner.
+        has_partial_request = False
         req_index = 0
         while req_index < len(self.running):
-            if token_budget == 0:
-                break
-
+            # Only the last request in the RUNNING queue can be "partial".
+            assert not has_partial_request
+            assert token_budget > 0
             request = self.running[req_index]
             num_new_tokens = request.num_tokens - request.num_computed_tokens
             num_new_tokens = min(num_new_tokens, token_budget)
             assert num_new_tokens > 0
 
+            # Schedule encoder inputs.
+            encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = (
+                self._try_schedule_encoder_inputs(request,
+                                                  request.num_computed_tokens,
+                                                  num_new_tokens,
+                                                  encoder_budget))
+            assert num_new_tokens > 0
+
             while True:
                 new_blocks = self.kv_cache_manager.append_slots(
                     request, num_new_tokens)
@@ -105,22 +141,40 @@
                     preempted_reqs.append(preempted_req)
                     if preempted_req == request:
                         # No more request to preempt.
+                        can_schedule = False
                         break
                 else:
                     # The request can be scheduled.
-                    scheduled_running_reqs.append(request)
-
-                    req_to_new_block_ids[request.request_id] = [
-                        b.block_id for b in new_blocks
-                    ]
-                    num_scheduled_tokens[request.request_id] = num_new_tokens
-                    token_budget -= num_new_tokens
-                    req_index += 1
+                    can_schedule = True
                     break
+            if not can_schedule:
+                break
+
+            # Schedule the request.
+            scheduled_running_reqs.append(request)
+            req_to_new_block_ids[request.request_id] = [
+                b.block_id for b in new_blocks
+            ]
+            num_scheduled_tokens[request.request_id] = num_new_tokens
+            token_budget -= num_new_tokens
+            req_index += 1
+            has_partial_request = (request.num_computed_tokens + num_new_tokens
+                                   < request.num_tokens)
+
+            # Encoder-related.
+            if encoder_inputs_to_schedule:
+                scheduled_encoder_inputs[request.request_id] = (
+                    encoder_inputs_to_schedule)
+                # Allocate the encoder cache.
+                for i in encoder_inputs_to_schedule:
+                    self.encoder_cache_manager.allocate(request, i)
+                encoder_budget = new_encoder_budget
 
         # Next, schedule the WAITING requests.
         if not preempted_reqs:
             while self.waiting:
+                if has_partial_request:
+                    break
                 if len(self.running) == self.max_num_running_reqs:
                     break
                 if token_budget == 0:
@@ -148,12 +202,21 @@
                     computed_blocks.pop()
                 num_new_tokens = min(num_new_tokens, token_budget)
                 assert num_new_tokens > 0
+
+                # Schedule encoder inputs.
+                (encoder_inputs_to_schedule, num_new_tokens,
+                 new_encoder_budget) = self._try_schedule_encoder_inputs(
+                     request, num_computed_tokens, num_new_tokens,
+                     encoder_budget)
+                if num_new_tokens == 0:
+                    # The request cannot be scheduled.
+                    break
+
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request, num_new_tokens, computed_blocks)
                 if new_blocks is None:
                     # The request cannot be scheduled.
                     break
-                request.num_computed_tokens = num_computed_tokens
 
                 self.waiting.popleft()
                 self.running.append(request)
@@ -171,6 +234,18 @@
                 num_scheduled_tokens[request.request_id] = num_new_tokens
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
+                request.num_computed_tokens = num_computed_tokens
+                has_partial_request = (num_computed_tokens + num_new_tokens <
+                                       request.num_tokens)
+
+                # Encoder-related.
+                if encoder_inputs_to_schedule:
+                    scheduled_encoder_inputs[request.request_id] = (
+                        encoder_inputs_to_schedule)
+                    # Allocate the encoder cache.
+                    for i in encoder_inputs_to_schedule:
+                        self.encoder_cache_manager.allocate(request, i)
+                    encoder_budget = new_encoder_budget
 
         # Check if the scheduling constraints are satisfied.
         total_num_scheduled_tokens = sum(num_scheduled_tokens.values())
@@ -204,12 +279,14 @@
             scheduled_running_reqs=running_reqs_data,
             num_scheduled_tokens=num_scheduled_tokens,
             total_num_scheduled_tokens=total_num_scheduled_tokens,
+            scheduled_encoder_inputs=scheduled_encoder_inputs,
             preempted_req_ids=preempted_req_ids,
             # finished_req_ids is an existing state in the scheduler,
             # instead of being newly scheduled in this step.
             # It contains the request IDs that are finished in between
             # the previous and the current steps.
             finished_req_ids=self.finished_req_ids,
+            free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(),
         )
 
         self.finished_req_ids = set()
@@ -233,17 +310,82 @@ def _make_running_request_data(
             self.running_reqs_data[request.request_id] = req_data
         return req_data
 
+    def _try_schedule_encoder_inputs(
+        self,
+        request: Request,
+        num_computed_tokens: int,
+        num_new_tokens: int,
+        encoder_budget: int,
+    ) -> Tuple[List[int], int, int]:
+        """
+        Determine which encoder inputs need to be scheduled in the current step,
+        and update `num_new_tokens` and encoder token budget accordingly.
+
+        An encoder input will be scheduled if:
+        - Its output tokens overlap with the range of tokens being computed
+          in this step, i.e.,
+          [num_computed_tokens, num_computed_tokens + num_new_tokens).
+        - It is not already computed and stored in the encoder cache.
+        - There is sufficient encoder token budget to process it.
+        - The encoder cache has space to store it.
+
+        If an encoder input cannot be scheduled due to cache or budget
+        limitations, the method adjusts `num_new_tokens` to schedule only the
+        decoder tokens up to just before the unschedulable encoder input.
+        """
+        if not request.has_encoder_inputs():
+            return [], num_new_tokens, encoder_budget
+
+        encoder_inputs_to_schedule: List[int] = []
+        mm_positions = request.mm_positions
+        assert mm_positions is not None
+        assert len(mm_positions) > 0
+        for i, pos_info in enumerate(mm_positions):
+            start_pos = pos_info["offset"]
+            num_encoder_tokens = pos_info["length"]
+
+            # The encoder output is needed if the two ranges overlap:
+            # [num_computed_tokens, num_computed_tokens + num_new_tokens) and
+            # [start_pos, start_pos + num_encoder_tokens)
+            if start_pos >= num_computed_tokens + num_new_tokens:
+                # The encoder input is not needed in this step.
+                break
+            if start_pos + num_encoder_tokens <= num_computed_tokens:
+                # The encoder input is already computed and stored
+                # in the decoder's KV cache.
+                continue
+
+            if self.encoder_cache_manager.has_cache(request, i):
+                # The encoder input is already computed and cached.
+                continue
+            if not self.encoder_cache_manager.can_allocate(request, i):
+                # The encoder cache is full. We can only schedule the decoder
+                # tokens just before the encoder input.
+                num_new_tokens = start_pos - num_computed_tokens
+                break
+            if num_encoder_tokens > encoder_budget:
+                # The encoder budget is exhausted. We can only schedule the
+                # decoder tokens up until the encoder input.
+                # NOTE(woosuk): We assume that the encoder tokens should be
+                # processed altogether, as the encoder usually uses
+                # bidirectional attention.
+                num_new_tokens = start_pos - num_computed_tokens
+                break
+
+            encoder_budget -= num_encoder_tokens
+            encoder_inputs_to_schedule.append(i)
+        return encoder_inputs_to_schedule, num_new_tokens, encoder_budget
+
     def update_from_output(
         self,
         scheduler_output: "SchedulerOutput",
         model_runner_output: "ModelRunnerOutput",
-    ) -> List[Tuple[Request, int]]:
+    ) -> List[EngineCoreOutput]:
         # NOTE(woosuk): This method doesn't consider speculative decoding.
         sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist()
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
         new_running: List[Request] = []
-        # (request, num_sampled_tokens)
-        sampled: List[Tuple[Request, int]] = []
+        engine_core_outputs: List[EngineCoreOutput] = []
         for request in self.running:
             req_id = request.request_id
             request.num_computed_tokens += num_scheduled_tokens[req_id]
@@ -251,23 +393,46 @@
             # the request generates output tokens. Otherwise, we ignore the
             # sampler output for the request.
             assert request.num_computed_tokens <= request.num_tokens
+
+            cached_encoder_input_ids = (
+                self.encoder_cache_manager.get_cached_input_ids(request))
+            for input_id in list(cached_encoder_input_ids):
+                start_pos = request.mm_positions[input_id]["offset"]
+                num_tokens = request.mm_positions[input_id]["length"]
+                if start_pos + num_tokens <= request.num_computed_tokens:
+                    # The encoder output is already processed and stored
+                    # in the decoder's KV cache.
+                    self.encoder_cache_manager.free(request, input_id)
+
             if request.num_computed_tokens == request.num_tokens:
                 req_index = model_runner_output.req_id_to_index[req_id]
                 # NOTE(woosuk): Currently, we assume that each request
                 # generates at most one token at each step.
                 token_id = sampled_token_ids[req_index]
                 request.append_output_token_ids(token_id)
-                sampled.append((request, 1))
+                num_new_tokens = 1
                 # TODO: Update the KV cache manager for prefix caching.
 
-                # Check if the request is finished.
+                # Check for stop and update request state.
+                # This must be called before we make the EngineCoreOutput.
                 stopped = self._check_stop(request)
+
+                # Add EngineCoreOutput for this Request.
+                output = EngineCoreOutput(
+                    request_id=req_id,
+                    new_token_ids=request.output_token_ids[-num_new_tokens:],
+                    finished=request.is_finished(),
+                    finish_reason=request.get_finished_reason(),
+                    stop_reason=request.stop_reason)
+                engine_core_outputs.append(output)
+
+                # Break out of the loop.
                 if stopped:
                     continue
 
             new_running.append(request)
         self.running = new_running
-        return sampled
+        return engine_core_outputs
 
     def _check_stop(self, request: Request) -> bool:
         if (request.num_tokens >= self.max_model_len
@@ -343,7 +508,8 @@ class NewRequestData:
 
     req_id: str
     prompt_token_ids: List[int]
     prompt: Optional[str]
-    multi_modal_data: Optional[MultiModalDataDict]
+    mm_inputs: List["MultiModalKwargs"]
+    mm_positions: List["PlaceholderRange"]
     sampling_params: SamplingParams
     block_ids: List[int]
     num_computed_tokens: int
@@ -357,9 +523,10 @@ def from_request(
     ) -> "NewRequestData":
         return cls(
             req_id=request.request_id,
-            prompt_token_ids=request.inputs["prompt_token_ids"],
-            prompt=request.inputs.get("prompt"),
-            multi_modal_data=request.inputs.get("multi_modal_data"),
+            prompt_token_ids=request.prompt_token_ids,
+            prompt=request.prompt,
+            mm_inputs=request.mm_inputs,
+            mm_positions=request.mm_positions,
             sampling_params=request.sampling_params,
             block_ids=block_ids,
             num_computed_tokens=num_computed_tokens,
@@ -417,6 +584,8 @@ class SchedulerOutput:
 
     num_scheduled_tokens: Dict[str, int]
     total_num_scheduled_tokens: int
+    scheduled_encoder_inputs: Dict[str, List[int]]
 
     preempted_req_ids: Set[str]
     finished_req_ids: Set[str]
+    free_encoder_input_ids: List[Tuple[str, int]]
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index e69de29bb2d1d..edfb8bd7c2fc1 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -0,0 +1,77 @@
+import enum
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Union
+
+import msgspec
+
+from vllm.lora.request import LoRARequest
+from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
+from vllm.sampling_params import RequestOutputKind, SamplingParams
+
+
+@dataclass
+class DetokenizerRequest:
+
+    request_id: str
+    prompt: Optional[str]
+    prompt_token_ids: List[int]
+    skip_special_tokens: bool
+    spaces_between_special_tokens: bool
+    output_kind: RequestOutputKind
+
+    stop: List[str]
+    include_stop_str_in_output: bool
+
+
+@dataclass
+class EngineCoreRequest:
+
+    # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput,
+    # but this object is currently not playing well with msgspec
+    # due to circular imports and typing we have in data.py
+
+    request_id: str
+    #NOTE(Nick): I don't think we need to pass prompt here since it should
+    # always be tokenized?
+    prompt: Optional[str]
+    prompt_token_ids: List[int]
+    mm_data: Optional[MultiModalDataDict]
+    mm_placeholders: Optional[MultiModalPlaceholderDict]
+    mm_processor_kwargs: Optional[Dict[str, Any]]
+    sampling_params: SamplingParams
+    eos_token_id: Optional[int]
+    arrival_time: float
+    lora_request: Optional[LoRARequest]
+
+
+class EngineCoreOutput(msgspec.Struct,
+                       array_like=True,
+                       omit_defaults=True,
+                       gc=False):
+
+    request_id: str
+    new_token_ids: List[int]
+    finished: bool
+    finish_reason: Optional[str] = None
+    stop_reason: Union[int, str, None] = None
+
+
+class EngineCoreOutputs(msgspec.Struct,
+                        array_like=True,
+                        omit_defaults=True,
+                        gc=False):
+
+    #NOTE(Nick): We could consider ways to make this more compact,
+    # e.g. columnwise layout and using an int enum for finish/stop reason
+
+    # [num_reqs]
+    outputs: List[EngineCoreOutput]
+
+
+class EngineCoreRequestType(enum.Enum):
+    """
+    Request types defined as hex byte strings, so they can be sent over
+    sockets without a separate encoding step.
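+
+    For example, :code:`EngineCoreRequestType.ADD.value` is the single raw
+    byte :code:`b'\x00'`, which can be used directly as a message frame
+    prefix.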
+    """
+    ADD = b'\x00'
+    ABORT = b'\x01'
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
new file mode 100644
index 0000000000000..09bff9655a882
--- /dev/null
+++ b/vllm/v1/engine/async_llm.py
@@ -0,0 +1,372 @@
+import asyncio
+from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union
+
+from vllm.config import ModelConfig, VllmConfig
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.metrics_types import StatLoggerBase
+from vllm.engine.protocol import EngineClient
+from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType
+from vllm.inputs.preprocess import InputPreprocessor
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.outputs import EmbeddingRequestOutput, RequestOutput
+from vllm.pooling_params import PoolingParams
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.usage.usage_lib import UsageContext
+from vllm.v1.engine.async_stream import AsyncStream
+from vllm.v1.engine.core_client import EngineCoreClient
+from vllm.v1.engine.detokenizer import Detokenizer
+from vllm.v1.engine.processor import Processor
+from vllm.v1.executor.gpu_executor import GPUExecutor
+
+logger = init_logger(__name__)
+
+
+class AsyncLLM(EngineClient):
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: Type[GPUExecutor],
+        log_stats: bool,
+        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
+        stat_loggers: Optional[Dict[str, StatLoggerBase]] = None,
+        input_registry: InputRegistry = INPUT_REGISTRY,
+        use_cached_outputs: bool = False,
+        log_requests: bool = True,
+        start_engine_loop: bool = True,
+    ) -> None:
+        assert start_engine_loop
+
+        self.log_requests = log_requests
+        self.log_stats = log_stats
+        self.stat_loggers = stat_loggers
+        self.model_config = vllm_config.model_config
+
+        # Tokenizer (+ ensure liveness if running in another process).
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=vllm_config.model_config,
+            scheduler_config=vllm_config.scheduler_config,
+            parallel_config=vllm_config.parallel_config,
+            enable_lora=bool(vllm_config.lora_config))
+        self.tokenizer.ping()
+
+        # Request streams (map of request_id -> AsyncStream).
+        self.request_streams: Dict[str, AsyncStream] = {}
+        # List of cancelled request ids to be aborted.
+        self.client_aborted_requests: List[str] = []
+
+        # Processor (converts Inputs --> EngineCoreRequests).
+        self.processor = Processor(vllm_config.model_config,
+                                   vllm_config.lora_config, self.tokenizer,
+                                   input_registry)
+
+        # Detokenizer (converts EngineCoreOutputs --> RequestOutput).
+        self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer)
+
+        # EngineCore (starts the engine in background process).
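+        # NOTE: multiprocess_mode=True and asyncio_mode=True are assumed to
+        # select a client that runs the core in a background process and
+        # exposes the async add/abort/get_output methods used below.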
+ self.engine_core = EngineCoreClient.make_client( + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + multiprocess_mode=True, + asyncio_mode=True, + ) + + self.output_handler = None + + def __del__(self): + self.shutdown() + + @classmethod + def from_engine_args( + cls, + engine_args: AsyncEngineArgs, + engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + ) -> "AsyncLLMEngine": + """Create an AsyncLLM from the EngineArgs.""" + + # Create the engine configs. + if engine_config is None: + vllm_config = engine_args.create_engine_config() + else: + vllm_config = engine_config + + executor_class = cls._get_executor_cls(vllm_config) + + # Create the AsyncLLM. + return cls( + vllm_config=vllm_config, + executor_class=executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + start_engine_loop=start_engine_loop, + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + + def shutdown(self): + """Shutdown, cleaning up the background proc and IPC.""" + + self.engine_core.shutdown() + + if handler := getattr(self, "output_handler", None): + handler.cancel() + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor + + async def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + """Add new request to the AsyncLLM.""" + + if self.detokenizer.is_request_active(request_id): + raise KeyError(f"Request {request_id} already exists.") + + # 1) Create a new AsyncStream for the request. + stream = self._add_request_to_streams(request_id) + + # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) + + # 3) Add the request to Detokenizer (this process). + self.detokenizer.add_request(detokenizer_req) + + # 4) Add the EngineCoreRequest to EngineCore (separate process). + await self.engine_core.add_request_async(engine_core_req) + + # 5) Return the generator. + return stream.generator() + + # TODO: we should support multiple prompts in one call, as you + # can do with LLM.generate. So that for multi-prompt completion + # requests we don't need to send multiple messages to core proc, + # and so we don't need multiple streams which then get + # re-multiplexed in the API server anyhow. + async def generate( + self, + prompt: PromptType, + sampling_params: SamplingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[RequestOutput, None]: + """ + Main function called by the API server to kick off a request + * 1) Making an AsyncStream corresponding to the Request. + # 2) Processing the Input. + * 3) Adding the Request to the Detokenizer. + * 4) Adding the Request to the EngineCore (separate process). 
+ + A separate output_handler loop runs in a background AsyncIO task, + pulling outputs from EngineCore and putting them into the + per-request AsyncStream. + + The caller of generate() iterates the returned AsyncGenerator, + returning the RequestOutput back to the caller. + """ + + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + + async for output in await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ): + yield output + + def _finish_stream(self, request_id: str): + stream = self.request_streams.pop(request_id, None) + if stream is not None: + stream.finish() + + def _add_request_to_streams( + self, + request_id: str, + ) -> AsyncStream: + + if request_id in self.request_streams: + raise ValueError(f"Request id {request_id} already running.") + + # Avoid streams having circular ref to parent AsyncLLM object. + aborted_reqs = self.client_aborted_requests + stream = AsyncStream(request_id, aborted_reqs.append) + self.request_streams[request_id] = stream + + if self.log_requests: + logger.info("Added request %s.", request_id) + + return stream + + async def _process_cancellations(self) -> None: + """ + Process requests cancelled from user disconnecting. + + When a client disconnects, AsyncStream._cancel() is called. + We passed a callback to AsyncStream(), which appends to + self.client_aborted_requests. + + As a result, if any requests are canceled from the user side + the request_id will show up in self.client_aborted_requests. + """ + + # Avoid streams having circular ref to parent AsyncLLM object. + if not self.client_aborted_requests: + return + reqs_to_abort = self.client_aborted_requests.copy() + self.client_aborted_requests.clear() + + # Remove from Detokenizer. + self.detokenizer.abort_requests(reqs_to_abort) + + # Remove from RequestStreams. + for request_id in reqs_to_abort: + if self.log_requests: + logger.info("User-cancelled request %s.", request_id) + self._finish_stream(request_id) + + # Remove from EngineCore. + await self.engine_core.abort_requests_async(reqs_to_abort) + + def _process_request_outputs(self, request_outputs: List[RequestOutput]): + """Process outputs by putting them into per-request AsyncStreams.""" + + for request_output in request_outputs: + request_id = request_output.request_id + assert request_id in self.request_streams + + # Each request in the API server pulls from the per-request stream. + stream = self.request_streams.get(request_id) + if stream is not None: + stream.put(request_output) + + # If finished, remove from the tracker. + if request_output.finished: + if self.log_requests: + logger.info("Finished request %s.", request_id) + self._finish_stream(request_id) + + async def _run_output_handler(self): + """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + + try: + while True: + # 1) Pull EngineCoreOutput from the EngineCore. + outputs = await self.engine_core.get_output_async() + + # 2) Detokenize based on the output. + request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + + # 3) Put the RequestOutputs into the per-request AsyncStreams. 
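# Illustrative sketch (not part of the diff): the fan-out step above, reduced
# to a routing dict of per-request asyncio queues keyed by request_id.
# SimpleOutput is a hypothetical stand-in for RequestOutput.
import asyncio
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class SimpleOutput:
    request_id: str
    text: str

async def fan_out(outputs: List[SimpleOutput],
                  streams: Dict[str, asyncio.Queue]) -> None:
    for out in outputs:
        queue = streams.get(out.request_id)
        if queue is not None:  # the request may have been aborted already
            queue.put_nowait(out)

async def _demo() -> None:
    streams = {"req-0": asyncio.Queue()}
    await fan_out([SimpleOutput("req-0", "hi")], streams)
    assert (await streams["req-0"].get()).text == "hi"

asyncio.run(_demo())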
+                self._process_request_outputs(request_outputs)
+
+                # 4) Abort any requests that finished due to stop strings.
+                await self.engine_core.abort_requests_async(reqs_to_abort)
+
+                # 5) Abort any requests due to client cancellations.
+                await self._process_cancellations()
+
+        except BaseException as e:
+            logger.error(e)
+            raise e
+
+    # TODO: can we eliminate these?
+
+    async def abort(self, request_id: str) -> None:
+        # Note: Who calls this? I don't think this is actually used.
+        raise ValueError("Not supported on V1 yet.")
+
+    def encode(
+        self,
+        prompt: PromptType,
+        pooling_params: PoolingParams,
+        request_id: str,
+        lora_request: Optional[LoRARequest] = None,
+        trace_headers: Optional[Mapping[str, str]] = None,
+        priority: int = 0,
+    ):
+        raise ValueError("Not supported on V1 yet.")
+
+    async def get_model_config(self) -> ModelConfig:
+        return self.model_config
+
+    async def get_decoding_config(self):
+        raise ValueError("Not supported on V1 yet.")
+
+    async def get_input_preprocessor(self) -> InputPreprocessor:
+        return self.processor.input_preprocessor
+
+    async def get_tokenizer(
+        self,
+        lora_request: Optional[LoRARequest] = None,
+    ) -> AnyTokenizer:
+        assert lora_request is None
+        return self.detokenizer.tokenizer
+
+    async def is_tracing_enabled(self) -> bool:
+        return False
+
+    async def do_log_stats(
+        self,
+        scheduler_outputs=None,
+        model_output=None,
+    ) -> None:
+        logger.debug("Called do_log_stats.")
+
+    async def check_health(self) -> None:
+        logger.debug("Called check_health.")
+
+    async def start_profile(self) -> None:
+        raise ValueError("Not supported on V1 yet.")
+
+    async def stop_profile(self) -> None:
+        raise ValueError("Not supported on V1 yet.")
+
+    @property
+    def is_running(self) -> bool:
+        return True
+
+    @property
+    def is_stopped(self) -> bool:
+        return False
+
+    @property
+    def errored(self) -> bool:
+        return False
+
+    @property
+    def dead_error(self) -> BaseException:
+        # Return an instance to match the BaseException annotation.
+        return Exception()
+
+
+# Retain V0 name for backwards compatibility.
+AsyncLLMEngine = AsyncLLM diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py new file mode 100644 index 0000000000000..3e6c759ad5ebd --- /dev/null +++ b/vllm/v1/engine/async_stream.py @@ -0,0 +1,55 @@ +import asyncio +from typing import Any, AsyncGenerator, Callable, Optional, Type, Union + +from vllm.outputs import EmbeddingRequestOutput, RequestOutput + + +class AsyncStream: + """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + that can be iterated over asynchronously via an async generator.""" + + STOP_ITERATION = Exception() # Sentinel + + def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: + self.request_id = request_id + self._cancel = cancel + self._queue: asyncio.Queue = asyncio.Queue() + self._finished = False + + def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + Exception]) -> None: + if not self._finished: + self._queue.put_nowait(item) + + def finish( + self, + exception: Optional[Union[BaseException, Type[BaseException]]] = None, + ) -> None: + if not self._finished: + self._finished = True + self._queue.put_nowait(exception if self._is_raisable(exception) + else AsyncStream.STOP_ITERATION) + + async def generator( + self + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + finished = False + try: + while True: + result = await self._queue.get() + if self._is_raisable(result): + finished = True + if result == AsyncStream.STOP_ITERATION: + return + raise result + yield result + finally: + self._finished = True + if not finished: + self._cancel(self.request_id) + + @staticmethod + def _is_raisable(value: Any): + return isinstance(value, BaseException) or \ + (isinstance(value, type) and \ + issubclass(value, BaseException)) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py new file mode 100644 index 0000000000000..35ed131d50de9 --- /dev/null +++ b/vllm/v1/engine/core.py @@ -0,0 +1,363 @@ +import multiprocessing +import queue +import threading +import time +from contextlib import contextmanager +from multiprocessing.process import BaseProcess +from multiprocessing.sharedctypes import Synchronized +from typing import Any, Iterator, List, Tuple, Type, Union + +import zmq +import zmq.asyncio +from msgspec import msgpack + +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.mm_input_mapper import MMInputMapper +from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.request import Request, RequestStatus +from vllm.v1.serial_utils import PickleEncoder +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + +POLLING_TIMEOUT_MS = 5000 +POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +LOGGING_TIME_S = 5000 + + +class EngineCore: + """Inner loop of vLLM's Engine.""" + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + usage_context: UsageContext, + ): + # Override the configs for V1. 
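# Illustrative sketch (not part of the diff): the sentinel pattern that
# AsyncStream above relies on, in miniature. A dedicated Exception *instance*
# marks end-of-stream; anything else raisable is a real producer error and
# propagates to the consumer.
import asyncio

STOP = Exception()  # sentinel; compared by identity, never raised

async def drain(queue: asyncio.Queue) -> list:
    items = []
    while True:
        item = await queue.get()
        if item is STOP:
            return items
        if isinstance(item, Exception):
            raise item
        items.append(item)

async def _demo() -> None:
    q: asyncio.Queue = asyncio.Queue()
    for item in ("a", "b", STOP):
        q.put_nowait(item)
    assert await drain(q) == ["a", "b"]

asyncio.run(_demo())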
+ # FIXME + if usage_context == UsageContext.LLM_CLASS: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 2048 + + # TODO (ywang96): Enable APC by default when VLM supports it. + if not vllm_config.model_config.is_multimodal_model: + vllm_config.cache_config.enable_prefix_caching = True + + assert vllm_config.model_config.task != "embedding" + + logger.info("Initializing an LLM engine (v%s) with config: %s", + VLLM_VERSION, vllm_config) + + # Setup Model. + self.model_executor = executor_class(vllm_config) + + # Setup KV Caches and update CacheConfig after profiling. + num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( + vllm_config.cache_config) + vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks + vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + + # Set up multimodal input mapper (e.g., convert PIL images to tensors). + self.mm_input_mapper = MMInputMapper(vllm_config.model_config) + + # Setup scheduler. + self.scheduler = Scheduler(vllm_config.scheduler_config, + vllm_config.cache_config, + vllm_config.lora_config) + + self._last_logging_time = time.time() + + def _initialize_kv_caches(self, + cache_config: CacheConfig) -> Tuple[int, int]: + num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + ) + + if cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) + num_gpu_blocks = num_gpu_blocks_override + + num_cpu_blocks = 0 + self.model_executor.initialize_cache(num_gpu_blocks) + return num_gpu_blocks, num_cpu_blocks + + def add_request(self, request: EngineCoreRequest): + """Add request to the scheduler.""" + + req = Request.from_engine_core_request(request) + # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may + # take 10-50 ms, which can cause a spike in the latency. We should + # consider moving this to a separate thread. + if req.mm_data: + req.mm_inputs = self.mm_input_mapper.process_inputs( + req.mm_data, req.mm_processor_kwargs) + self.scheduler.add_request(req) + + def abort_requests(self, request_ids: List[str]): + """Abort requests from the scheduler.""" + + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that + # (i.e. client-aborted vs stop criteria met). 
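# Illustrative sketch (not part of the diff): rough block accounting behind
# _initialize_kv_caches above. The numbers are invented; the point is that the
# profiled (or overridden) num_gpu_blocks bounds how many tokens of KV cache
# can be resident at once, assuming a fixed number of tokens per block.
TOKENS_PER_BLOCK = 16           # assumed block size
num_gpu_blocks = 4096           # e.g. from determine_num_available_blocks()
num_gpu_blocks_override = 1024  # e.g. cache_config.num_gpu_blocks_override

num_gpu_blocks = num_gpu_blocks_override  # the override replaces the profile
max_resident_tokens = num_gpu_blocks * TOKENS_PER_BLOCK
assert max_resident_tokens == 16384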
+        self.scheduler.finish_requests(request_ids,
+                                       RequestStatus.FINISHED_ABORTED)
+
+    def step(self) -> List[EngineCoreOutput]:
+        """Schedule, execute, and produce outputs."""
+
+        if not self.scheduler.has_unfinished_requests():
+            return []
+
+        scheduler_output = self.scheduler.schedule()
+        output = self.model_executor.execute_model(scheduler_output)
+        engine_core_outputs = self.scheduler.update_from_output(
+            scheduler_output, output)
+        return engine_core_outputs
+
+
+class EngineCoreProc(EngineCore):
+    """ZMQ-wrapper for running EngineCore in a background process."""
+
+    READY_STR = "READY"
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        executor_class: Type[GPUExecutor],
+        usage_context: UsageContext,
+        input_path: str,
+        output_path: str,
+        ready_path: str,
+        should_shutdown: Synchronized,
+    ):
+        super().__init__(vllm_config, executor_class, usage_context)
+
+        # Signal from main process to shutdown (multiprocessing.Value).
+        self.should_shutdown = should_shutdown
+
+        # Background Threads and Queues for IO. These enable us to
+        # overlap ZMQ socket IO with GPU since they release the GIL,
+        # and to overlap some serialization/deserialization with the
+        # model forward pass.
+        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+        self.input_queue = queue.Queue()
+        self.output_queue = queue.Queue()
+        threading.Thread(target=self.process_input_socket,
+                         args=(input_path, ),
+                         daemon=True).start()
+        threading.Thread(target=self.process_output_socket,
+                         args=(output_path, ),
+                         daemon=True).start()
+
+        # Send readiness signal to EngineClient.
+        with self.make_socket(ready_path, zmq.constants.PUSH) as ready_socket:
+            ready_socket.send_string(EngineCoreProc.READY_STR)
+
+    @contextmanager
+    def make_socket(self, path: str, type: Any) -> Iterator[zmq.Socket]:
+        """Context manager for creating and cleaning up a ZMQ socket."""
+
+        ctx = zmq.Context()
+        try:
+            socket = ctx.socket(type)
+
+            if type == zmq.constants.PULL:
+                socket.connect(path)
+            elif type == zmq.constants.PUSH:
+                socket.bind(path)
+            else:
+                raise ValueError(f"Unknown Socket Type: {type}")
+
+            yield socket
+
+        except KeyboardInterrupt:
+            logger.debug("EngineCore had Keyboard Interrupt.")
+
+        finally:
+            ctx.destroy(linger=0)
+
+    @staticmethod
+    def wait_for_startup(
+        proc: BaseProcess,
+        ready_path: str,
+    ) -> None:
+        """Wait until the EngineCore is ready."""
+
+        try:
+            sync_ctx = zmq.Context()  # type: ignore[attr-defined]
+            socket = sync_ctx.socket(zmq.constants.PULL)
+            socket.connect(ready_path)
+
+            # Wait for EngineCore to send EngineCoreProc.READY_STR.
+            while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
+                logger.debug("Waiting for EngineCoreProc to startup.")
+
+                if not proc.is_alive():
+                    raise RuntimeError("EngineCoreProc failed to start.")
+
+            message = socket.recv_string()
+            assert message == EngineCoreProc.READY_STR
+
+        except BaseException as e:
+            logger.exception(e)
+            raise e
+
+        finally:
+            sync_ctx.destroy(linger=0)
+
+    @staticmethod
+    def make_engine_core_process(
+        vllm_config: VllmConfig,
+        executor_class: Type[GPUExecutor],
+        usage_context: UsageContext,
+        input_path: str,
+        output_path: str,
+        ready_path: str,
+        should_shutdown: Synchronized,
+    ) -> BaseProcess:
+        # The current process might have CUDA context,
+        # so we need to spawn a new process.
+        # NOTE(rob): this is a problem for using EngineCoreProc w/
+        # LLM, since we need an if __name__ == "__main__" guard.
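# Illustrative sketch (not part of the diff): the READY handshake performed by
# wait_for_startup above, in miniature. The child binds a PUSH socket and
# sends READY_STR; the parent polls a PULL socket so a hung child surfaces as
# a timeout rather than a silent deadlock. Requires pyzmq; the ipc path is a
# placeholder.
import zmq

READY_STR = "READY"
ready_path = "ipc:///tmp/demo-engine-ready"

ctx = zmq.Context()
child = ctx.socket(zmq.PUSH)   # lives in the spawned process in the real code
child.bind(ready_path)
parent = ctx.socket(zmq.PULL)
parent.connect(ready_path)

child.send_string(READY_STR)
if parent.poll(timeout=5000) == 0:  # mirrors POLLING_TIMEOUT_MS
    raise RuntimeError("EngineCoreProc failed to start.")
assert parent.recv_string() == READY_STR
ctx.destroy(linger=0)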
+        context = multiprocessing.get_context("spawn")
+
+        process_kwargs = {
+            "input_path": input_path,
+            "output_path": output_path,
+            "ready_path": ready_path,
+            "vllm_config": vllm_config,
+            "executor_class": executor_class,
+            "usage_context": usage_context,
+            "should_shutdown": should_shutdown
+        }
+        # Run EngineCore busy loop in background process.
+        proc = context.Process(target=EngineCoreProc.run_engine_core,
+                               kwargs=process_kwargs)
+        proc.start()
+
+        # Wait for startup
+        EngineCoreProc.wait_for_startup(proc, ready_path)
+        return proc
+
+    @staticmethod
+    def run_engine_core(*args, **kwargs):
+        """Launch EngineCore busy loop in background process."""
+
+        try:
+            engine_core = EngineCoreProc(*args, **kwargs)
+            engine_core.run_busy_loop()
+
+        except KeyboardInterrupt:
+            logger.debug("EngineCore interrupted.")
+
+        except BaseException as e:
+            logger.exception(e)
+            raise e
+
+    def run_busy_loop(self):
+        """Core busy loop of the EngineCore."""
+
+        # Loop until we get a shutdown signal (read the shared flag's value;
+        # the wrapper object itself is always truthy).
+        while not self.should_shutdown.value:
+            # 1) Poll the input queue until there is work to do.
+            if not self.scheduler.has_unfinished_requests():
+                while True:
+                    try:
+                        req = self.input_queue.get(timeout=POLLING_TIMEOUT_S)
+                        self._handle_client_request(req)
+                        break
+                    except queue.Empty:
+                        self._log_stats()
+                        logger.debug("EngineCore busy loop waiting.")
+                        if self.should_shutdown.value:
+                            return
+
+            # 2) Handle any new client requests (Abort or Add).
+            while not self.input_queue.empty():
+                req = self.input_queue.get_nowait()
+                self._handle_client_request(req)
+
+            # 3) Step the engine core.
+            outputs = self.step()
+
+            # 4) Put EngineCoreOutputs into the output queue.
+            self.output_queue.put_nowait(outputs)
+
+            self._log_stats()
+
+    def _log_stats(self):
+        """Log basic stats every LOGGING_TIME_S."""
+
+        now = time.time()
+
+        if now - self._last_logging_time > LOGGING_TIME_S:
+            logger.info(
+                "RUNNING: %s | WAITING: %s",
+                len(self.scheduler.running),
+                len(self.scheduler.waiting),
+            )
+
+            self._last_logging_time = now
+
+    def _handle_client_request(
+            self, request: Union[EngineCoreRequest, List[str]]) -> None:
+        """Handle an EngineCoreRequest or a list of request ids to abort."""
+
+        if isinstance(request, EngineCoreRequest):
+            self.add_request(request)
+        else:
+            # TODO: make an EngineCoreAbort wrapper
+            assert isinstance(request, list)
+            self.abort_requests(request)
+
+    def process_input_socket(self, input_path: str):
+        """Input socket IO thread."""
+
+        # Pickle-based deserialization for incoming requests.
+        decoder_add_req = PickleEncoder()
+        decoder_abort_req = PickleEncoder()
+
+        with self.make_socket(input_path, zmq.constants.PULL) as socket:
+            while True:
+                # (RequestType, RequestData)
+                type_frame, data_frame = socket.recv_multipart(copy=False)
+                request_type = type_frame.buffer
+                request_data = data_frame.buffer
+
+                # Deserialize the request data.
+                if request_type == EngineCoreRequestType.ADD.value:
+                    request = decoder_add_req.decode(request_data)
+                elif request_type == EngineCoreRequestType.ABORT.value:
+                    request = decoder_abort_req.decode(request_data)
+                else:
+                    raise ValueError(f"Unknown RequestType: {request_type}")
+
+                # Push to input queue for core busy loop.
+                self.input_queue.put_nowait(request)
+
+    def process_output_socket(self, output_path: str):
+        """Output socket IO thread."""
+
+        # Msgpack serialization encoding.
+        encoder = msgpack.Encoder()
+        # Reuse send buffer.
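# Illustrative sketch (not part of the diff): reusing one bytearray with
# msgspec's encode_into, as the output loop just below does, so steady-state
# sends avoid per-message allocations. Requires the msgspec package.
import msgspec

class Outputs(msgspec.Struct, array_like=True):
    request_ids: list

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(Outputs)
buf = bytearray()

for batch in (["req-0"], ["req-1", "req-2"]):
    encoder.encode_into(Outputs(batch), buf)  # grows/overwrites buf in place
    assert decoder.decode(buf).request_ids == batch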
+ buffer = bytearray() + + with self.make_socket(output_path, zmq.constants.PUSH) as socket: + while True: + engine_core_outputs = self.output_queue.get() + outputs = EngineCoreOutputs(outputs=engine_core_outputs) + encoder.encode_into(outputs, buffer) + socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py new file mode 100644 index 0000000000000..09801e20e16ca --- /dev/null +++ b/vllm/v1/engine/core_client.py @@ -0,0 +1,219 @@ +import multiprocessing +import time +from typing import List, Union + +import msgspec +import zmq +import zmq.asyncio + +from vllm.logger import init_logger +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.serial_utils import PickleEncoder + +logger = init_logger(__name__) + + +class EngineCoreClient: + """ + EngineCoreClient: subclasses handle different methods for pushing + and pulling from the EngineCore for asyncio / multiprocessing. + + Subclasses: + * InprocClient: In process EngineCore (for V0-style LLMEngine use) + * SyncMPClient: ZMQ + background proc EngineCore (for LLM) + * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) + """ + + @staticmethod + def make_client( + *args, + multiprocess_mode: bool, + asyncio_mode: bool, + **kwargs, + ) -> "EngineCoreClient": + + # TODO: support this for debugging purposes. + if asyncio_mode and not multiprocess_mode: + raise NotImplementedError( + "Running EngineCore in asyncio without multiprocessing " + "is not currently supported.") + + if multiprocess_mode and asyncio_mode: + return AsyncMPClient(*args, **kwargs) + + if multiprocess_mode and not asyncio_mode: + return SyncMPClient(*args, **kwargs) + + return InprocClient(*args, **kwargs) + + def shutdown(self): + pass + + def get_output(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + def add_request(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + def abort_requests(self, request_ids: List[str]) -> None: + raise NotImplementedError + + async def get_output_async(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + async def add_request_async(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + async def abort_requests_async(self, request_ids: List[str]) -> None: + raise NotImplementedError + + +class InprocClient(EngineCoreClient): + """ + InprocClient: client for in-process EngineCore. Intended + for use in LLMEngine for V0-style add_request() and step() + EngineCore setup in this process (no busy loop). + + * pushes EngineCoreRequest directly into the EngineCore + * pulls EngineCoreOutputs by stepping the EngineCore + + TODO: support asyncio-mode for debugging. + """ + + def __init__(self, *args, **kwargs): + self.engine_core = EngineCore(*args, **kwargs) + + def get_output(self) -> List[EngineCoreOutput]: + return self.engine_core.step() + + def add_request(self, request: EngineCoreRequest) -> None: + self.engine_core.add_request(request) + + def abort_requests(self, request_ids: List[str]) -> None: + self.engine_core.abort_requests(request_ids) + + +class MPClient(EngineCoreClient): + """ + MPClient: base client for multi-proc EngineCore. 
+    EngineCore runs in a background process busy loop, getting
+    new EngineCoreRequests and returning EngineCoreOutputs.
+
+    * pushes EngineCoreRequests via input_socket
+    * pulls EngineCoreOutputs via output_socket
+
+    * AsyncMPClient subclass for AsyncLLM usage
+    * SyncMPClient subclass for LLM usage
+    """
+
+    def __init__(
+        self,
+        *args,
+        asyncio_mode: bool,
+        **kwargs,
+    ):
+        # Serialization setup.
+        self.encoder = PickleEncoder()
+        self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs)
+
+        # ZMQ setup.
+        self.ctx = (zmq.asyncio.Context() if asyncio_mode else zmq.Context())
+
+        # Path for IPC.
+        ready_path = get_open_zmq_ipc_path()
+        output_path = get_open_zmq_ipc_path()
+        input_path = get_open_zmq_ipc_path()
+
+        # Get output (EngineCoreOutput) from EngineCore.
+        self.output_socket = self.ctx.socket(zmq.constants.PULL)
+        self.output_socket.connect(output_path)
+
+        # Send input (EngineCoreRequest) to EngineCore.
+        self.input_socket = self.ctx.socket(zmq.constants.PUSH)
+        self.input_socket.bind(input_path)
+
+        # Start EngineCore in background process.
+        self.should_shutdown = multiprocessing.Value('b', False, lock=False)
+        self.proc = EngineCoreProc.make_engine_core_process(
+            *args,
+            input_path=input_path,
+            output_path=output_path,
+            ready_path=ready_path,
+            should_shutdown=self.should_shutdown,
+            **kwargs,
+        )
+
+    def shutdown(self):
+        # Send shutdown signal to background process. Set the shared value
+        # rather than rebinding the attribute, so the child process sees it.
+        self.should_shutdown.value = True
+
+        # Shut down the zmq context.
+        self.ctx.destroy(linger=0)
+
+        # Shutdown the process if needed.
+        if hasattr(self, "proc") and self.proc.is_alive():
+            self.proc.terminate()
+
+            time.sleep(5)
+            if self.proc.is_alive():
+                self.proc.kill()
+
+    def __del__(self):
+        self.shutdown()
+
+
+class SyncMPClient(MPClient):
+    """Synchronous client for multi-proc EngineCore."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, asyncio_mode=False, **kwargs)
+
+    def get_output(self) -> List[EngineCoreOutput]:
+
+        (frame, ) = self.output_socket.recv_multipart(copy=False)
+        engine_core_outputs = self.decoder.decode(frame.buffer).outputs
+        return engine_core_outputs
+
+    def _send_input(self, request_type: EngineCoreRequestType,
+                    request: Union[EngineCoreRequest, List[str]]) -> None:
+
+        # (RequestType, SerializedRequest)
+        msg = (request_type.value, self.encoder.encode(request))
+        self.input_socket.send_multipart(msg, copy=False)
+
+    def add_request(self, request: EngineCoreRequest) -> None:
+        self._send_input(EngineCoreRequestType.ADD, request)
+
+    def abort_requests(self, request_ids: List[str]) -> None:
+        self._send_input(EngineCoreRequestType.ABORT, request_ids)
+
+
+class AsyncMPClient(MPClient):
+    """Asyncio-compatible client for multi-proc EngineCore."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, asyncio_mode=True, **kwargs)
+
+    async def get_output_async(self) -> List[EngineCoreOutput]:
+
+        frames = await self.output_socket.recv_multipart(copy=False)
+        engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs
+
+        return engine_core_outputs
+
+    async def _send_input(
+            self, request_type: EngineCoreRequestType,
+            request: Union[EngineCoreRequest, List[str]]) -> None:
+
+        msg = (request_type.value, self.encoder.encode(request))
+        await self.input_socket.send_multipart(msg, copy=False)
+
+    async def add_request_async(self, request: EngineCoreRequest) -> None:
+        await self._send_input(EngineCoreRequestType.ADD, request)
+
+    async def abort_requests_async(self, request_ids: List[str]) -> None:
+        if len(request_ids) > 0:
+            await
self._send_input(EngineCoreRequestType.ABORT, request_ids) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py new file mode 100644 index 0000000000000..6249d60199a62 --- /dev/null +++ b/vllm/v1/engine/detokenizer.py @@ -0,0 +1,272 @@ +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput + +logger = init_logger(__name__) + + +@dataclass +class IncrementalDetokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # TODO: Probably decouple these + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + assert len(self.token_ids) >= len(self.prompt_token_ids) + return self.token_ids[len(self.prompt_token_ids):] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: DetokenizerRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.skip_special_tokens, + ) + + stops = request.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.skip_special_tokens, + spaces_between_special_tokens=request. + spaces_between_special_tokens, + output_kind=request.output_kind, + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def add_tokens( + self, + new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[str], + ) -> Optional[RequestOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. 
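# Illustrative sketch (not part of the diff): the offset bookkeeping that
# detokenize_incrementally performs at this step, reduced to a toy join-based
# "tokenizer". Real BPE tokenizers need the prefix window because a newly
# appended token can change how the preceding tokens render as text.
from typing import List

def incremental_text(tokens: List[str], prefix_offset: int,
                     read_offset: int) -> str:
    prefix_text = "".join(tokens[prefix_offset:read_offset])
    full_text = "".join(tokens[prefix_offset:])
    # Only text extending past the previously-read window is newly final.
    return full_text[len(prefix_text):] if full_text.startswith(
        prefix_text) else ""

tokens = ["Hel", "lo", ","]
assert incremental_text(tokens, prefix_offset=0, read_offset=2) == ","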
+ decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + request_output = RequestOutput.new( + self.request_id, + self.prompt, + self.prompt_token_ids, + output_text, + token_ids, + finished, + ) + + if finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = finish_reason + completion_output.stop_reason = stop_reason + + return request_output + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + +class Detokenizer: + + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): + # TODO: once we support LoRA, we should should pass the tokenizer + # here. We currently have two copies (this + in the LLMEngine). 
+        self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
+                                       tokenizer_mode=tokenizer_mode,
+                                       trust_remote_code=trust_remote_code,
+                                       revision=revision)
+
+        # Request id -> IncrementalDetokenizer
+        self.request_states: Dict[str, IncrementalDetokenizer] = {}
+
+    def is_request_active(self, request_id: str):
+        return request_id in self.request_states
+
+    def get_num_unfinished_requests(self):
+        return len(self.request_states)
+
+    def has_unfinished_requests(self) -> bool:
+        return len(self.request_states) > 0
+
+    def abort_requests(
+        self,
+        request_ids: Iterable[str],
+    ) -> None:
+        """Remove the request_ids from the Detokenizer."""
+
+        for request_id in request_ids:
+            self.request_states.pop(request_id, None)
+
+    def add_request(
+        self,
+        request: DetokenizerRequest,
+    ):
+        """Add new request to the Detokenizer."""
+
+        assert (request.request_id not in self.request_states)
+
+        request_state = IncrementalDetokenizer.from_new_request(
+            self.tokenizer, request)
+        self.request_states[request.request_id] = request_state
+
+    def step(
+        self, engine_core_outputs: List[EngineCoreOutput]
+    ) -> Tuple[List[RequestOutput], List[str]]:
+        """Update state and return the RequestOutputs to the LLMEngine."""
+
+        request_outputs: List[RequestOutput] = []
+        requests_to_abort: List[str] = []
+        for engine_core_output in engine_core_outputs:
+            request_id = engine_core_output.request_id
+            detokenizer = self.request_states.get(request_id)
+            if detokenizer is None:
+                # Ignore output for already-aborted request.
+                continue
+
+            # Detokenize and update state.
+            request_output = detokenizer.add_tokens(
+                new_token_ids=engine_core_output.new_token_ids,
+                finish_reason=engine_core_output.finish_reason,
+                stop_reason=engine_core_output.stop_reason,
+            )
+
+            if request_output is not None:
+                # Add to RequestOutputs list.
+                request_outputs.append(request_output)
+
+                # Free completed requests.
+                if request_output.finished:
+                    self.request_states.pop(request_id)
+                    if not engine_core_output.finished:
+                        requests_to_abort.append(request_id)
+
+        # Return to EngineClient.
+ return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f805c5e69bc1c..4ebfff9584267 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,35 +1,29 @@ -import time -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, - Union) +from typing import Dict, List, Mapping, Optional, Type, Union -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderLLMInputs, InputRegistry, PromptType) -from vllm.inputs.preprocess import InputPreprocessor +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import CompletionOutput, RequestOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.processor import Processor from vllm.v1.executor.gpu_executor import GPUExecutor -from vllm.v1.request import Request, RequestStatus -from vllm.v1.tokenizer.detokenizer import Detokenizer, DetokenizerInputs -from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) class LLMEngine: + """Legacy LLMEngine for backwards compatibility.""" def __init__( self, @@ -39,216 +33,93 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, + multiprocess_mode: bool = False, ) -> None: - # TODO: remove the local variables and use self.* throughout the class. 
- model_config = self.model_config = vllm_config.model_config - cache_config = self.cache_config = vllm_config.cache_config - lora_config = self.lora_config = vllm_config.lora_config - parallel_config = self.parallel_config = vllm_config.parallel_config - scheduler_config = self.scheduler_config = vllm_config.scheduler_config - device_config = self.device_config = vllm_config.device_config - speculative_config = self.speculative_config = vllm_config.speculative_config # noqa - load_config = self.load_config = vllm_config.load_config - decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa - ) - prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa - observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa - ) - - # Override the configs for V1. - # FIXME - if usage_context == UsageContext.LLM_CLASS: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 2048 - - # TODO (ywang96): Enable APC by default when VLM supports it. - if not model_config.is_multimodal_model: - cache_config.enable_prefix_caching = True - - logger.info( - "Initializing an LLM engine (v%s) with config: " - "model=%r, speculative_config=%r, tokenizer=%r, " - "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, tokenizer_revision=%s, " - "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " - "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " - "pipeline_parallel_size=%d, " - "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " - "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, observability_config=%r, " - "seed=%d, served_model_name=%s, " - "num_scheduler_steps=%d, enable_prefix_caching=%s, " - "use_async_output_proc=%s, mm_processor_kwargs=%s)", - VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - load_config.load_format, - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, - scheduler_config.num_scheduler_steps, - cache_config.enable_prefix_caching, - model_config.use_async_output_proc, - model_config.mm_processor_kwargs, + # TODO: Can we avoid this? + self.model_config = vllm_config.model_config + + # Tokenizer (+ ensure liveness if running in another process). 
+ self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + enable_lora=bool(vllm_config.lora_config)) + self.tokenizer.ping() + + # Processor (convert Inputs --> EngineCoreRequests) + self.processor = Processor(vllm_config.model_config, + vllm_config.lora_config, self.tokenizer, + input_registry, mm_registry) + + # Detokenizer (converts EngineCoreOutputs --> RequestOutput) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, ) - self.log_stats = log_stats - - assert not self.model_config.skip_tokenizer_init - self.tokenizer = self._init_tokenizer() - if self.tokenizer: - # Ping the tokenizer to ensure liveness if it runs in a - # different process. - self.tokenizer.ping() - self.detokenizer = Detokenizer(self.model_config.tokenizer) - - self.generation_config_fields = _load_generation_config_dict( - model_config) - self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) - self.input_registry = input_registry - self.input_processor = input_registry.create_input_processor( - model_config) - - # Request id -> Request - self.requests: Dict[str, Request] = {} - # NOTE(woosuk): Now that the detokenizer works asynchronously, we need - # to keep track of how many steps each request has been lagged behind - # in terms of detokenization. - # Request id -> how many detokenizer steps the request should wait for. - self.num_lagged_steps: Dict[str, int] = {} - # OPTIMIZATION: Cache the request output and update it incrementally. - # This is used to avoid creating a new RequestOutput object every step. - # Request id -> RequestOutput - self.request_outputs: Dict[str, RequestOutput] = {} - - self.model_executor = executor_class(vllm_config=vllm_config) - assert self.model_config.task != "embedding" - self._initialize_kv_caches() - - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. 
- self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - - def __del__(self): - # Small hack- implicit clean up of resources on garbage collect - # TODO: this should probably be explicitly invoked when we're done with - # the engine - self.terminate_detokenizer() - - def _initialize_kv_caches(self) -> None: - num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) + self.engine_core = EngineCoreClient.make_client( + vllm_config, + executor_class, + usage_context, + multiprocess_mode=multiprocess_mode, + asyncio_mode=False, ) - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = 0 - self.model_executor.initialize_cache(num_gpu_blocks) - @classmethod def from_engine_args( cls, engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + enable_multiprocessing: bool = False, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. - engine_config = engine_args.create_engine_config() - executor_class = cls._get_executor_cls(engine_config) - # Create the LLM engine. - engine = cls( - vllm_config=engine_config, - executor_class=executor_class, - log_stats=not engine_args.disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - return engine - - def _init_tokenizer(self) -> BaseTokenizerGroup: - return init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderLLMInputs], - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], - trace_headers: Optional[Mapping[str, str]] = None, - ) -> None: - assert prompt_adapter_request is None - assert trace_headers is None - self._validate_model_inputs(processed_inputs) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - # TODO(woosuk): Support embedding mode. - assert isinstance(params, SamplingParams) - sampling_params = params.clone() - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - - # TODO(woosuk): Check max_logprobs - # TODO(woosuk): Support encoder-decoder models. 
- req = Request(request_id, processed_inputs, params, eos_token_id, - arrival_time) - self.requests[request_id] = req - self.num_lagged_steps[request_id] = 0 - self.scheduler.add_request(req) + vllm_config = engine_args.create_engine_config() + executor_class = cls._get_executor_cls(vllm_config) + + if VLLM_ENABLE_V1_MULTIPROCESSING: + logger.debug("Enabling multiprocessing for LLMEngine.") + enable_multiprocessing = True + + # Create the LLMEngine. + return cls(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats, + usage_context=usage_context, + stat_loggers=stat_loggers, + multiprocess_mode=enable_multiprocessing) + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor def stop_remote_worker_execution_loop(self) -> None: raise NotImplementedError("TP not implemented yet.") + def get_num_unfinished_requests(self) -> int: + return self.detokenizer.get_num_unfinished_requests() + + def has_unfinished_requests(self) -> bool: + return self.detokenizer.has_unfinished_requests() + + @classmethod + def validate_outputs(cls, outputs, output_type): + return outputs + + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" + + self.engine_core.abort_requests(request_ids) + self.detokenizer.abort_requests(request_ids) + def add_request( self, request_id: str, @@ -260,261 +131,46 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if arrival_time is None: - arrival_time = time.time() - assert priority == 0, "vLLM V1 does not support priority at the moment." - - preprocessed_inputs = self.input_preprocessor.preprocess( - prompt, - request_id=request_id, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - processed_inputs = self.input_processor(preprocessed_inputs) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - trace_headers=trace_headers, - ) - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.scheduler.finish_requests(request_id, - RequestStatus.FINISHED_ABORTED) - self._free_request(request_id) + # 1) Process raw inputs into the request. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return len(self.requests) + # 2) Add the request to Detokenizer. + self.detokenizer.add_request(detokenizer_req) - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return len(self.requests) > 0 + # 3) Add the request to EngineCore. + self.engine_core.add_request(engine_core_req) def step(self) -> List[RequestOutput]: - # NOTE(woosuk): This method may return an empty list when the - # detokenizer is still processing the outputs. This should not be - # considered as the end of the generation process. 
- # FIXME(woosuk): Currently, the step method is inefficient because it - # creates RequestOutput objects for all running requests, while they - # may not be needed unless the output is streamed to the client. - if self.scheduler.has_unfinished_requests(): - scheduler_output = self.scheduler.schedule() - output = self.model_executor.execute_model(scheduler_output) - sampled = self.scheduler.update_from_output( - scheduler_output, output) - self.send_to_detokenizer(sampled) - req_outputs = self.recv_from_detokenizer() - return req_outputs - - def send_to_detokenizer(self, sampled: List[Tuple[Request, int]]) -> None: - inputs = DetokenizerInputs( - req_ids=[], - prompt_token_ids=[], - new_token_ids=[], - skip_special_tokens=[], - spaces_between_special_tokens=[], - free_req_ids=[], # TODO(woosuk): Implement freeing. - ) - for req, num_tokens in sampled: - inputs.req_ids.append(req.request_id) - if req.num_output_tokens == num_tokens: - # The request is first detokenized. - inputs.prompt_token_ids.append(req.prompt_token_ids) - else: - # The prompt token ids are already cached in the detokenizer. - inputs.prompt_token_ids.append([]) - inputs.new_token_ids.append(req.output_token_ids[-num_tokens:]) - inputs.skip_special_tokens.append( - req.sampling_params.skip_special_tokens) - inputs.spaces_between_special_tokens.append( - req.sampling_params.spaces_between_special_tokens) - - # Update the number of lagged steps. - self.num_lagged_steps[req.request_id] += 1 - self.detokenizer.send(inputs) - - def recv_from_detokenizer(self) -> List[RequestOutput]: - detokenizer_output = self.detokenizer.recv() - if detokenizer_output is None: - return [] - - req_outputs: List[RequestOutput] = [] - num_reqs = len(detokenizer_output.req_ids) - for i in range(num_reqs): - req_id = detokenizer_output.req_ids[i] - if req_id not in self.requests: - # The request has been aborted while the detokenizer was - # processing the outputs. - continue - - req = self.requests[req_id] - req.output_text += detokenizer_output.detokenized_texts[i] - - self.num_lagged_steps[req_id] -= 1 - finished = (self.num_lagged_steps[req_id] == 0 - and req.is_finished()) - req_output = self._make_request_output( - req, detokenizer_output.num_output_token_ids[i], - detokenizer_output.detokenized_texts[i], finished) - req_outputs.append(req_output) - - if finished: - self._free_request(req_id) - return req_outputs - - def terminate_detokenizer(self) -> None: - self.detokenizer.terminate() - - def _make_request_output( - self, - request: Request, - num_output_tokens: int, - new_output_text: str, - finished: bool, - ) -> RequestOutput: - req_output = self.request_outputs.get(request.request_id) - if req_output is None: - # TODO: Support `n` > 1. 
-            completion_output = CompletionOutput(
-                index=0,
-                text="",
-                token_ids=[],
-                cumulative_logprob=None,
-                logprobs=None,  # TODO
-                finish_reason=None,
-                stop_reason=None,
-                lora_request=None,
-            )
-            req_output = RequestOutput(
-                request_id=request.request_id,
-                prompt=request.prompt,
-                prompt_token_ids=request.prompt_token_ids,
-                prompt_logprobs=None,  # TODO
-                outputs=[completion_output],
-                finished=False,
-                metrics=None,
-                lora_request=None,
-                encoder_prompt=None,
-                encoder_prompt_token_ids=None,
-            )
-            self.request_outputs[request.request_id] = req_output
-
-        completion_output = req_output.outputs[0]
-        if request.sampling_params.output_kind == RequestOutputKind.CUMULATIVE:
-            completion_output.text += new_output_text
-            completion_output.token_ids = (
-                request.output_token_ids[:num_output_tokens])
-        elif request.sampling_params.output_kind == RequestOutputKind.DELTA:
-            completion_output.text = new_output_text
-            num_prev_tokens = len(completion_output.token_ids)
-            completion_output.token_ids = request.output_token_ids[
-                num_prev_tokens:num_output_tokens]
-        elif (request.sampling_params.output_kind ==
-              RequestOutputKind.FINAL_ONLY):
-            if finished:
-                completion_output.text = request.output_text
-                completion_output.token_ids = request.output_token_ids
-            else:
-                completion_output.text = ""
-                completion_output.token_ids = []
-
-        if finished:
-            completion_output.finish_reason = request.get_finished_reason()
-            completion_output.stop_reason = request.stop_reason
-        req_output.finished = finished
-        return req_output
-
-    def _free_request(self, request_id: str) -> None:
-        self.requests.pop(request_id, None)
-        self.num_lagged_steps.pop(request_id, None)
-        self.request_outputs.pop(request_id, None)
-
-    def check_health(self) -> None:
-        if self.tokenizer:
-            self.tokenizer.check_health()
-        self.model_executor.check_health()
-
-    def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs,
-                                                   EncoderDecoderLLMInputs]):
-        prompt_ids = inputs.get("prompt_token_ids")
-        if prompt_ids is None or len(prompt_ids) == 0:
-            raise ValueError("Prompt cannot be empty")
-
-        if self.model_config.is_multimodal_model:
-            max_prompt_len = self.model_config.max_model_len
-
-            if len(prompt_ids) > max_prompt_len:
-                raise ValueError(
-                    f"The prompt (total length {len(prompt_ids)}) is too long "
-                    f"to fit into the model (context length {max_prompt_len}). "
-                    "Make sure that `max_model_len` is no smaller than the "
-                    "number of text tokens plus multimodal tokens. For image "
-                    "inputs, the number of image tokens depends on the number "
-                    "of images, and possibly their aspect ratios as well.")
-
-    @classmethod
-    def validate_outputs(cls, outputs, output_type):
-        return outputs
+
-    def get_model_config(self) -> ModelConfig:
-        """Gets the model configuration."""
-        return self.model_config
+        # 1) Get EngineCoreOutput from the EngineCore.
+        engine_core_outputs = self.engine_core.get_output()
 
-    def get_parallel_config(self) -> ParallelConfig:
-        """Gets the parallel configuration."""
-        return self.parallel_config
+        # 2) Detokenize the EngineCoreOutputs.
+        request_outputs, requests_to_abort = self.detokenizer.step(
+            engine_core_outputs)
 
+        # 3) Abort requests that finished due to stopping criteria.
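# Illustrative sketch (not part of the diff): how this refactored LLMEngine is
# typically driven; the add_request()/step() loop shape is preserved from V0.
# The model name is a placeholder.
from vllm import SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.v1.engine.llm_engine import LLMEngine

engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
engine.add_request("req-0", "Hello, my name is",
                   SamplingParams(max_tokens=16))

while engine.has_unfinished_requests():
    for request_output in engine.step():
        if request_output.finished:
            print(request_output.outputs[0].text)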
+ if requests_to_abort: + self.abort_request(requests_to_abort) - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config + return request_outputs - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config + # TODO(rob): Can we get rid of these? - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - @classmethod - def _get_executor_cls(cls, engine_config: VllmConfig): - return GPUExecutor - - def is_tracing_enabled(self) -> bool: - return False - - def do_log_stats(self, *args, **kwargs) -> None: + def get_model_config(self): pass - def is_encoder_decoder_model(self) -> bool: - return False - - def start_profile(self) -> None: + def is_encoder_decoder_model(self): pass - def stop_profile(self) -> None: + def start_profile(self): pass - def get_tokenizer_group(self, *args, **kwargs): - return self.tokenizer - - -def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: - config = try_get_generation_config( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.revision, - ) - - if config is None: - return {} + def stop_profile(self): + pass - return config.to_diff_dict() + def get_tokenizer_group(self, group_type): + pass diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py new file mode 100644 index 0000000000000..594c973678235 --- /dev/null +++ b/vllm/v1/engine/mm_input_mapper.py @@ -0,0 +1,39 @@ +from typing import Any, Dict, List, Optional + +from vllm.config import ModelConfig +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalKwargs, MultiModalRegistry) + + +class MMInputMapper: + + def __init__( + self, + model_config: ModelConfig, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + self.mm_registry = mm_registry + self.multi_modal_input_mapper = mm_registry.create_input_mapper( + model_config) + self.mm_registry.init_mm_limits_per_prompt(model_config) + + def process_inputs( + self, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> List[MultiModalKwargs]: + image_inputs = mm_data["image"] + if not isinstance(image_inputs, list): + image_inputs = [image_inputs] + + # Process each image input separately so that later we can schedule + # them in a fine-grained manner. 
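A sketch of the per-image mapping idea behind the loop that follows (toy mapper and data, not vLLM's real input mapper): each image is mapped on its own, so the scheduler can later treat every image as an independently schedulable encoder input.

from typing import Any, Dict, List

def map_each_image(mm_data: Dict[str, Any]) -> List[Dict[str, Any]]:
    # Normalize a lone image to a one-element list, then emit one kwargs
    # dict per image, mirroring MMInputMapper.process_inputs below.
    images = mm_data["image"]
    if not isinstance(images, list):
        images = [images]
    return [{"image": [image]} for image in images]

assert map_each_image({"image": "img0"}) == [{"image": ["img0"]}]
assert map_each_image({"image": ["a", "b"]}) == [{"image": ["a"]}, {"image": ["b"]}]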
+ mm_inputs: List[MultiModalKwargs] = [] + num_images = len(image_inputs) + for i in range(num_images): + mm_input = self.multi_modal_input_mapper( + {"image": [image_inputs[i]]}, + mm_processor_kwargs=mm_processor_kwargs, + ) + mm_inputs.append(mm_input) + return mm_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py new file mode 100644 index 0000000000000..5c1577190c75a --- /dev/null +++ b/vllm/v1/engine/processor.py @@ -0,0 +1,168 @@ +import time +from typing import Any, Dict, Mapping, Optional, Tuple, Union + +from vllm.config import LoRAConfig, ModelConfig +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs +from vllm.inputs.preprocess import InputPreprocessor +from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest + + +class Processor: + + def __init__( + self, + model_config: ModelConfig, + lora_config: Optional[LoRAConfig], + tokenizer: BaseTokenizerGroup, + input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + + self.model_config = model_config + self.lora_config = lora_config + self.tokenizer = tokenizer + + self.generation_config_fields = _load_generation_config_dict( + model_config) + self.input_preprocessor = InputPreprocessor(model_config, + self.tokenizer, + mm_registry) + self.input_processor = input_registry.create_input_processor( + model_config) + + # TODO: run in a ThreadPoolExecutor or BackgroundProcess. + # This should ideally release the GIL, so that we do not block the + # asyncio loop while this is running. + def process_inputs( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: float, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + + # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Check max_logprobs + # TODO(woosuk): Support encoder-decoder models. + + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.time() + assert priority == 0, "vLLM V1 does not support priority at the moment." + assert trace_headers is None, "vLLM V1 does not support tracing yet." + + # Process inputs.
+ preprocessed_inputs = self.input_preprocessor.preprocess( + prompt, + request_id=request_id, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + processed_inputs = self.input_processor(preprocessed_inputs) + self._validate_model_inputs(processed_inputs) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = SingletonInputsAdapter( + processed_inputs["decoder"]) + encoder_inputs = SingletonInputsAdapter( + processed_inputs["encoder"]) + else: + decoder_inputs = SingletonInputsAdapter(processed_inputs) + encoder_inputs = None + + # TODO: Impl encoder-decoder + if encoder_inputs is not None: + raise NotImplementedError + + assert isinstance(params, SamplingParams) + # TODO: can we avoid cloning here in multiproc case + sampling_params = params.clone() + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + + # Make Request for Detokenizer. + detokenizer_request = DetokenizerRequest( + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + sampling_params.skip_special_tokens, + sampling_params.spaces_between_special_tokens, + sampling_params.output_kind, + sampling_params.stop, + sampling_params.include_stop_str_in_output, + ) + + # Make Request for EngineCore. + engine_core_request = EngineCoreRequest( + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + decoder_inputs.multi_modal_data, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.mm_processor_kwargs, + sampling_params, + eos_token_id, + arrival_time, + lora_request, + ) + + return detokenizer_request, engine_core_request + + def _validate_model_inputs(self, inputs: ProcessorInputs): + if is_encoder_decoder_inputs(inputs): + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] + else: + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + + if prompt_ids is None or len(prompt_ids) == 0: + raise ValueError("Prompt cannot be empty") + + if self.model_config.is_multimodal_model: + max_prompt_len = self.model_config.max_model_len + + if len(prompt_ids) > max_prompt_len: + raise ValueError( + f"The prompt (total length {len(prompt_ids)}) is too long " + f"to fit into the model (context length {max_prompt_len}). " + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens plus multimodal tokens. 
For image " + "inputs, the number of image tokens depends on the number " + "of images, and possibly their aspect ratios as well.") + + # TODO: Find out how many placeholder tokens there are so we can + # check that chunked prefill does not truncate them + # max_batch_len = self.scheduler_config.max_num_batched_tokens + + +def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: + config = try_get_generation_config( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.revision, + ) + + if config is None: + return {} + + return config.to_diff_dict() diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 087067cdac56f..51fb4003e5fe0 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,28 +1,28 @@ import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union +from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics +from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList -if TYPE_CHECKING: - from vllm.inputs import DecoderOnlyInputs - class Request: def __init__( self, request_id: str, - inputs: "DecoderOnlyInputs", + inputs: DecoderOnlyInputs, sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. self.eos_token_id = eos_token_id @@ -38,14 +38,42 @@ def __init__( assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - self.prompt = inputs.get("prompt") - self.prompt_token_ids = inputs["prompt_token_ids"] + self.prompt = self.inputs.prompt + self.prompt_token_ids = self.inputs.prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() - self.output_text = "" self.num_computed_tokens = 0 + # Raw multimodal data before the mm input mapper (e.g., PIL images). + self.mm_data = self.inputs.multi_modal_data + self.mm_processor_kwargs = self.inputs.mm_processor_kwargs + mm_positions = self.inputs.multi_modal_placeholders + if mm_positions: + # FIXME(woosuk): Support other modalities. + self.mm_positions = mm_positions.get("image", []) + else: + self.mm_positions = [] + # Output of the mm input mapper (e.g., image tensors).
+ self.mm_inputs: List[MultiModalKwargs] = [] + + @classmethod + def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + return cls( + request_id=request.request_id, + inputs=token_inputs( + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + multi_modal_data=request.mm_data, + multi_modal_placeholders=request.mm_placeholders, + mm_processor_kwargs=request.mm_processor_kwargs, + ), + sampling_params=request.sampling_params, + eos_token_id=request.eos_token_id, + arrival_time=request.arrival_time, + lora_request=request.lora_request, + ) + @property def output_token_ids(self) -> ConstantList[int]: # Prevent directly appending to the output_token_ids since @@ -81,9 +109,21 @@ def is_finished(self) -> bool: def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) + def has_encoder_inputs(self) -> bool: + return len(self.mm_data) > 0 + + @property + def num_encoder_inputs(self) -> int: + return len(self.mm_positions) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self.mm_positions) + num_tokens = self.mm_positions[input_id]["length"] + return num_tokens + class RequestStatus(enum.IntEnum): - """Status of a sequence.""" + """Status of a request.""" WAITING = 0 RUNNING = 1 PREEMPTED = 2 @@ -104,7 +144,7 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # Mapping of finished statuses to their finish reasons. -# NOTE: The ignored sequences are the sequences whose prompt lengths +# NOTE: The ignored requests are the requests whose prompt lengths # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py new file mode 100644 index 0000000000000..b1cd5c11834f8 --- /dev/null +++ b/vllm/v1/serial_utils.py @@ -0,0 +1,10 @@ +import pickle + + +class PickleEncoder: + + def encode(self, obj): + return pickle.dumps(obj) + + def decode(self, data): + return pickle.loads(data) diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py deleted file mode 100644 index e485fcc3522d9..0000000000000 --- a/vllm/v1/tokenizer/detokenizer.py +++ /dev/null @@ -1,215 +0,0 @@ -import multiprocessing -from dataclasses import dataclass -from typing import Dict, List, Optional - -import msgspec -import zmq -from msgspec import msgpack - -from vllm.transformers_utils.detokenizer_utils import ( - convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_port - - -class DetokenizerInputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - # A request's prompt token ids is sent to the detokenizer only when - # the request is first detokenized. Otherwise, an empty list is sent. - prompt_token_ids: List[List[int]] - new_token_ids: List[List[int]] - skip_special_tokens: List[bool] - spaces_between_special_tokens: List[bool] - - # [num_free_reqs] - free_req_ids: List[str] - - -class DetokenizerOutputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - detokenized_texts: List[str] - # NOTE(woosuk): The number of the output token ids of each request - # at the time of detokenization. The detokenizer returns this to the engine - # because the request state (including the output token ids) is - # asynchronously updated in the engine, while RequestOutput requires the - # output token ids to be consistent with the detokenized text. 
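A note on the new vllm/v1/serial_utils.py above: PickleEncoder is a thin wrapper over pickle, so its contract is a plain bytes round trip. A quick usage sketch:

from vllm.v1.serial_utils import PickleEncoder

encoder = PickleEncoder()
payload = encoder.encode({"req_id": "abc", "token_ids": [1, 2, 3]})
# decode() inverts encode() for any picklable object.
assert encoder.decode(payload) == {"req_id": "abc", "token_ids": [1, 2, 3]}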
- num_output_token_ids: List[int] - - -class Detokenizer: - - def __init__(self, tokenizer_name: str): - # FIXME(woosuk): Currently, the detokenizer is just a hacky prototype. - # For example, it does not terminate properly. We need to improve this. - self.push_port = get_open_port() - self.pull_port = get_open_port() - self.detokenizer = DetokenizerProc(tokenizer_name, self.push_port, - self.pull_port) - self.detokenizer.start() - - self.zmq_context = zmq.Context() - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.connect(f"tcp://localhost:{self.push_port}") - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.connect(f"tcp://localhost:{self.pull_port}") - self.poller = zmq.Poller() - self.poller.register(self.pull_socket, zmq.POLLIN) - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerOutputs) - - def send(self, inputs: DetokenizerInputs) -> None: - self.push_socket.send(self.msgpack_encoder.encode(inputs), - flags=zmq.NOBLOCK) - - def recv(self) -> Optional[DetokenizerOutputs]: - socks = dict(self.poller.poll(timeout=0)) - if self.pull_socket in socks and socks[self.pull_socket] == zmq.POLLIN: - msg = self.pull_socket.recv() - return self.msgpack_decoder.decode(msg) - return None - - def terminate(self) -> None: - self.detokenizer.kill() - self.detokenizer.join() - - -class DetokenizerProc(multiprocessing.Process): - - def __init__( - self, - tokenizer_name: str, - pull_port: int, - push_port: int, - ): - super().__init__() - self.tokenizer_name = tokenizer_name - # NOTE: The pull_port of the detokenizer should be the same as the - # push_port of the engine. Vice versa. - self.pull_port = pull_port - self.push_port = push_port - - def run(self): - # Initialize these objects after the process is forked since they are - # not picklable. - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerInputs) - self.tokenizer = get_tokenizer(self.tokenizer_name) - # req_id -> RequestState - self.request_states: Dict[str, RequestState] = {} - - self.zmq_context = zmq.Context() - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.bind(f"tcp://*:{self.pull_port}") - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.bind(f"tcp://*:{self.push_port}") - - while True: - if self.pull_socket.poll(timeout=1000) == 0: - # Nothing to read - continue - message = self.pull_socket.recv() - inputs = self.msgpack_decoder.decode(message) - - for req_id in inputs.free_req_ids: - self.free(req_id) - - detokenized_texts: List[str] = [] - num_output_token_ids: List[int] = [] - num_reqs = len(inputs.req_ids) - for i in range(num_reqs): - req_id = inputs.req_ids[i] - if req_id not in self.request_states: - self.add_request( - request_id=req_id, - prompt_token_ids=inputs.prompt_token_ids[i], - skip_special_tokens=inputs.skip_special_tokens[i], - spaces_between_special_tokens=inputs. 
- spaces_between_special_tokens[i], - ) - new_str = self.detokenize(req_id, inputs.new_token_ids[i]) - detokenized_texts.append(new_str) - req_state = self.request_states[req_id] - num_output_token_ids.append( - len(req_state.token_ids) - req_state.num_prompt_tokens) - - detokenized = DetokenizerOutputs( - req_ids=inputs.req_ids, - detokenized_texts=detokenized_texts, - num_output_token_ids=num_output_token_ids, - ) - self.push_socket.send(self.msgpack_encoder.encode(detokenized), - flags=zmq.NOBLOCK) - - def add_request( - self, - request_id: str, - prompt_token_ids: List[int], - skip_special_tokens: bool, - spaces_between_special_tokens: bool, - ) -> None: - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=self.tokenizer, - prompt_ids=prompt_token_ids, - skip_special_tokens=skip_special_tokens, - ) - self.request_states[request_id] = RequestState( - req_id=request_id, - token_ids=prompt_token_ids, - tokens=tokens, - num_prompt_tokens=len(prompt_token_ids), - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - - def free(self, request_id: str) -> None: - del self.request_states[request_id] - - def detokenize(self, request_id: str, new_token_ids: List[int]) -> str: - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - req_state = self.request_states[request_id] - decoded_text = "" - for new_token_id in new_token_ids: - req_state.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=req_state.token_ids, - prev_tokens=req_state.tokens, - prefix_offset=req_state.prefix_offset, - read_offset=req_state.read_offset, - skip_special_tokens=req_state.skip_special_tokens, - spaces_between_special_tokens=req_state. 
- spaces_between_special_tokens, - ) - - req_state.tokens.extend(new_tokens) - req_state.prefix_offset = prefix_offset - req_state.read_offset = read_offset - req_state.output_text += new_decoded_token_text - decoded_text += new_decoded_token_text - return decoded_text - - -@dataclass -class RequestState: - - req_id: str - - token_ids: List[int] - tokens: List[str] - num_prompt_tokens: int - - prefix_offset: int - read_offset: int - - skip_special_tokens: bool - spaces_between_special_tokens: bool - - output_text: str = "" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2469048536e49..eebd1de96537f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,7 +1,7 @@ import os import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch @@ -14,9 +14,10 @@ from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalKwargs from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, @@ -27,6 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: + from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -37,8 +39,8 @@ class GPUModelRunner: def __init__( self, vllm_config: VllmConfig, + input_registry: InputRegistry = INPUT_REGISTRY, ): - # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -75,10 +77,16 @@ def __init__( parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + self.input_registry = input_registry # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} # Request states. self.requests: Dict[str, CachedRequestState] = {} @@ -96,18 +104,28 @@ def __init__( and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] - self.input_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int32, - device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. 
+ for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) # Remove the requests from the persistent batch. stopped_req_ids = set().union( @@ -156,7 +174,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, - multi_modal_data=req_data.multi_modal_data, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, sampling_params=sampling_params, generator=generator, block_ids=req_data.block_ids, @@ -285,11 +304,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, - non_blocking=True) + input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() @@ -308,7 +325,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return attn_metadata, logits_indices + return input_ids, attn_metadata, logits_indices def _prepare_sampling( self, @@ -325,13 +342,91 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. + mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] when the feature size is + # dynamic, depending on the input images. + encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + + # Cache the encoder outputs.
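The encoder cache freed in _update_states above and populated just below is a two-level mapping, req_id -> input_id -> encoder output, and entries can be dropped at either level. A dictionary-only sketch of that bookkeeping (strings stand in for tensors):

from typing import Dict

encoder_cache: Dict[str, Dict[int, str]] = {}

def cache_output(req_id: str, input_id: int, output: str) -> None:
    encoder_cache.setdefault(req_id, {})[input_id] = output

def free_encoder_input(req_id: str, input_id: int) -> None:
    outputs = encoder_cache.get(req_id)
    if outputs is not None:
        outputs.pop(input_id, None)
        if not outputs:
            # Drop the per-request entry once all its inputs are freed.
            encoder_cache.pop(req_id, None)

cache_output("req0", 0, "feat0")
cache_output("req0", 1, "feat1")
free_encoder_input("req0", 0)
assert encoder_cache == {"req0": {1: "feat1"}}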
+ for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + + # Run the encoder. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + + # Prepare the decoder inputs. + input_ids, attn_metadata, logits_indices = self._prepare_inputs( + scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -343,12 +438,26 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + # Get the inputs embeds. + if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), + # always use embeddings (rather than token ids) as input to the model. + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + + # Run the decoder. + # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], + input_ids=None, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_input_tokens], ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -404,15 +513,17 @@ def execute_model( def load_model(self) -> None: if self.use_cuda_graph: - # FIXME(woosuk): Currently, the custom ops are not supported - # in the piecewise compilation mode. We rely on TorchInductor - # to optimize the model. 
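The slicing in _gather_encoder_outputs above reduces to a range-overlap rule: the piece of an encoder output needed in a step is the overlap between the token window being computed and the placeholder's position range. A self-contained restatement with a few worked cases (an editorial sketch, not vLLM API):

from typing import Optional, Tuple

def needed_slice(num_computed: int, num_scheduled: int, start_pos: int,
                 num_encoder_tokens: int) -> Optional[Tuple[int, int]]:
    window_end = num_computed + num_scheduled
    if start_pos >= window_end:
        return None  # Placeholder not reached in this step.
    if start_pos + num_encoder_tokens <= num_computed:
        return None  # Already injected into the decoder's KV cache.
    start = max(num_computed - start_pos, 0)
    end = min(window_end - start_pos, num_encoder_tokens)
    return start, end

assert needed_slice(0, 10, 3, 4) == (0, 4)  # fully inside the window
assert needed_slice(5, 2, 0, 4) is None     # already processed
assert needed_slice(0, 2, 5, 4) is None     # not reached yet
assert needed_slice(4, 3, 2, 8) == (2, 5)   # partial overlap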
+ # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], use_inductor=True, + enable_fusion=False, )) logger.info("Starting to load model %s...", self.model_config.model) @@ -438,13 +549,16 @@ def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: with set_forward_context(None): # noqa: SIM117 with set_compile_context(self.cudagraph_batch_sizes): # Trigger compilation for general shape. - model(self.input_ids, - self.positions, - dummy_kv_caches, - attn_metadata=None) + model(input_ids=None, + positions=self.positions, + kv_caches=dummy_kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds) @torch.inference_mode() def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() @@ -466,10 +580,11 @@ def capture_model(self) -> None: # can reuse the memory pool allocated for the large shapes. for num_tokens in reversed(self.cudagraph_batch_sizes): self.model( - self.input_ids[:num_tokens], - self.positions[:num_tokens], + input_ids=None, + positions=self.positions[:num_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens], ) end_time = time.perf_counter() @@ -504,7 +619,8 @@ class CachedRequestState: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional["MultiModalDataDict"] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py new file mode 100644 index 0000000000000..7053075bf4d8f --- /dev/null +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -0,0 +1,123 @@ +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import torch + +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MultiModalKwargs +from vllm.pooling_params import PoolingParams +from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, + SequenceGroupMetadata) +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, + ModelInputForCPUBuilder) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): + """ + Used by the CPUEmbeddingModelRunner. 
+ """ + pooling_metadata: Optional["PoolingMetadata"] = None + + +class CPUEmbeddingModelRunner( + CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( + ModelInputForCPUWithPoolingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForCPUWithPoolingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "CPU worker does not support multi-step execution.") + + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None` to force Dynamo to pass + # it by reference, rather than specializing on the value `None`. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + + model_executable = self.model + execute_model_kwargs = { + "input_ids": + model_input.input_tokens, + "positions": + model_input.input_positions, + "kv_caches": + kv_caches, + "attn_metadata": + model_input.attn_metadata, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, + device=self.device), + "intermediate_tensors": + intermediate_tensors, + } + + hidden_states = model_executable(**execute_model_kwargs) + + return [ + self.model.pooler(hidden_states=hidden_states, + pooling_metadata=model_input.pooling_metadata) + ] + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, + Any]) -> ModelInputForCPUWithPoolingMetadata: + return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForCPUWithPoolingMetadata: + assert seq_group_metadata_list is not None + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + # Prepare PoolingMetadata.
+ assert model_input.seq_lens is not None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + model_input.seq_lens) + + return dataclasses.replace(model_input, + virtual_engine=virtual_engine, + pooling_metadata=pooling_metadata) + + def _prepare_pooling( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + ) -> PoolingMetadata: + """Prepare PoolingMetadata for the sequence group metadata list.""" + seq_groups: List[Tuple[List[int], PoolingParams]] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + pooling_params = seq_group_metadata.pooling_params + seq_groups.append((seq_ids, pooling_params)) + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + pooling_metadata = PoolingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + ) + + return pooling_metadata diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 994af7c5a455f..d040831870bd8 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -4,11 +4,12 @@ import torch from vllm.attention import AttentionMetadata +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import (CPUModelRunner, +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPUBuilder, ModelInputForCPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -50,7 +51,8 @@ def from_broadcasted_tensor_dict( super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) -class CPUEncoderDecoderModelRunner(CPUModelRunner): +class CPUEncoderDecoderModelRunner( + CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( EncoderDecoderModelInputForCPU) _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder @@ -87,21 +89,29 @@ def prepare_model_input( virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> EncoderDecoderModelInputForCPU: - model_input = super().prepare_model_input(seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - model_input = cast(EncoderDecoderModelInputForCPU, model_input) + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) ( attn_metadata, encoder_input_tokens_tensor, encoder_input_positions_tensor, ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + pin_memory=False, + generators=generators) return dataclasses.replace( model_input, + sampling_metadata=sampling_metadata, attn_metadata=attn_metadata, encoder_input_tokens=encoder_input_tokens_tensor, encoder_input_positions=encoder_input_positions_tensor, + virtual_engine=virtual_engine, ) def _prepare_encoder_model_input_tensors( diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 1590184d6f831..d3e1202c15e61 
100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -2,7 +2,8 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import torch from torch import nn @@ -31,6 +32,7 @@ logger = init_logger(__name__) +TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU") _PAD_SLOT_ID = -1 @@ -60,10 +62,10 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type["ModelInputForCPU"], + cls: Type[TModelInputForCPU], tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None - ) -> "ModelInputForCPU": + ) -> TModelInputForCPU: if attn_backend is not None: tensor_dict = _init_attn_metadata_from_tensor_dict( attn_backend, tensor_dict) @@ -146,19 +148,29 @@ def build(self) -> ModelInputForCPU: query_lens=seq_lens, ) - def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, - seq_data: SequenceData, computed_len: int, - mm_processor_kwargs: Dict[str, Any]): - + def _compute_multi_modal_input( + self, + seq_data: SequenceData, + computed_len: int, + seq_group_metadata: SequenceGroupMetadata, + ): # NOTE: mm_data only includes the subset of multi-modal items that # intersect with the current prefill positions. mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group, range(computed_len, len(seq_data.get_token_ids()))) + seq_group_metadata, + range(computed_len, len(seq_data.get_token_ids())), + ) if not mm_data: - return + return None, None, None - mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs) + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) # special processing for mrope position deltas. mrope_positions = None @@ -200,7 +212,7 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -221,11 +233,14 @@ def _prepare_prompt( mrope_positions = None if seq_group_metadata.multi_modal_data: - mm_kwargs, placeholder_maps, mrope_positions = self \ - ._compute_multi_modal_input( - seq_group_metadata, seq_data, computed_len, - seq_group_metadata.mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + ( + mm_kwargs, + placeholder_maps, + mrope_positions, + ) = self._compute_multi_modal_input(seq_data, computed_len, + seq_group_metadata) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -255,11 +270,14 @@ def _prepare_prompt( slot_mapping.append(_PAD_SLOT_ID) continue - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) + # For encoder-only models, the block_table is None, + # and there is no need to initialize the slot_mapping. 
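The slot computation guarded just below is standard paged-KV arithmetic: the token at position i lives in block block_table[i // block_size] at offset i % block_size. A worked example with hypothetical numbers:

block_size = 16
block_table = [7, 3]  # hypothetical physical block numbers
i = 20                # token position within the sequence
block_number = block_table[i // block_size]  # 20 // 16 == 1 -> block 3
block_offset = i % block_size                # 20 % 16 == 4
slot = block_number * block_size + block_offset
assert slot == 3 * 16 + 4 == 52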
+ if block_table is not None: + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) if any(input_mrope_positions): input_positions = None # type: ignore @@ -297,7 +315,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -402,10 +420,12 @@ def _prepare_decode( ) -class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( - ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder +class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): + """ + Helper class for shared methods between CPU model runners. + """ + _model_input_cls: Type[TModelInputForCPU] + _builder_cls: Type[ModelInputForCPUBuilder] def __init__( self, @@ -448,20 +468,11 @@ def __init__( def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForCPUWithSamplingMetadata: - return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 - tensor_dict, - attn_backend=self.attn_backend, - ) - def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithSamplingMetadata: + ) -> TModelInputForCPU: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. 
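The CPUModelRunnerBase refactor above follows a common pattern: a base class generic over its model-input type, with each subclass pinning the concrete type so shared helpers return the right class. A minimal sketch with toy types (not vLLM's classes):

from dataclasses import dataclass
from typing import Generic, List, Optional, Type, TypeVar

@dataclass(frozen=True)
class ToyModelInput:
    tokens: List[int]

@dataclass(frozen=True)
class ToyModelInputWithSampling(ToyModelInput):
    sampling_metadata: Optional[object] = None

TToyInput = TypeVar("TToyInput", bound=ToyModelInput)

class ToyRunnerBase(Generic[TToyInput]):
    # Subclasses pin the concrete input class, as CPUModelRunner does with
    # ModelInputForCPUWithSamplingMetadata.
    _model_input_cls: Type[TToyInput]

    def _prepare_model_input_tensors(self, tokens: List[int]) -> TToyInput:
        return self._model_input_cls(tokens=tokens)

class ToySamplingRunner(ToyRunnerBase[ToyModelInputWithSampling]):
    _model_input_cls = ToyModelInputWithSampling

assert ToySamplingRunner()._prepare_model_input_tensors([1, 2]).tokens == [1, 2]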
@@ -473,6 +484,21 @@ def _prepare_model_input_tensors( return builder.build() # type: ignore + +class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( + ModelInputForCPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForCPUWithSamplingMetadata: + return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 + tensor_dict, + attn_backend=self.attn_backend, + ) + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 162e1e4be873b..bc9164bd9d5df 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -14,8 +14,9 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner -from vllm.worker.cpu_model_runner import CPUModelRunner +from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerBase, WorkerInput) @@ -150,21 +151,20 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] - ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner + ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": - raise NotImplementedError( - "Embedding models are not supported for CPU backend") - # ModelRunnerClass = CPUEmbeddingModelRunner + ModelRunnerClass = CPUEmbeddingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner - self.model_runner: CPUModelRunner = ModelRunnerClass( + self.model_runner: CPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - self.cpu_cache: List[List[torch.Tensor]] + # Initialize cpu_cache as embedding models don't initialize kv_caches + self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 008e0c9745994..82824faa6629a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -9,15 +9,13 @@ AttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, - get_global_forced_attn_backend, - global_force_attn_backend) -from vllm.config import ModelConfig, VllmConfig + get_global_forced_attn_backend) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.utils import get_architecture_class_name from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams @@ -35,11 +33,6 @@ logger = init_logger(__name__) -# The Mllama model has PagedAttention specific logic because of which it -# can only be run with the XFORMERS backend -# TODO Make Mllama model work with Flash Attention backend. -_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"] - @dataclasses.dataclass(frozen=True) class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): @@ -97,7 +90,7 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. ''' - self._maybe_force_supported_attention_backend(vllm_config.model_config) + self._maybe_force_supported_attention_backend() super().__init__( vllm_config=vllm_config, @@ -108,12 +101,7 @@ def __init__( # Crash for unsupported encoder/scenarios assert_enc_dec_mr_supported_scenario(self) - def _is_xformers_only_encoder_decoder_model(self, - model: ModelConfig) -> bool: - return get_architecture_class_name( - model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS - - def _maybe_force_supported_attention_backend(self, model: ModelConfig): + def _maybe_force_supported_attention_backend(self): ''' Force vLLM to use the XFormers attention backend, which is currently the only supported option. @@ -128,23 +116,13 @@ def raise_backend_err(): maybe_global_forced_backend = get_global_forced_attn_backend() is_forced_by_global = maybe_global_forced_backend is not None is_forced_by_env_var = maybe_env_var_forced_backend is not None - - if not (is_forced_by_global or is_forced_by_env_var) \ - and self._is_xformers_only_encoder_decoder_model(model): - # The user has not already specified an attention backend - # override - logger.info( - "Encoder-Decoder Model Architecture %s requires XFormers " - "backend; overriding backend auto-selection and " - "forcing XFormers.", get_architecture_class_name(model)) - global_force_attn_backend(_Backend.XFORMERS) - elif is_forced_by_global: + if is_forced_by_global: # noqa: SIM102 # Backend override enforced by global variable takes # precedence over vLLM backend environment variable. 
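The override checks that continue below amount to: a forced backend, with the global override taking precedence over the env-var override, must be one of the two supported options. A compact restatement (editorial sketch; strings stand in for _Backend members):

from typing import Optional

def resolve_forced_backend(global_backend: Optional[str],
                           env_backend: Optional[str]) -> Optional[str]:
    allowed = {"XFORMERS", "FLASH_ATTN"}
    forced = global_backend if global_backend is not None else env_backend
    if forced is not None and forced not in allowed:
        raise ValueError(f"Unsupported forced attention backend: {forced}")
    return forced

assert resolve_forced_backend(None, "XFORMERS") == "XFORMERS"
assert resolve_forced_backend("FLASH_ATTN", "XFORMERS") == "FLASH_ATTN"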
if maybe_global_forced_backend not in\ [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() - elif is_forced_by_env_var: + elif is_forced_by_env_var: # noqa: SIM102 # Backend override enforced by vLLM backend # environment variable if maybe_env_var_forced_backend not in\ diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 92d6552b2f428..1ff30d685c6b1 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + multi_modal_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e1446192ce3d6..042f9f07eace6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -252,7 +252,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. - multi_model_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -373,7 +373,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_model_kwargs = multi_model_kwargs + self.multi_modal_kwargs = multi_modal_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -542,6 +542,9 @@ def _compute_for_prefix_cache_hit( # this may be larger than the sequence length if chunked # prefill is enabled. prefix_cache_len = len(computed_block_nums) * self.block_size + seq_group_metadata.seq_data[inter_data.seq_ids[ + seq_idx]].update_num_cached_tokens(prefix_cache_len) + # The number of so far computed prompt tokens in this sequence. context_len = inter_data.context_lens[seq_idx] # The total number of prompt tokens in this sequence. @@ -658,10 +661,15 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_model_kwargs = mm_kwargs + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -935,11 +943,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. 
- multi_model_kwargs_list = [ - data.multi_model_kwargs for data in self.inter_data_list - if data.multi_model_kwargs is not None + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 0ed33e435aa2f..ae4eb6ba6eaec 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -67,7 +67,8 @@ def __init__( self.pin_memory = is_pin_memory_available() # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. @@ -122,7 +123,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -144,12 +145,15 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: - # Process multi-modal data - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, - ) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +171,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 378e1e06039b2..6000e5dfe4e30 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -70,7 +70,8 @@ def __init__( ) # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. @@ -102,7 +103,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -222,11 +223,15 @@ def _prepare_model_input( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata. 
- mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +280,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return ModelInput( input_tokens, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index c9e637c057979..e6322e095bbb9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -191,8 +191,16 @@ def _prepare_prompt( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + if self.runner.mm_registry.has_processor( + self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.runner.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +272,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs)
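A closing note on the branch repeated across the CPU, GPU, Neuron, OpenVINO, and XPU runners above: when the model has a new-style multimodal processor registered, the raw multimodal data is passed through untouched (it is consumed downstream), and only otherwise does the legacy per-runner input mapper run. Condensed, with stand-in names:

def compute_mm_kwargs(mm_registry, model_config, input_mapper, mm_data,
                      mm_processor_kwargs):
    # New-style processors consume raw mm_data later in the pipeline.
    if mm_registry.has_processor(model_config):
        return mm_data
    # Legacy path: eagerly map raw data into model kwargs.
    return input_mapper(mm_data, mm_processor_kwargs)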