diff --git a/.github/workflows/coverage-tests.yml b/.github/workflows/coverage-tests.yml index bcac25393..f44dba5f9 100644 --- a/.github/workflows/coverage-tests.yml +++ b/.github/workflows/coverage-tests.yml @@ -38,14 +38,15 @@ jobs: submodules: false set-safe-directory: true + - name: Coverage Online Report + run: | + REPORT_ADDR=$(cat "/workspace/config/report_address") + echo "After 'Run Unit Tests and Check Coverage' is completed/stopped, you can access the test diff-coverage report at the http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html" + - name: Run Unit Tests and Check Coverage run: | # Execute the test coverage script with the appropriate backend and id (commit SHA) tests/scripts/unit_tests/test_coverage.sh --backend ${{ inputs.backend }} --id ${{ github.sha }} --status online - - name: Coverage Online Report - run: | - REPORT_ADDR=$(cat "/workspace/config/report_address") - echo "You can access the test diff-coverage report at the http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/diff-cover-report-${{ inputs.backend }}.html" diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 06eefb64d..38c0509b7 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -40,11 +40,12 @@ jobs: lfs: false submodules: false set-safe-directory: true - - - name: Run Unit Test - run: tests/scripts/unit_tests/test_subset.sh --backend ${{ inputs.backend }} --subset ${{ inputs.subset }} --id ${{ github.sha }} - + - name: Unit Test Coverage Online Report run: | REPORT_ADDR=$(cat "/workspace/config/report_address") - echo "You can access the test coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/index.html" + echo "After 'Run Unit Test' is completed/stopped, you can access the test coverage report at http://${REPORT_ADDR}/${{github.sha}}/cov-report-${{ inputs.backend }}/index.html" + + - name: Run Unit Test + run: tests/scripts/unit_tests/test_subset.sh --backend ${{ inputs.backend }} --subset ${{ inputs.subset }} --id ${{ github.sha }} + diff --git a/.gitignore b/.gitignore index 3b579c698..8e520bcbb 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,6 @@ slurm* logs .vscode log_file/* -outputs \ No newline at end of file +outputs +*.log +*.out diff --git a/README.md b/README.md index a49ce14ab..809db47f4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[](https://flagopen.baai.ac.cn/) + ## Latest News - **[2024/11]** Released [v0.6.0](https://github.com/FlagOpen/FlagScale/tree/release/v0.6.0): - Introduced general multi-dimensional heterogeneous parallelism and CPU-based communication between different chips. @@ -12,7 +14,9 @@ [FlagScale](https://github.com/FlagOpen/FlagScale.git) is a comprehensive toolkit designed to support the entire lifecycle of large models, developed with the backing of the Beijing Academy of Artificial Intelligence (BAAI). It builds on the strengths of several prominent open-source projects, including [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) and [vllm](https://github.com/vllm-project/vllm), to provide a robust, end-to-end solution for managing and scaling large models. -The primary objective of FlagScale is to enable seamless scalability across diverse hardware architectures while maximizing computational resource efficiency and enhancing model performance. By offering essential components for model development, training, and deployment, FlagScale aims to serve as an indispensable toolkit for optimizing both the speed and effectiveness of large model workflows. +The primary objective of FlagScale is to enable seamless scalability across diverse hardware architectures while maximizing computational resource efficiency and enhancing model performance. By offering essential components for model development, training, and deployment, FlagScale seeks to establish itself as an indispensable toolkit for optimizing both the speed and effectiveness of large model workflows. + +FlagScale is also a part of [FlagAI-Open](https://flagopen.baai.ac.cn/), an open-source initiative by BAAI that aims to foster an open-source ecosystem for AI technologies. It serves as a platform where developers, researchers, and AI enthusiasts can collaborate on various AI projects, contribute to the development of cutting-edge AI solutions, and share their work with the global community. ## Quick Start @@ -43,13 +47,15 @@ We recommend using the latest release of [NGC's PyTorch container](https://catal cd vllm pip install . - cd megatron-energon - pip install . + pip install -e ./megatron-energon + cp -r megatron-energon/src/megatron/energon megatron/megatron ``` ### Run a Task -FlagScale provides a unified runner for various tasks, including training and inference. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task. +FlagScale provides a unified runner for various tasks, including training,inference and serve. Simply specify the configuration file to run the task with a single command. The runner will automatically load the configurations and execute the task. The following example demonstrates how to run a distributed training task. + +#### Train 1. Start the distributed training job: ```sh @@ -62,6 +68,18 @@ FlagScale provides a unified runner for various tasks, including training and in python run.py --config-path ./examples/aquila/conf --config-name config action=stop ``` +#### Serve + +1. Start the server: + ```sh + python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=run + ``` +2. Stop the server: + ```sh + python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=stop + ``` +For more details, please refer to [Quick Start](./flagscale/serve/README.md). + ## License This project is licensed under the [Apache License (Version 2.0)](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE). This project also contains other third-party components under other open-source licenses. See the [LICENSE](https://github.com/FlagOpen/FlagScale/blob/main/LICENSE) file for more information. diff --git a/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml b/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml deleted file mode 100644 index 90995e2d9..000000000 --- a/examples/llama/conf/train/train_llama2_7b_tp_hetero.yaml +++ /dev/null @@ -1,67 +0,0 @@ -system: - tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 3 - disable_bias_linear: True - use_flash_attn: True - sequence_parallel: True - use_distributed_optimizer: True - hetero_mode: pp - hetero_device_types: A100 - hetero_current_device_type: A100 - hetero_pipeline_stages: [3,16,8,8] - process_meshes: [4,1,1,2,1,2] - precision: - bf16: True - initial_loss_scale: 16384 - min_loss_scale: 1.0 - logging: - log_interval: 1 - checkpoint: - save_interval: 100 - -model: - use_mcore_models: True - transformer_impl: transformer_engine - num_layers: 32 - hidden_size: 4096 - ffn_hidden_size: 11008 - num_attention_heads: 32 - seq_length: 4096 - group_query_attention: False - num_query_groups: 8 - max_position_embeddings: 4096 - norm_epsilon: 1e-5 - use_rotary_position_embeddings: True - no_position_embedding: True - swiglu: True - normalization: RMSNorm - untie_embeddings_and_output_weights: True - init_method_std: 0.02 - attention_dropout: 0.0 - hidden_dropout: 0.0 - weight_decay: 0.1 - clip_grad: 1.0 - train_iters: 30 - eval_iters: 0 - eval_interval: 2000 - micro_batch_size: 1 - global_batch_size: 32 - - optimizer: - weight_decay: 1e-2 - adam_beta1: 0.9 - adam_beta2: 0.95 - lr_scheduler: - lr: 0.00015 - min_lr: 1.0e-5 - lr_warmup_fraction: .01 - lr_decay_iters: 1 - lr_decay_style: cosine - -data: - data_path: ${data_path:??} - split: 1 - tokenizer: - tokenizer_type: Llama2Tokenizer - tokenizer_model: examples/llama/tokenizer.model - vocab_size: 32000 diff --git a/examples/llama/conf/train/train_llama3_8b_hetero.yaml b/examples/llama/conf/train/train_llama3_8b_hetero.yaml new file mode 100644 index 000000000..c73be22c8 --- /dev/null +++ b/examples/llama/conf/train/train_llama3_8b_hetero.yaml @@ -0,0 +1,100 @@ +system: + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 2 + disable_bias_linear: True + use_flash_attn: True + sequence_parallel: True + use_distributed_optimizer: True + precision: + bf16: True + attention_softmax_in_fp32: true + accumulate_allreduce_grads_in_fp32: true + logging: + log_interval: 1 + tensorboard_log_interval: 1 + wandb_project: "train-llama3-8B" + wandb_exp_name: "train-test-8B" + checkpoint: + load: outputs_llama3/checkpoint_mc + save_interval: 10 + finetune: True + ckpt_format: "torch" + +model: + use_mcore_models: True + transformer_impl: transformer_engine + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 14336 + num_attention_heads: 32 + seq_length: 4096 + group_query_attention: True + num_query_groups: 8 + max_position_embeddings: 8192 + norm_epsilon: 1e-5 + use_rotary_position_embeddings: True + no_position_embedding: True + swiglu: True + normalization: RMSNorm + rotary_interleaved_patch: False + position_embedding_type: rope + rotary_base: 500000 + untie_embeddings_and_output_weights: True + init_method_std: 0.02 + attention_dropout: 0.0 + hidden_dropout: 0.0 + clip_grad: 1.0 + train_samples: 200000 + eval_iters: 100 + eval_interval: 1000 + micro_batch_size: 1 + global_batch_size: 16 + + hetero: + enable_hetero: True + hetero_use_cpu_communication: True + # mesh format [tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)] + + # 2 mesh, diff tp dp pp + hetero_pipeline_layer_split: [18, 14] + hetero_process_meshes: [2, 1, 1, 4, 1, 4, 1, 1, 2, 1] + hetero_device_types: ["A800", "A100"] + + standalone_embedding_stage: False + hetero_current_device_type: "A800" + + # recompute: + # recompute_granularity: "full" + # recompute_method: "uniform" + # recompute_num_layers: 1 + + # ## pp 2 stages and num_micro_batches 4 + # recompute_granularity_per_stage_micro_batch: + # - [1, 3, 0, 1, 0] + # - [1, 3, 1, 1, 1] + # recompute_method_per_stage_micro_batch: + # - [1, 3, 0, 1, 0] + # - [1, 3, 0, 1, 0] + # recompute_num_layers_per_stage_micro_batch: + # - [1, 3, 2, 1, 2] + # - [1, 3, 1, 1, 1] + + + optimizer: + weight_decay: 1e-2 + adam_beta1: 0.9 + adam_beta2: 0.95 + lr_scheduler: + lr: 1.0e-5 + min_lr: 1.0e-6 + lr_warmup_fraction: .1 + lr_decay_style: cosine + +data: + data_path: examples/llama/pile-openwebtext_text_document/pile-openwebtext_text_document + split: 1 + tokenizer: + tokenizer_type: Llama3TokenizerFS + tokenizer_path: meta-llama3/Meta-Llama-3-8B + vocab_size: 128256 + make_vocab_size_divisible_by: 64 diff --git a/examples/llava_onevision/conf/train/train_llava_onevision_1.5b.yaml b/examples/llava_onevision/conf/train/train_llava_onevision_1.5b.yaml index 142e59a8f..c9b44e0bd 100644 --- a/examples/llava_onevision/conf/train/train_llava_onevision_1.5b.yaml +++ b/examples/llava_onevision/conf/train/train_llava_onevision_1.5b.yaml @@ -44,6 +44,11 @@ model: hidden_dropout: 0.0 clip_grad: 1.0 train_iters: 10 + profile: False + profile-step-start: 10 + profile-step-end: 20 + profile_ranks: 7 + use_pytorch_profiler: True eval_iters: 0 micro_batch_size: 2 global_batch_size: 512 diff --git a/vllm/vllm/v1/tokenizer/__init__.py b/examples/qwen/__init__.py similarity index 100% rename from vllm/vllm/v1/tokenizer/__init__.py rename to examples/qwen/__init__.py diff --git a/examples/qwen/conf/config_qwen2.5_1.5b.yaml b/examples/qwen/conf/config_qwen2.5_1.5b.yaml new file mode 100644 index 000000000..c3ab52d9f --- /dev/null +++ b/examples/qwen/conf/config_qwen2.5_1.5b.yaml @@ -0,0 +1,30 @@ +defaults: + - _self_ + - train: train_qwen_2.5_1.5b + # - train: train_mixtral_1.8b + +experiment: + exp_name: train_qwen_2.5_1.5b + exp_dir: ./outputs # outputs ## log、checkpoints output path + task: + type: train + backend: megatron + entrypoint: ./flagscale/train/train_aquila.py + runner: + backend: torchrun + nnodes: 2 + nproc_per_node: 8 + hostfile: torchrun # Please replace with your actual hostfile path + envs: + CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NCCL_SOCKET_IFNAME: eth0 + NCCL_IB_DISABLE: 0 + NCCL_IB_CUDA_SUPPORT: 1 + NCCL_IB_GID_INDEX: 0 + NCCL_DEBUG: INFO + OMP_NUM_THREADS: 4 + GLOO_SOCKET_IFNAME: eth0 + NCCL_IB_HCA: mlx5_2,mlx5_5 + +action: run diff --git a/examples/qwen/conf/config_qwen2.5_72b_tp.yaml b/examples/qwen/conf/config_qwen2.5_72b_tp.yaml new file mode 100644 index 000000000..a68a1e752 --- /dev/null +++ b/examples/qwen/conf/config_qwen2.5_72b_tp.yaml @@ -0,0 +1,22 @@ +defaults: + - _self_ + - serve: serve_qwen2.5_72b + +experiment: + exp_name: qwen2.5_72b + exp_dir: outputs/${experiment.exp_name} + task: + type: serve + backend: vllm + entrypoint: null + runner: + hostfile: null + envs: + CUDA_VISIBLE_DEVICES: 0,1,2,3 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/examples/qwen/conf/config_qwen2.5_7b.yaml b/examples/qwen/conf/config_qwen2.5_7b.yaml new file mode 100644 index 000000000..48609ecd3 --- /dev/null +++ b/examples/qwen/conf/config_qwen2.5_7b.yaml @@ -0,0 +1,22 @@ +defaults: + - _self_ + - serve: serve_qwen2.5_7b + +experiment: + exp_name: qwen2.5_7b + exp_dir: outputs/${experiment.exp_name} + task: + type: serve + backend: vllm + entrypoint: null + runner: + hostfile: null + envs: + CUDA_VISIBLE_DEVICES: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/examples/qwen/conf/config_ssh_qwen2.5_7b.yaml b/examples/qwen/conf/config_ssh_qwen2.5_7b.yaml new file mode 100644 index 000000000..3a94895e4 --- /dev/null +++ b/examples/qwen/conf/config_ssh_qwen2.5_7b.yaml @@ -0,0 +1,25 @@ +defaults: + - _self_ + - serve: serve_qwen2.5_7b + +experiment: + exp_name: qwen2.5_7b + exp_dir: outputs/${experiment.exp_name} + task: + type: serve + backend: vllm + entrypoint: null + runner: + hostfile: /path/to/hostfile # type: {remote ip} slots={gpu num} type={gpu type} (like: x.x.x.x slots=8 type=A100) + ssh_port: 22 # replace with your ssh port + envs: + CUDA_VISIBLE_DEVICES: 0 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + cmds: + before_start: source /root/miniconda3/bin/activate flagscale + +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra diff --git a/examples/qwen/conf/serve/serve_qwen2.5_72b.yaml b/examples/qwen/conf/serve/serve_qwen2.5_72b.yaml new file mode 100644 index 000000000..86d05e037 --- /dev/null +++ b/examples/qwen/conf/serve/serve_qwen2.5_72b.yaml @@ -0,0 +1,17 @@ +model_args: + vllm_model: + model-tag: /models/Qwen2.5-72B-Instruct + tensor-parallel-size: 4 + gpu-memory-utilization: 0.9 + max-model-len: 32768 + max-num-seqs: 256 + port: 4567 + action-args: + - trust-remote-code + - enable-chunked-prefill + +deploy: + command-line-mode: true + models: + vllm_model: + num_gpus: 4 diff --git a/examples/qwen/conf/serve/serve_qwen2.5_7b.yaml b/examples/qwen/conf/serve/serve_qwen2.5_7b.yaml new file mode 100644 index 000000000..0faa78b42 --- /dev/null +++ b/examples/qwen/conf/serve/serve_qwen2.5_7b.yaml @@ -0,0 +1,17 @@ +model_args: + vllm_model: + model-tag: /models/Qwen2.5-7B-Instruct + tensor-parallel-size: 1 + gpu-memory-utilization: 0.9 + max-model-len: 32768 + max-num-seqs: 256 + port: 4567 + action-args: + - trust-remote-code + - enable-chunked-prefill + +deploy: + command-line-mode: true + models: + vllm_model: + num_gpus: 1 diff --git a/examples/qwen/conf/train/qwen_2.5_1.5b.yaml b/examples/qwen/conf/train/qwen_2.5_1.5b.yaml new file mode 100644 index 000000000..2ea5368fb --- /dev/null +++ b/examples/qwen/conf/train/qwen_2.5_1.5b.yaml @@ -0,0 +1,80 @@ +system: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + make_vocab_size_divisible_by: 128 + disable_bias_linear: True + sequence_parallel: True + use_flash_attn: True + use_distributed_optimizer: True + distributed-timeout-minutes: 60 + precision: + bf16: True + attention_softmax_in_fp32: True + accumulate_allreduce_grads_in_fp32: True + logging: + log_interval: 1 + tensorboard_log_interval: 1 + wandb_project: "train-qwen2.5-1.5B" + wandb_exp_name: "train-qwen2.5-1.5B" + checkpoint: + load: ${megatron_model__path:} + # If you want to train the model, you need to comment out ckpt_format, ckpt_convert_format, ckpt_convert_save, which are used for converting ckpt. + ckpt_format: torch_dist # ${experiment.ckpt_format} + ckpt_convert_format: torch # ${experiment.ckpt_convert_format} + ckpt_convert_save: ${experiment.ckpt_convert_save} + save_interval: 5000000 + rampup_save_interval: 50000 + +model: + use_mcore_models: true + num_layers: 28 + hidden_size: 1536 + num_attention_heads: 12 + num_query_groups: 2 + group_query_attention: True + ffn_hidden_size: 8960 + seq_length: 4096 + max_position_embeddings: 4096 + norm_epsilon: 1e-6 + norm_init_weight: 0.02 + use_rotary_position_embeddings: true + rotary_base: 1000000.0 + no_position_embedding: true + reset_position_ids: true + add_qkv_bias: true + reset_attention_mask: true + swiglu: true + normalization: RMSNorm + untie_embeddings_and_output_weights: false + init_method_std: 0.02 + attention_dropout: 0.0 + hidden_dropout: 0.0 + weight_decay: 0.0 + clip_grad: 1.0 + train_samples: 1478125 + eval_iters: 0 + eval_interval: 2000000 + micro_batch_size: 1 + global_batch_size: 512 + finetune: true + transformer_impl: transformer_engine + seed: 42 + #data_searching_range: [1156,1274] + optimizer: + weight_decay: 0.0 + adam_beta1: 0.9 + adam_beta2: 0.95 + lr_scheduler: + lr: 1e-5 + min_lr: 0 + lr_warmup_samples: 21120 + lr_decay_style: cosine + +data: + data_path: ${data_path:??} + split: 1 + apply_sft_dataset_separated_loss_mask_if_existed: true + tokenizer: + tokenizer_type: HFTokenizerFS + tokenizer_path: ${HF_model_path:??} + vocab_size: 151665 diff --git a/examples/qwen/utils/__init__.py b/examples/qwen/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/qwen/utils/convo_dataset.py b/examples/qwen/utils/convo_dataset.py new file mode 100644 index 000000000..a9234b7d2 --- /dev/null +++ b/examples/qwen/utils/convo_dataset.py @@ -0,0 +1,237 @@ +"""GPT style dataset.""" + +import copy +import hashlib +import os +import time + +import numpy as np +import torch + +from megatron import print_rank_0 +from megatron.core import mpu +from megatron.data.data_samplers import RandomSeedDataset + +class ConversationDatasetCPT(torch.utils.data.Dataset): + def __init__(self, conversations, tokenizer, maxlen, seed, num_samples, role_sep="\n\n"): + super(ConversationDatasetCPT, self).__init__() + self.conversations = conversations + self.tokenizer = tokenizer + self.maxlen = maxlen+1 + self.seed = seed + self.num_samples = num_samples + + ## TODO convo template + self.sep = role_sep + + # rng state + np_rng = np.random.RandomState(seed=seed) + np_rng.shuffle(self.conversations) + + def __getitem__(self, i): + source = self.conversations[i] + + instruction = source['instruction'] + conversations = source['conversations'] + + BOS_TOKEN = self.tokenizer.cls + EOS_TOKEN = self.tokenizer.eod + example = [BOS_TOKEN] + + # instruction + instruction = self.tokenizer.tokenize(f"{instruction}") + example += instruction + + labels = [-100] * len(example) + + for conversation in conversations: + role = conversation['from'] + content = conversation['value'] + content += self.sep + + content = self.tokenizer.tokenize(f"{content}") + + example += content + if role == 'gpt': + role_labels = copy.deepcopy(content) + else: + # masking + role_labels = [-100] * len(content) + labels += role_labels + + example.append(EOS_TOKEN) + labels.append(EOS_TOKEN) + + # maxlen + example = example[:self.maxlen] + labels = labels[:self.maxlen] + + # padding + delta = self.maxlen - len(example) + if delta > 0: + example.extend([self.tokenizer.pad]*delta) + labels.extend([-100]*delta) + + output = { + "tokens": np.array(example, dtype=np.int64), + "labels": np.array(labels, dtype=np.int64), + } + return output + + def __len__(self): + return len(self.conversations) + + +class ConversationDatasetV2(torch.utils.data.Dataset): + def __init__(self, conversations, tokenizer, maxlen, seed, num_samples): + super(ConversationDatasetV2, self).__init__() + self.conversations = conversations + self.tokenizer = tokenizer + self.maxlen = maxlen+1 + self.seed = seed + self.num_samples = num_samples + + # rng state + np_rng = np.random.RandomState(seed=seed) + np_rng.shuffle(self.conversations) + + + def __getitem__(self, i): + from examples.aquila.utils.convo_prompt import _add_speaker_and_signal + from examples.aquila.utils.convo_prompt import header + + #source = self.conversations[self.sample_idx[i]] + source = self.conversations[i] + _add_speaker_and_signal(source) + + source["chat_desc"] = header + chat_desc = source['chat_desc'] + instruction = source['instruction'] + conversations = source['conversations'] + + BOS_TOKEN = self.tokenizer.cls + EOS_TOKEN = self.tokenizer.eod + example = [BOS_TOKEN] + + # chat_desc + example += self.tokenizer.tokenize(f"{chat_desc}") + + # instruction + instruction = self.tokenizer.tokenize(f"{instruction}") + example += instruction + + labels = copy.deepcopy(example) + # add zero-out + #labels = [-100] * len(example) + + for conversation in conversations: + role = conversation['from'] + content = conversation['value'] + content = self.tokenizer.tokenize(f"{content}") + example += content + if role == 'gpt': + role_labels = copy.deepcopy(content) + else: + # masking + role_labels = [-100] * len(content) + labels += role_labels + + example.append(EOS_TOKEN) + labels.append(EOS_TOKEN) + + # maxlen + example = example[:self.maxlen] + labels = labels[:self.maxlen] + + # padding + delta = self.maxlen - len(example) + if delta > 0: + example.extend([self.tokenizer.pad]*delta) + labels.extend([-100]*delta) + + output = { + "tokens": np.array(example, dtype=np.int64), + "labels": np.array(labels, dtype=np.int64), + } + return output + + def __len__(self): + #return len(self.sample_idx) + return len(self.conversations) + + +def build_train_valid_test_datasets(train_valid_test_num_samples, + seq_length, seed, tokenizer, + train_data_prefix, + valid_data_prefix, + test_data_prefix=None, + finetune_dataset_type=None): + """Build train, valid, and test datasets.""" + suppored_dataset_types = dict(CPT=ConversationDatasetCPT) + dataset_cls = ConversationDatasetV2 + if finetune_dataset_type in suppored_dataset_types: + dataset_cls = suppored_dataset_types[finetune_dataset_type] + + def read_file(jsonl_file): + import jsonlines + conversations = [] + with jsonlines.open(jsonl_file) as reader: + for line in reader: + conversations.append(line) + return conversations + + train_dataset, valid_dataset, test_dataset = None, None, None + # Single dataset. + if train_data_prefix is not None: + train_conversations = read_file(train_data_prefix[0]) + train_dataset = dataset_cls( + train_conversations, + tokenizer=tokenizer, + maxlen=seq_length, + seed=seed, + num_samples=train_valid_test_num_samples[0]) + train_dataset = RandomSeedDataset(train_dataset) + + if valid_data_prefix is not None: + valid_conversations = read_file(valid_data_prefix[0]) + valid_dataset = dataset_cls( + valid_conversations, + tokenizer=tokenizer, + maxlen=seq_length, + seed=seed, + num_samples=train_valid_test_num_samples[1]) + valid_dataset = RandomSeedDataset(valid_dataset) + + if test_data_prefix is not None: + test_conversations = read_file(test_data_prefix[0]) + test_dataset = dataset_cls( + test_conversations, + tokenizer=tokenizer, + maxlen=seq_length, + seed=seed, + num_samples=train_valid_test_num_samples[2]) + test_dataset = RandomSeedDataset(test_dataset) + + return (train_dataset, valid_dataset, test_dataset) + +if __name__ == "__main__": + train_valid_test_num_samples = [12000,2000,0] + seq_length = 2048 + seed = 1234 + from megatron.tokenizer.tokenizer import _AquilaTokenizer + tokenizer = _AquilaTokenizer( + '../examples/aquila/tokenizer/vocab.json', + '../examples/aquila/tokenizer/merges.txt') + print(f"{dir(tokenizer)}") + train_data_prefix = ['path/to/train/set'] + valid_data_prefix = ['path/to/valid/set'] + train_dataset, valid_dataset, test_dataset = build_train_valid_test_datasets( + train_valid_test_num_samples, + seq_length, seed, tokenizer, + train_data_prefix, + valid_data_prefix, + test_data_prefix=None) + for idx, sample in enumerate(train_dataset): + print(f"idx={idx} sample={type(sample['labels'])}") + break + diff --git a/examples/qwen/utils/convo_prompt.py b/examples/qwen/utils/convo_prompt.py new file mode 100644 index 000000000..e0e947282 --- /dev/null +++ b/examples/qwen/utils/convo_prompt.py @@ -0,0 +1,166 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple, Any + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + instruction: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + + skip_next: bool = False + conv_id: Any = None + + def get_prompt(self): + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + if self.instruction is not None and len(self.instruction) > 0: + ret += self.roles[2] + ": " + self.instruction + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + if self.instruction is not None and len(self.instruction) > 0: + ret += self.roles[2] + ": " + self.instruction + self.sep + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def append_message(self, role, message): + self.messages.append([role, message]) + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + instruction=self.instruction, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + conv_id=self.conv_id) + + def dict(self): + return { + "system": self.system, + "instruction": self.instruction, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + "conv_id": self.conv_id, + } + + +conv_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + instruction="", + roles=("Human", "Assistant", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_v1_2 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + instruction="", + roles=("Human", "Assistant", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_bair_v1 = Conversation( + system="BEGINNING OF CONVERSATION:", + instruction="", + roles=("USER", "GPT", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + + +default_conversation = conv_v1_2 +conv_templates = { + "v1": conv_v1_2, + "bair_v1": conv_bair_v1, +} + +# utils +"""Add speaker and start/end signal on each round.""" +BEGIN_SIGNAL = "### " +END_SIGNAL = "\n" +header = f"{default_conversation.system}\n\n" +unknown_role = "unknown" # use default unknown role +roles = { + "human": default_conversation.roles[0], # human role + "gpt": default_conversation.roles[1], # gpt role +} + +def _add_speaker_and_signal(source, get_conversation=True): + conversation = header + if "instruction" in source and source["instruction"] is not None and len(source["instruction"]) > 0: + source["instruction"] = ( + BEGIN_SIGNAL + + conversation_lib.default_conversation.roles[2] + + ": " + + source["instruction"] + + END_SIGNAL + ) + if get_conversation: + conversation += source["instruction"] + for sentence in source["conversations"]: + sentence_from = sentence["from"].lower() + sentence["value"] = ( + BEGIN_SIGNAL + + roles.get(sentence_from, unknown_role) + + ": " + + sentence["value"] + + END_SIGNAL + ) + if get_conversation: + conversation += sentence["value"] + return conversation + +if __name__ == "__main__": + print(default_conversation.get_prompt()) diff --git a/examples/qwen/utils/cyg_conversation.py b/examples/qwen/utils/cyg_conversation.py new file mode 100644 index 000000000..6bb9c3a0f --- /dev/null +++ b/examples/qwen/utils/cyg_conversation.py @@ -0,0 +1,157 @@ +import dataclasses +from enum import auto, Enum +from typing import List, Tuple, Any + + +class SeparatorStyle(Enum): + """Different separator style.""" + SINGLE = auto() + TWO = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that keeps all conversation history.""" + system: str + instruction: str + roles: List[str] + messages: List[List[str]] + offset: int + sep_style: SeparatorStyle = SeparatorStyle.SINGLE + sep: str = "###" + sep2: str = None + + skip_next: bool = False + conv_id: Any = None + + def get_prompt(self): + if self.sep_style == SeparatorStyle.SINGLE: + ret = self.system + self.sep + if self.instruction is not None and len(self.instruction) > 0: + ret += self.roles[2] + ": " + self.instruction + self.sep + for role, message in self.messages: + if message: + ret += role + ": " + message + self.sep + else: + ret += role + ":" + return ret + elif self.sep_style == SeparatorStyle.TWO: + seps = [self.sep, self.sep2] + ret = self.system + seps[0] + if self.instruction is not None and len(self.instruction) > 0: + ret += self.roles[2] + ": " + self.instruction + self.sep + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ": " + message + seps[i % 2] + else: + ret += role + ":" + return ret + else: + raise ValueError(f"Invalid style: {self.sep_style}") + + def append_message(self, role, message): + self.messages.append([role, message]) + + def to_gradio_chatbot(self): + ret = [] + for i, (role, msg) in enumerate(self.messages[self.offset:]): + if i % 2 == 0: + ret.append([msg, None]) + else: + ret[-1][-1] = msg + return ret + + def copy(self): + return Conversation( + system=self.system, + instruction=self.instruction, + roles=self.roles, + messages=[[x, y] for x, y in self.messages], + offset=self.offset, + sep_style=self.sep_style, + sep=self.sep, + sep2=self.sep2, + conv_id=self.conv_id) + + def dict(self): + return { + "system": self.system, + "instruction": self.instruction, + "roles": self.roles, + "messages": self.messages, + "offset": self.offset, + "sep": self.sep, + "sep2": self.sep2, + "conv_id": self.conv_id, + } + + +conv_v1 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + instruction="", + roles=("Human", "Assistant", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_v1_2 = Conversation( + system="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + instruction="", + roles=("Human", "Assistant", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.SINGLE, + sep="###", +) + +conv_bair_v1 = Conversation( + system="BEGINNING OF CONVERSATION:", + instruction="", + roles=("USER", "GPT", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.TWO, + sep=" ", + sep2="", +) + + +default_conversation = conv_v1_2 +conv_templates = { + "v1": conv_v1_2, + "bair_v1": conv_bair_v1, +} + + +def covert_prompt_to_input_ids_with_history(text, history, tokenizer, max_token): + conv = default_conversation.copy() + + conv.append_message(conv.roles[1], None) + conv.append_message(conv.roles[0], text) + + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + while(len(history) > 0 and (len(example) < max_token)): + tmp = history.pop() + if tmp[0] == 'ASSISTANT': + conv.append_message(conv.roles[1], tmp[1]) + else: + conv.append_message(conv.roles[0], tmp[1]) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + + if len(example) >= max_token: + conv.messages.pop() + conv.messages = conv.messages[::-1] + print('model in:', conv.get_prompt()) + example = tokenizer.encode_plus(f"{conv.get_prompt()} ", None, max_length=None)['input_ids'] + #example = example[1:-1] + + return example + +if __name__ == "__main__": + print(default_conversation.get_prompt()) + diff --git a/flagopen.png b/flagopen.png new file mode 100644 index 000000000..82ffee806 Binary files /dev/null and b/flagopen.png differ diff --git a/flagscale/inference/core/block_manager.py b/flagscale/inference/core/block_manager.py index 34a0d3d52..93770b8dc 100644 --- a/flagscale/inference/core/block_manager.py +++ b/flagscale/inference/core/block_manager.py @@ -16,9 +16,7 @@ SeqId = int EncoderSeqId = str -# --- FLAGSCALE MODIFICATION BEG --- -NegativeSeqId = int -# --- FLAGSCALE MODIFICATION END --- +NegativeSeqId = int # --- FLAGSCALE MODIFICATION --- class SelfAttnBlockSpaceManager(BlockSpaceManager): @@ -103,12 +101,10 @@ def __init__( self.block_tables: Dict[SeqId, BlockTable] = {} self.cross_block_tables: Dict[EncoderSeqId, BlockTable] = {} - # --- FLAGSCALE MODIFICATION BEG --- - self.negative_block_tables: Dict[NegativeSeqId, BlockTable] = {} - # --- FLAGSCALE MODIFICATION END --- + self.negative_block_tables: Dict[NegativeSeqId, BlockTable] = {} # --- FLAGSCALE MODIFICATION --- self._computed_blocks_tracker = ComputedBlocksTracker( - self.block_allocator) + self.block_allocator, self.block_size, self.enable_caching) self._last_access_blocks_tracker = LastAccessBlocksTracker( self.block_allocator) @@ -187,7 +183,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table # Track seq - self._computed_blocks_tracker.add_seq(seq.seq_id) self._last_access_blocks_tracker.add_seq(seq.seq_id) # Assign the block table for each sequence. @@ -195,7 +190,6 @@ def allocate(self, seq_group: SequenceGroup) -> None: self.block_tables[seq.seq_id] = block_table.fork() # Track seq - self._computed_blocks_tracker.add_seq(seq.seq_id) self._last_access_blocks_tracker.add_seq(seq.seq_id) # Allocate cross-attention block table for encoder sequence @@ -380,11 +374,13 @@ def get_common_computed_block_ids( """ computed_seq_block_ids = [] for seq in seqs: - computed_seq_block_ids.append( - self._computed_blocks_tracker. - get_cached_computed_blocks_and_update( - seq.seq_id, - self.block_tables[seq.seq_id].physical_block_ids)) + all_blocks = self.block_tables[seq.seq_id].physical_block_ids + num_cached_tokens = ( + self._computed_blocks_tracker.get_num_cached_tokens(seq)) + assert num_cached_tokens % self.block_size == 0 + num_cached_blocks = num_cached_tokens // self.block_size + computed_block_ids = all_blocks[:num_cached_blocks] + computed_seq_block_ids.append(computed_block_ids) # NOTE(sang): This assumes seq_block_ids doesn't contain any None. return self.block_allocator.get_common_computed_block_ids( @@ -398,7 +394,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.fork() # Track child seq - self._computed_blocks_tracker.add_seq(child_seq.seq_id) self._last_access_blocks_tracker.add_seq(child_seq.seq_id) def can_swap_in(self, seq_group: SequenceGroup, @@ -459,7 +454,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - seq_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap out. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -475,7 +470,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -525,7 +520,7 @@ def _can_swap(self, on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap in/out. device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. RUNNING for swap out and SWAPPED for swap in @@ -569,3 +564,9 @@ def _can_swap(self, return AllocStatus.OK else: return AllocStatus.LATER + + def get_num_cached_tokens(self, seq: Sequence) -> int: + """Get the number of tokens in blocks that are already computed and + cached in the block manager for the sequence. + """ + return self._computed_blocks_tracker.get_num_cached_tokens(seq) diff --git a/flagscale/inference/core/data.py b/flagscale/inference/core/data.py index 55b7240c0..5321d6a6e 100644 --- a/flagscale/inference/core/data.py +++ b/flagscale/inference/core/data.py @@ -1,11 +1,16 @@ # This file is modified from 'FlagScale/vllm/vllm/inputs/data.py' -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, +from dataclasses import dataclass +from functools import cached_property +from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) -from typing_extensions import NotRequired, TypedDict, TypeVar +import torch +from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict + from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict) + from vllm.multimodal.inputs import MultiModalInputsV2 class TextPrompt(TypedDict): @@ -40,15 +45,18 @@ class TokensPrompt(TypedDict): prompt_token_ids: List[int] """A list of token IDs to pass to the model.""" + token_type_ids: NotRequired[List[int]] + """A list of token type IDs to pass to the cross encoder model.""" + multi_modal_data: NotRequired["MultiModalDataDict"] """ - Optional multi-modal data to pass to the model, + DEPRECATED: Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - Optional multi-modal processor kwargs to be forwarded to the + DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. @@ -133,21 +141,39 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): class TokenInputs(TypedDict): """Represents token-based inputs.""" + + type: Literal["token"] + """The type of inputs.""" + prompt_token_ids: List[int] """The token IDs of the prompt.""" - prompt: NotRequired[Optional[str]] + token_type_ids: NotRequired[List[int]] + """The token type IDs of the prompt.""" + + prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. """ - multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. """ - mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]] + multi_modal_inputs: NotRequired["MultiModalKwargs"] + """ + Optional multi-modal inputs to pass to the model, + if the model supports it. + """ + + multi_modal_placeholders: NotRequired["MultiModalPlaceholderDict"] + """ + Placeholder ranges for the multi-modal data. + """ + + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -163,8 +189,11 @@ class TokenInputs(TypedDict): def token_inputs( prompt_token_ids: List[int], + token_type_ids: Optional[List[int]] = None, prompt: Optional[str] = None, multi_modal_data: Optional["MultiModalDataDict"] = None, + multi_modal_inputs: Optional["MultiModalKwargs"] = None, + multi_modal_placeholders: Optional["MultiModalPlaceholderDict"] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # --- FLAGSCALE MODIFICATION BEG --- negative_prompt_token_ids: Optional[List[int]] = None, @@ -173,31 +202,31 @@ def token_inputs( ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" # --- FLAGSCALE MODIFICATION BEG --- - inputs = TokenInputs(prompt_token_ids=prompt_token_ids, + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids, negative_prompt_token_ids=negative_prompt_token_ids) # --- FLAGSCALE MODIFICATION END --- if prompt is not None: inputs["prompt"] = prompt - if multi_modal_data is not None: - inputs["multi_modal_data"] = multi_modal_data - if mm_processor_kwargs is not None: - inputs["mm_processor_kwargs"] = mm_processor_kwargs # --- FLAGSCALE MODIFICATION BEG --- if negative_prompt is not None: inputs["negative_prompt"] = negative_prompt # --- FLAGSCALE MODIFICATION END --- + if token_type_ids is not None: + inputs["token_type_ids"] = token_type_ids + if multi_modal_data is not None: + inputs["multi_modal_data"] = multi_modal_data + if multi_modal_inputs is not None: + inputs["multi_modal_inputs"] = multi_modal_inputs + if multi_modal_placeholders is not None: + inputs["multi_modal_placeholders"] = multi_modal_placeholders + if mm_processor_kwargs is not None: + inputs["mm_processor_kwargs"] = mm_processor_kwargs return inputs -SingletonInputs = TokenInputs -""" -A processed :class:`SingletonPrompt` which can be passed to -:class:`vllm.sequence.Sequence`. -""" - -DecoderOnlyInputs = TokenInputs +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -205,28 +234,123 @@ def token_inputs( """ -class EncoderDecoderInputs(TokenInputs): +class EncoderDecoderInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. This specifies the required data for encoder-decoder models. """ - encoder_prompt_token_ids: List[int] - """The token IDs of the encoder prompt.""" + encoder: Union[TokenInputs, "MultiModalInputsV2"] + """The inputs for the encoder portion.""" + + decoder: Union[TokenInputs, "MultiModalInputsV2"] + """The inputs for the decoder portion.""" - encoder_prompt: NotRequired[Optional[str]] - """ - The original encoder prompt text corresponding to the token IDs, if - available. - """ - encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] +SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +""" +A processed :class:`SingletonPrompt` which can be passed to +:class:`vllm.sequence.Sequence`. +""" + + +@dataclass +class SingletonInputsAdapter: """ - Optional multi-modal data to pass to the encoder model, - if the model supports it. + Unified interface to access the components of :class:`SingletonInputs`. """ + inputs: SingletonInputs + + @cached_property + def prompt(self) -> Optional[str]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt") + + assert_never(inputs) + @cached_property + def prompt_token_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt_token_ids", []) + + assert_never(inputs) + + @cached_property + def token_type_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("token_type_ids", []) + + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return None + + assert_never(inputs) + + @cached_property + def multi_modal_data(self) -> "MultiModalDataDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_inputs", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_placeholders", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_placeholders", {}) + + assert_never(inputs) + + @cached_property + def mm_processor_kwargs(self) -> Dict[str, Any]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + if inputs["type"] == "multimodal": + return {} + + assert_never(inputs) + + +ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] +""" +The inputs to :data:`vllm.inputs.InputProcessor`. +""" _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) @@ -253,10 +377,11 @@ def zip_enc_dec_prompts( ) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs - may also be provided; if a dict is passed, the same dictionary will be - used for every encoder/decoder prompt. If an iterable is provided, it will - be zipped with the encoder/decoder prompts. + :class:`ExplicitEncoderDecoderPrompt` instances. + + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. """ if mm_processor_kwargs is None: mm_processor_kwargs = cast(Dict[str, Any], {}) @@ -282,34 +407,3 @@ def to_enc_dec_tuple_list( return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] - - -def __getattr__(name: str): - import warnings - - if name == "PromptInput": - msg = ("PromptInput has been renamed to PromptType. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return PromptType - - if name == "LLMInputs": - msg = ("LLMInputs has been renamed to DecoderOnlyInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return DecoderOnlyInputs - - if name == "EncoderDecoderLLMInputs": - msg = ( - "EncoderDecoderLLMInputs has been renamed to EncoderDecoderInputs. " - "The original name will be removed in an upcoming version.") - - warnings.warn(DeprecationWarning(msg), stacklevel=2) - - return EncoderDecoderInputs - - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/flagscale/inference/core/llm_engine.py b/flagscale/inference/core/llm_engine.py index a765f6cf5..53b6d4a99 100644 --- a/flagscale/inference/core/llm_engine.py +++ b/flagscale/inference/core/llm_engine.py @@ -1,4 +1,5 @@ # This file is modified from 'FlagScale/vllm/vllm/engine/llm_engine.py' +import copy import time from collections import Counter as collectionsCounter from collections import deque @@ -11,14 +12,12 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeVar +from typing_extensions import TypeVar, deprecated import vllm.envs as envs -from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, - EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) +from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, + ObservabilityConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, SchedulerOutputs) from vllm.engine.arg_utils import EngineArgs @@ -32,8 +31,9 @@ from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputRegistry, PromptType) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.logits_process import get_bad_words_logits_processors @@ -41,7 +41,8 @@ from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.outputs import (PoolingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -81,7 +82,7 @@ def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) -_O = TypeVar("_O", RequestOutput, EmbeddingRequestOutput) +_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) @dataclass @@ -113,7 +114,7 @@ class SchedulerContext: def __init__(self, multi_step_stream_outputs: bool = False): self.output_queue: Deque[OutputData] = deque() self.request_outputs: List[Union[RequestOutput, - EmbeddingRequestOutput]] = [] + PoolingRequestOutput]] = [] self.seq_group_metadata_list: Optional[ List[SequenceGroupMetadata]] = None self.scheduler_outputs: Optional[SchedulerOutputs] = None @@ -222,30 +223,35 @@ def validate_outputs( def __init__( self, - model_config: ModelConfig, - cache_config: CacheConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], - speculative_config: Optional[SpeculativeConfig], - decoding_config: Optional[DecodingConfig], - observability_config: Optional[ObservabilityConfig], - prompt_adapter_config: Optional[PromptAdapterConfig], + vllm_config: VllmConfig, executor_class: Type[ExecutorBase], log_stats: bool, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: + + self.model_config = vllm_config.model_config + self.cache_config = vllm_config.cache_config + self.lora_config = vllm_config.lora_config + self.parallel_config = vllm_config.parallel_config + self.scheduler_config = vllm_config.scheduler_config + self.device_config = vllm_config.device_config + self.speculative_config = vllm_config.speculative_config # noqa + self.load_config = vllm_config.load_config + self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + ) + self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa + self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa + ) + logger.info( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, " - "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " @@ -257,57 +263,46 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " - "chat_template_text_format=%s, mm_processor_kwargs=%s)", + "mm_processor_kwargs=%s, pooler_config=%r," + "compilation_config=%r", VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - load_config.load_format, - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, - scheduler_config.num_scheduler_steps, - scheduler_config.chunked_prefill_enabled, - scheduler_config.multi_step_stream_outputs, - cache_config.enable_prefix_caching, - model_config.use_async_output_proc, + self.model_config.model, + self.speculative_config, + self.model_config.tokenizer, + self.model_config.skip_tokenizer_init, + self.model_config.tokenizer_mode, + self.model_config.revision, + self.model_config.override_neuron_config, + self.model_config.tokenizer_revision, + self.model_config.trust_remote_code, + self.model_config.dtype, + self.model_config.max_model_len, + self.load_config.download_dir, + self.load_config.load_format, + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size, + self.parallel_config.disable_custom_all_reduce, + self.model_config.quantization, + self.model_config.enforce_eager, + self.cache_config.cache_dtype, + self.model_config.quantization_param_path, + self.device_config.device, + self.decoding_config, + self.observability_config, + self.model_config.seed, + self.model_config.served_model_name, + self.scheduler_config.num_scheduler_steps, + self.scheduler_config.chunked_prefill_enabled, + self.scheduler_config.multi_step_stream_outputs, + self.cache_config.enable_prefix_caching, + self.model_config.use_async_output_proc, use_cached_outputs, - model_config.chat_template_text_format, - model_config.mm_processor_kwargs, + self.model_config.mm_processor_kwargs, + self.model_config.pooler_config, + vllm_config.compilation_config, ) # TODO(woosuk): Print more configs in debug mode. - self.model_config = model_config - self.cache_config = cache_config - self.lora_config = lora_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.speculative_config = speculative_config - self.load_config = load_config - self.decoding_config = decoding_config or DecodingConfig() - self.prompt_adapter_config = prompt_adapter_config - self.observability_config = observability_config or ObservabilityConfig( - ) + self.log_stats = log_stats self.use_cached_outputs = use_cached_outputs @@ -329,27 +324,17 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: self.seq_counter = Counter() self.generation_config_fields = _load_generation_config_dict( - model_config) + self.model_config) - self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.input_preprocessor = InputPreprocessor(self.model_config, + self.tokenizer, + mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( - model_config) - - self.model_executor = executor_class( - model_config=model_config, - cache_config=cache_config, - parallel_config=parallel_config, - scheduler_config=scheduler_config, - device_config=device_config, - lora_config=lora_config, - speculative_config=speculative_config, - load_config=load_config, - prompt_adapter_config=prompt_adapter_config, - observability_config=self.observability_config, - ) + self.model_config) + + self.model_executor = executor_class(vllm_config=vllm_config, ) if self.model_config.task != "embedding": self._initialize_kv_caches() @@ -359,36 +344,36 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: from vllm.model_executor.model_loader import ( get_architecture_class_name) usage_message.report_usage( - get_architecture_class_name(model_config), + get_architecture_class_name(self.model_config), usage_context, extra_kvs={ # Common configuration "dtype": - str(model_config.dtype), + str(self.model_config.dtype), "tensor_parallel_size": - parallel_config.tensor_parallel_size, + self.parallel_config.tensor_parallel_size, "block_size": - cache_config.block_size, + self.cache_config.block_size, "gpu_memory_utilization": - cache_config.gpu_memory_utilization, + self.cache_config.gpu_memory_utilization, # Quantization "quantization": - model_config.quantization, + self.model_config.quantization, "kv_cache_dtype": - str(cache_config.cache_dtype), + str(self.cache_config.cache_dtype), # Feature flags "enable_lora": - bool(lora_config), + bool(self.lora_config), "enable_prompt_adapter": - bool(prompt_adapter_config), + bool(self.prompt_adapter_config), "enable_prefix_caching": - cache_config.enable_prefix_caching, + self.cache_config.enable_prefix_caching, "enforce_eager": - model_config.enforce_eager, + self.model_config.enforce_eager, "disable_custom_all_reduce": - parallel_config.disable_custom_all_reduce, + self.parallel_config.disable_custom_all_reduce, }) if self.tokenizer: @@ -407,7 +392,7 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: for _ in range(self.parallel_config.pipeline_parallel_size) ] - if model_config.use_async_output_proc: + if self.model_config.use_async_output_proc: process_model_outputs = weak_bind(self._process_model_outputs) self.async_callbacks = [ @@ -427,11 +412,11 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = [ Scheduler( - scheduler_config, cache_config, lora_config, - parallel_config.pipeline_parallel_size, + self.scheduler_config, self.cache_config, self.lora_config, + self.parallel_config.pipeline_parallel_size, self.async_callbacks[v_id] - if model_config.use_async_output_proc else None) - for v_id in range(parallel_config.pipeline_parallel_size) + if self.model_config.use_async_output_proc else None) + for v_id in range(self.parallel_config.pipeline_parallel_size) ] # Metric Logging. @@ -453,7 +438,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: "prometheus": PrometheusStatLogger( local_interval=_LOCAL_LOGGING_INTERVAL_SEC, - labels=dict(model_name=model_config.served_model_name), + labels=dict( + model_name=self.model_config.served_model_name), max_model_len=self.model_config.max_model_len), } self.stat_loggers["prometheus"].info("cache_config", @@ -506,7 +492,7 @@ def _initialize_kv_caches(self) -> None: @classmethod def _get_executor_cls(cls, - engine_config: EngineConfig) -> Type[ExecutorBase]: + engine_config: VllmConfig) -> Type[ExecutorBase]: distributed_executor_backend = ( engine_config.parallel_config.distributed_executor_backend) # Initialize the cluster and specify the executor class. @@ -533,6 +519,14 @@ def _get_executor_cls(cls, elif engine_config.device_config.device_type == "cpu": from vllm.executor.cpu_executor import CPUExecutor executor_class = CPUExecutor + elif engine_config.device_config.device_type == "hpu": + if distributed_executor_backend == "ray": + initialize_ray_cluster(engine_config.parallel_config) + from vllm.executor.ray_hpu_executor import RayHPUExecutor + executor_class = RayHPUExecutor + else: + from vllm.executor.hpu_executor import HPUExecutor + executor_class = HPUExecutor elif engine_config.device_config.device_type == "openvino": from vllm.executor.openvino_executor import OpenVINOExecutor executor_class = OpenVINOExecutor @@ -576,11 +570,11 @@ def from_engine_args( ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" # Create the engine configs. - engine_config = engine_args.create_engine_config() + engine_config = engine_args.create_engine_config(usage_context) executor_class = cls._get_executor_cls(engine_config) # Create the LLM engine. engine = cls( - **engine_config.to_dict(), + vllm_config=engine_config, executor_class=executor_class, log_stats=not engine_args.disable_log_stats, usage_context=usage_context, @@ -627,7 +621,7 @@ def _init_tokenizer(self) -> BaseTokenizerGroup: model_config=self.model_config, scheduler_config=self.scheduler_config, parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) + lora_config=self.lora_config) def _verify_args(self) -> None: self.model_config.verify_with_parallel_config(self.parallel_config) @@ -643,7 +637,7 @@ def _verify_args(self) -> None: def _add_processed_request( self, request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], + processed_inputs: ProcessorInputs, params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], @@ -668,38 +662,45 @@ def _add_processed_request( ) return None - self._validate_model_inputs(processed_inputs) + self._validate_model_inputs(processed_inputs, lora_request) # Create the sequences. block_size = self.cache_config.block_size seq_id = next(self.seq_counter) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id, - lora_request, prompt_adapter_request) + negative_decoder_inputs = None # --- FLAGSCALE MODIFICATION --- + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = processed_inputs["decoder"] + encoder_inputs = processed_inputs["encoder"] + # --- FLAGSCALE MODIFICATION BEG --- + elif "negative_prompt_token_ids" in processed_inputs \ + and processed_inputs["negative_prompt_token_ids"] is not None: + positive_inputs = processed_inputs.copy() + positive_inputs.pop("negative_prompt_token_ids") + negative_inputs = processed_inputs.copy() + negative_inputs["prompt_token_ids"] = negative_inputs["negative_prompt_token_ids"] + negative_inputs.pop("negative_prompt_token_ids") + decoder_inputs = positive_inputs + negative_decoder_inputs = negative_inputs + encoder_inputs = None + # --- FLAGSCALE MODIFICATION END --- + else: + decoder_inputs = processed_inputs + encoder_inputs = None - encoder_seq = None - if 'encoder_prompt_token_ids' in processed_inputs: - encoder_seq = Sequence(seq_id, - processed_inputs, - block_size, - eos_token_id, - lora_request, - prompt_adapter_request, - from_decoder_prompt=False) + seq = Sequence(seq_id, decoder_inputs, block_size, eos_token_id, + lora_request, prompt_adapter_request) # --- FLAGSCALE MODIFICATION BEG --- - negative_seq = None - if "negative_prompt_token_ids" in processed_inputs \ - and processed_inputs["negative_prompt_token_ids"]: - negative_seq = Sequence(seq_id, - processed_inputs, - block_size, - eos_token_id, - lora_request, - prompt_adapter_request, - from_negative_prompt=True) + negative_seq = (None if negative_decoder_inputs is None else Sequence( + seq_id, negative_decoder_inputs, block_size, eos_token_id, lora_request, + prompt_adapter_request)) # --- FLAGSCALE MODIFICATION END --- + encoder_seq = (None if encoder_inputs is None else Sequence( + seq_id, encoder_inputs, block_size, eos_token_id, lora_request, + prompt_adapter_request)) + # Create a SequenceGroup based on SamplingParams or PoolingParams if isinstance(params, SamplingParams): seq_group = self._create_sequence_group_with_sampling( @@ -740,7 +741,8 @@ def _add_processed_request( def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() - @overload # DEPRECATED + @overload + @deprecated("'inputs' will be renamed to 'prompt") def add_request( self, request_id: str, @@ -841,9 +843,21 @@ def add_request( raise ValueError(f"Got priority {priority} but " "Priority scheduling is not enabled.") + if isinstance(params, SamplingParams) \ + and (params.guided_decoding or params.logits_processors) \ + and self.scheduler_config.num_scheduler_steps > 1: + raise ValueError( + "Guided decoding and logits processors are not supported " + "in multi-step decoding") + if arrival_time is None: arrival_time = time.time() + if self.tokenizer is not None: + self._validate_token_prompt( + prompt, + tokenizer=self.get_tokenizer(lora_request=lora_request)) + preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -852,13 +866,6 @@ def add_request( ) processed_inputs = self.input_processor(preprocessed_inputs) - # This is a bit of a hack - copy the mm_processor_kwargs that were - # used in the input processor to the processed output, since these - # kwargs are presumed to be immutable and the values should be aligned - # between the input processor (here) and the input mapper. - processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( - "mm_processor_kwargs") - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -870,6 +877,27 @@ def add_request( priority=priority, ) + def _validate_token_prompt(self, prompt: PromptType, + tokenizer: AnyTokenizer): + # Guard against out-of-vocab tokens. + # For some tokenizers, tokenizer.decode will happily return empty text + # for token ids that are out of vocab, and we don't detect token ids + # that are greater than the max token id before running the model. + # However, these token ids will later crash a cuda kernel at runtime + # with an index out of bounds error. This will crash the entire engine. + # This needs to happen before multimodal input pre-processing, which + # may add dummy tokens that aren't part of the tokenizer's + # vocabulary. + if is_token_prompt(prompt): + prompt_ids = prompt["prompt_token_ids"] + if len(prompt_ids) == 0: + # Empty prompt check is handled later + return + max_input_id = max(prompt_ids) + if max_input_id > tokenizer.max_token_id: + raise ValueError( + "Token id {} is out of vocabulary".format(max_input_id)) + def _create_sequence_group_with_sampling( self, request_id: str, @@ -1020,9 +1048,9 @@ def _update_num_computed_tokens_for_multi_step_prefill( This function updates num_computed_tokens for prompt sequences when Multi-Step is enabled. - seq_group: SequenceGroup to update the num_computed_tokens for. + seq_group: SequenceGroup to update the num_computed_tokens for. seq_group_meta: Metadata of the given SequenceGroup. - is_first_step_output: Optional[bool] - + is_first_step_output: Optional[bool] - When available, is_first_step_output indicates if the appended output token is the output of the first-step in multi-step. A value of None indicates that outputs from all steps in @@ -1142,14 +1170,11 @@ def _process_model_outputs(self, else: seq_group.update_num_computed_tokens( seq_group_meta.token_chunk_size or 0) - - # --- FLAGSCALE MODIFICATION BEG --- - if seq_group.has_negative_seqs(): - assert self.scheduler_config.is_multi_step is False + # --- FLAGSCALE MODIFICATION BEG --- seq_group.update_negative_num_computed_tokens( - scheduled_seq_group.negative_token_chunk_size + scheduled_seq_group.negative_token_chunk_size or 0 ) - # --- FLAGSCALE MODIFICATION END --- + # --- FLAGSCALE MODIFICATION END --- if outputs: for o in outputs: @@ -1300,13 +1325,13 @@ def _advance_to_next_step( if seq_group_metadata.token_chunk_size is not None else 0) seq_group.update_num_computed_tokens(token_chunk_size) - - # --- FLAGSCALE MODIFICATION BEG --- - if seq_group.has_negative_seqs(): - seq_group.update_negative_num_computed_tokens( - scheduled_seq_group.negative_token_chunk_size - ) - # --- FLAGSCALE MODIFICATION END --- + # --- FLAGSCALE MODIFICATION BEG --- + if seq_group.has_negative_seqs(): + negative_token_chunk_size = (seq_group_metadata.negative_token_chunk_size + if seq_group_metadata.negative_token_chunk_size + is not None else 0) + seq_group.update_negative_num_computed_tokens(negative_token_chunk_size) + # --- FLAGSCALE MODIFICATION END --- if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( @@ -1325,15 +1350,13 @@ def _advance_to_next_step( seq_group.update_num_computed_tokens(1) else: seq.append_token_id(sample.output_token, sample.logprobs) + # --- FLAGSCALE MODIFICATION BEG --- + if seq_group.has_negative_seqs(): + negative_seq = seq_group.negative_seqs[0] + negative_seq.append_token_id(sample.output_token, sample.logprobs) + # --- FLAGSCALE MODIFICATION END --- - # --- FLAGSCALE MODIFICATION BEG --- - if seq_group.has_negative_seqs(): - assert self.scheduler_config.is_multi_step is False - negative_seq = seq_group.negative_seqs[0] - negative_seq.append_token_id(sample.output_token, sample.logprobs) - # --- FLAGSCALE MODIFICATION END --- - - def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: + def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. .. figure:: https://i.imgur.com/sv2HssD.png @@ -1417,6 +1440,9 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: ctx.seq_group_metadata_list = seq_group_metadata_list ctx.scheduler_outputs = scheduler_outputs + finished_requests_ids = self.scheduler[ + virtual_engine].get_and_reset_finished_requests_ids() + # Maybe switch from async mode to sync mode if not allow_async_output_proc and len(ctx.output_queue) > 0: self._process_model_outputs(ctx=ctx) @@ -1428,13 +1454,13 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: self._cache_scheduler_outputs_for_multi_step( virtual_engine, seq_group_metadata_list, scheduler_outputs, allow_async_output_proc) + else: + finished_requests_ids = list() assert seq_group_metadata_list is not None assert scheduler_outputs is not None if not scheduler_outputs.is_empty(): - finished_requests_ids = self.scheduler[ - virtual_engine].get_and_reset_finished_requests_ids() # Check if we have a cached last_output from the previous iteration. # For supporting PP this is probably the best way to pass the @@ -1550,8 +1576,8 @@ def _has_remaining_steps( seq_group.state.remaining_steps != ref_remaining_steps for seq_group in seq_group_metadata_list[1:] ]): - raise AssertionError(("All running sequence groups should " - "have the same remaining steps.")) + raise AssertionError("All running sequence groups should " + "have the same remaining steps.") return ref_remaining_steps > 0 @@ -1676,6 +1702,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 + num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else @@ -1684,10 +1711,19 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] + time_in_queue_requests: List[float] = [] + model_forward_time_requests: List[float] = [] + model_execute_time_requests: List[float] = [] # Metadata num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] n_requests: List[int] = [] + max_num_generation_tokens_requests: List[int] = [] + max_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] # Lora requests @@ -1716,7 +1752,7 @@ def _get_stats(self, # not counted (to avoid double counting) actual_num_batched_tokens = scheduler_outputs.num_batched_tokens # type: ignore - num_generation_tokens_from_prefill_groups = 0. + num_generation_tokens_from_prefill_groups = 0 # NOTE: if scheduler_outputs.num_prefill_groups > 0 and # the len of scheduler_outputs.scheduled_seq_groups is != # scheduler_outputs.num_prefill_groups, this means that @@ -1777,6 +1813,27 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + if (seq_group.metrics.first_scheduled_time is not None and + seq_group.metrics.first_token_time is not None): + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) + time_decode_requests.append( + now - seq_group.metrics.first_token_time) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) + if seq_group.metrics.time_in_queue is not None: + time_in_queue_requests.append( + seq_group.metrics.time_in_queue) + if seq_group.metrics.model_forward_time is not None: + model_forward_time_requests.append( + seq_group.metrics.model_forward_time) + if seq_group.metrics.model_execute_time is not None: + model_execute_time_requests.append( + seq_group.metrics.model_execute_time * 1000) # Metadata num_prompt_tokens_requests.append( len(seq_group.prompt_token_ids)) @@ -1784,8 +1841,13 @@ def _get_stats(self, seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) + max_num_generation_tokens_requests.append( + max(seq.get_output_len() + for seq in seq_group.get_seqs())) if seq_group.sampling_params is not None: n_requests.append(seq_group.sampling_params.n) + max_tokens_requests.append( + seq_group.sampling_params.max_tokens) finished_reason_requests.extend([ SequenceStatus.get_finished_reason(seq.status) for seq in seq_group.get_finished_seqs() @@ -1800,7 +1862,8 @@ def _get_stats(self, num_generation_tokens_iter = ( actual_num_batched_tokens - num_prompt_tokens_iter + num_generation_tokens_from_prefill_groups) - + num_tokens_iter = (num_generation_tokens_iter + + num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. if model_output and (model_output[0].spec_decode_worker_metrics @@ -1826,6 +1889,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter=num_prompt_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter, + num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, @@ -1834,10 +1898,20 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_queue_requests=time_queue_requests, + time_inference_requests=time_inference_requests, + time_prefill_requests=time_prefill_requests, + time_decode_requests=time_decode_requests, + time_in_queue_requests=time_in_queue_requests, + model_forward_time_requests=model_forward_time_requests, + model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, n_requests=n_requests, + max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, max_lora=str(max_lora_stat), waiting_lora_adapters=list(waiting_lora_adapters.keys()), @@ -1962,19 +2036,17 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) - def is_encoder_decoder_model(self): - return self.input_preprocessor.is_encoder_decoder_model() - - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): - if self.model_config.is_multimodal_model: + def _validate_model_inputs(self, inputs: ProcessorInputs, + lora_request: Optional[LoRARequest]): + if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length - prompt_ids = inputs.get("prompt_token_ids") - elif self.is_encoder_decoder_model(): - prompt_ids = inputs.get("encoder_prompt_token_ids") + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] else: - prompt_ids = inputs.get("prompt_token_ids") + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") @@ -2005,7 +2077,11 @@ def _build_logits_processors( logits_processors = [] - if (guided_decoding := sampling_params.guided_decoding) is not None: + if sampling_params.guided_decoding is not None: + # Defensively copy sampling params since guided decoding logits + # processors can have different state for each request + sampling_params = copy.copy(sampling_params) + guided_decoding = sampling_params.guided_decoding logger.debug( "Building guided decoding logits processor in " @@ -2016,7 +2092,9 @@ def _build_logits_processors( self.decoding_config.guided_decoding_backend processor = get_local_guided_decoding_logits_processor( - guided_params=guided_decoding, tokenizer=tokenizer) + guided_params=guided_decoding, + tokenizer=tokenizer, + model_config=self.model_config) if processor: logits_processors.append(processor) diff --git a/flagscale/inference/core/logits_processor.py b/flagscale/inference/core/logits_processor.py index 7ce2105fb..1964197e2 100644 --- a/flagscale/inference/core/logits_processor.py +++ b/flagscale/inference/core/logits_processor.py @@ -112,8 +112,14 @@ def _prune_hidden_states( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> torch.Tensor: - return hidden_states.index_select(0, - sampling_metadata.selected_token_indices) + # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios + # (warmup, profile_run) we might not have selected_token_indices, + # so we skip pruning. + if sampling_metadata.selected_token_indices is not None: + return hidden_states.index_select( + 0, sampling_metadata.selected_token_indices) + else: + return hidden_states def _apply_logits_processors( @@ -152,10 +158,9 @@ def _apply_logits_processors( else: logits_row = logits[2 * logits_row_idx] logits_row_idx *= 2 + # --- FLAGSCALE MODIFICATION END --- else: logits_row = logits[logits_row_idx] - # --- FLAGSCALE MODIFICATION END --- - past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids diff --git a/flagscale/inference/core/model_runner.py b/flagscale/inference/core/model_runner.py index cc27dc6dc..49e1d1c66 100644 --- a/flagscale/inference/core/model_runner.py +++ b/flagscale/inference/core/model_runner.py @@ -19,13 +19,9 @@ from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState -from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.levels import CompilationLevel -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) +from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group +from vllm.distributed import get_kv_transfer_group, get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry @@ -41,7 +37,8 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalRegistry) + MultiModalKwargs, MultiModalPlaceholderMap, + MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest @@ -49,10 +46,10 @@ LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.transformers_utils.config import uses_mrope -from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, - flatten_2d_lists, is_pin_memory_available, - supports_dynamo, weak_ref_tensor) +from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, + async_tensor_h2d, flatten_2d_lists, + is_pin_memory_available, supports_dynamo, + weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -66,16 +63,7 @@ logger = init_logger(__name__) LORA_WARMUP_RANK = 8 -_BATCH_SIZE_ALIGNMENT = 8 -# all the token sizes that **can** be captured by cudagraph. -# they can be arbitrarily large. -# currently it includes: 1, 2, 4, 8, 16, 24, 32, 40, ..., 8192. -# the actual sizes to capture will be determined by the model, -# depending on the model's max_num_seqs. -# NOTE: _get_graph_batch_size needs to be updated if this list is changed. -_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [ - _BATCH_SIZE_ALIGNMENT * i for i in range(1, 1025) -] + _NUM_WARMUP_ITERS = 2 TModelInputForGPU = TypeVar('TModelInputForGPU', bound="ModelInputForGPU") @@ -95,6 +83,7 @@ class ModelInputForGPU(ModelRunnerInputBase): """ input_tokens: Optional[torch.Tensor] = None input_positions: Optional[torch.Tensor] = None + token_types: Optional[torch.Tensor] = None seq_lens: Optional[List[int]] = None query_lens: Optional[List[int]] = None lora_mapping: Optional["LoRAMapping"] = None @@ -137,6 +126,18 @@ def from_broadcasted_tensor_dict( attn_backend, tensor_dict) return cls(**tensor_dict) + # Exclude `async_callback` to be able to pickle this object + def __getstate__(self): + state = self.__dict__.copy() + del state["async_callback"] + return state + + # TODO: What happens when we depickle this object? + # How can we update this callback to properly pass it to the engine? + def __setstate__(self, state): + self.__dict__.update(state) + self.__dict__.update({'async_callback': None}) + @dataclass(frozen=True) class ModelInputForGPUWithSamplingMetadata(ModelInputForGPU): @@ -191,6 +192,7 @@ class InterDataForSeqGroup: def simple_reinit(self): self.input_tokens[0].clear() # type: ignore self.input_positions[0].clear() # type: ignore + self.token_types[0].clear() # type: ignore self.mrope_input_positions = None # type: ignore self.seq_lens[0] = 0 # type: ignore self.orig_seq_lens[0] = 0 # type: ignore @@ -217,6 +219,7 @@ def __init__( # Input tokens and positions. input_tokens: Optional[List[List[int]]] = None, input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, mrope_input_positions: Optional[List[List[List[int]]]] = None, # The sequence length (may be capped to the sliding window). @@ -242,7 +245,9 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, # Whether the prefix cache is hit (prefill only). prefix_cache_hit: bool = False, @@ -280,6 +285,12 @@ def __init__( for seq_id in range(len(self.seq_ids)): self.input_positions[seq_id].clear() + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + self.mrope_input_positions = None if seq_lens: @@ -343,6 +354,7 @@ def __init__( else: self.input_tokens = input_tokens or [] self.input_positions = input_positions or [] + self.token_types = token_types or [] self.mrope_input_positions = mrope_input_positions or None self.seq_lens = seq_lens or [] self.orig_seq_lens = orig_seq_lens or [] @@ -361,7 +373,8 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_modal_inputs = multi_modal_inputs + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit self.n_seqs = len(self.seq_ids) @@ -374,6 +387,7 @@ def __post_init__(self): self.input_tokens = [[] for _ in range(self.n_seqs)] self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] self.mrope_input_positions = None self.seq_lens = [0] * self.n_seqs self.orig_seq_lens = [0] * self.n_seqs @@ -472,10 +486,10 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, if is_negative: seq_data = seq_group_metadata.negative_seq_data[inter_data.seq_ids[seq_idx]] token_chunk_size = seq_group_metadata.negative_token_chunk_size + # --- FLAGSCALE MODIFICATION END --- else: seq_data = seq_group_metadata.seq_data[inter_data.seq_ids[seq_idx]] token_chunk_size = seq_group_metadata.token_chunk_size - # --- FLAGSCALE MODIFICATION END --- # Compute context length (the number of tokens that are # already computed) and sequence length (total number of tokens). @@ -485,19 +499,22 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder_model: + self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() # Compute tokens. tokens = seq_data.get_token_ids()[context_len:seq_len] + token_types = seq_group_metadata.token_type_ids inter_data.seq_lens[seq_idx] = seq_len inter_data.orig_seq_lens[seq_idx] = seq_len inter_data.context_lens[seq_idx] = context_len inter_data.input_tokens[seq_idx].extend(tokens) inter_data.input_positions[seq_idx].extend(range(context_len, seq_len)) + inter_data.token_types[seq_idx].extend( + token_types if token_types else []) inter_data.query_lens[seq_idx] = seq_len - context_len if seq_data.mrope_position_delta is not None: @@ -535,6 +552,9 @@ def _compute_for_prefix_cache_hit( # this may be larger than the sequence length if chunked # prefill is enabled. prefix_cache_len = len(computed_block_nums) * self.block_size + seq_group_metadata.seq_data[inter_data.seq_ids[ + seq_idx]].update_num_cached_tokens(prefix_cache_len) + # The number of so far computed prompt tokens in this sequence. context_len = inter_data.context_lens[seq_idx] # The total number of prompt tokens in this sequence. @@ -552,6 +572,8 @@ def _compute_for_prefix_cache_hit( seq_idx][uncomputed_start:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][uncomputed_start:] + inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ + uncomputed_start:] context_len = prefix_cache_len inter_data.context_lens[seq_idx] = context_len @@ -566,6 +588,8 @@ def _compute_for_prefix_cache_hit( seq_idx][-1:] inter_data.input_positions[seq_idx] = inter_data.input_positions[ seq_idx][-1:] + inter_data.token_types[seq_idx] = inter_data.token_types[seq_idx][ + -1:] inter_data.query_lens[seq_idx] = 1 inter_data.context_lens[seq_idx] = inter_data.seq_lens[seq_idx] - 1 @@ -615,7 +639,8 @@ def _compute_lora_input(self, inter_data: InterDataForSeqGroup, def _compute_prompt_adapter_input( self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): + seq_group_metadata: SequenceGroupMetadata, + is_negative: bool = False): # --- FLAGSCALE MODIFICATION --- """If prompt adapter is enabled, compute index and prompt mapping. """ # Note that when is_prompt=True, we expect only one sequence @@ -642,19 +667,31 @@ def _compute_prompt_adapter_input( and seq_group_metadata.sampling_params.prompt_logprobs else 1) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, - seq_group_metadata: SequenceGroupMetadata): + seq_group_metadata: SequenceGroupMetadata, + is_negative: bool = False): # --- FLAGSCALE MODIFICATION --- """If multi-modal data is given, add it to the input.""" - mm_data = seq_group_metadata.multi_modal_data + # NOTE: mm_data only includes the subset of multi-modal items that + # intersect with the current prefill positions. + positions = inter_data.input_positions[0] + mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( + seq_group_metadata, + range(positions[0], positions[0] + len(positions))) if not mm_data: return - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_modal_inputs = mm_kwargs + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + inter_data.multi_modal_kwargs = mm_kwargs + inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. - if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -681,6 +718,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, spatial_merge_size=hf_config.vision_config. spatial_merge_size, context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], ) seq_data.mrope_position_delta = mrope_position_delta @@ -699,7 +737,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len = 0 - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() inter_data = self.init_cached_inter_data( @@ -729,8 +767,8 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): block_tables=seq_group_metadata.negative_block_tables, computed_block_nums=[], # for prefix caching. reinit=True, - reinit_use_defaults=True - ) + reinit_use_defaults=True, + encoder_seq_len=encoder_seq_len) self.inter_data_list.append(negative_inter_data) for seq_idx in range(n_seqs): @@ -746,7 +784,6 @@ def _use_captured_graph(self, max_decode_seq_len: int, max_encoder_seq_len: int = 0) -> bool: return (decode_only and not self.runner.model_config.enforce_eager - and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1] and max_decode_seq_len <= self.runner.max_seq_len_to_capture and max_encoder_seq_len <= self.runner.max_seq_len_to_capture and batch_size <= self.runner.max_batchsize_to_capture) @@ -794,7 +831,7 @@ def _get_cuda_graph_pad_size(self, max_encoder_seq_len): return -1 - graph_batch_size = _get_graph_batch_size(batch_size) + graph_batch_size = VllmConfig.get_graph_batch_size(batch_size) assert graph_batch_size >= batch_size return graph_batch_size - batch_size @@ -804,9 +841,12 @@ def build(self) -> ModelInputForGPU: """ # Combine and flatten intermediate data. input_tokens = [] + token_types = [] for inter_data in self.inter_data_list: for cur_input_tokens in inter_data.input_tokens: input_tokens.extend(cur_input_tokens) + for cur_token_types in inter_data.token_types: + token_types.extend(cur_token_types) if not input_tokens: # This may happen when all prefill requests hit @@ -845,7 +885,7 @@ def build(self) -> ModelInputForGPU: if not inter_data.is_prompt: max_decode_seq_len = max(max_decode_seq_len, max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: max_encoder_seq_len = max(max_encoder_seq_len, inter_data.encoder_seq_len) @@ -875,6 +915,12 @@ def build(self) -> ModelInputForGPU: input_tokens_tensor = async_tensor_h2d(input_tokens, torch.long, self.runner.device, self.runner.pin_memory) + + token_types_tensor = async_tensor_h2d(token_types, torch.long, + self.runner.device, + self.runner.pin_memory) \ + if token_types else None + if mrope_input_positions is not None: for idx in range(3): mrope_input_positions[idx].extend( @@ -944,15 +990,16 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. - multi_modal_inputs_list = [ - data.multi_modal_inputs for data in self.inter_data_list - if data.multi_modal_inputs is not None + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None ] - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, input_positions=input_positions_tensor, + token_types=token_types_tensor, attn_metadata=attn_metadata, seq_lens=seq_lens, query_lens=query_lens, @@ -974,32 +1021,20 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): def __init__( self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - lora_config: Optional[LoRAConfig], + vllm_config: VllmConfig, kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, - observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): - self.model_config = model_config - self.parallel_config = parallel_config - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.lora_config = lora_config - self.load_config = load_config + + ModelRunnerBase.__init__(self, vllm_config) + model_config = self.model_config + cache_config = self.cache_config + self.is_driver_worker = is_driver_worker - self.prompt_adapter_config = prompt_adapter_config self.return_hidden_states = return_hidden_states - self.observability_config = observability_config self.device = self.device_config.device self.pin_memory = is_pin_memory_available() @@ -1008,7 +1043,7 @@ def __init__( self.sliding_window = model_config.get_sliding_window() self.block_size = cache_config.block_size self.max_seq_len_to_capture = self.model_config.max_seq_len_to_capture - self.max_batchsize_to_capture = _get_max_graph_batch_size( + self.max_batchsize_to_capture = VllmConfig.get_max_graph_batch_size( self.scheduler_config.max_num_seqs) self.graph_runners: List[Dict[int, CUDAGraphRunner]] = [ @@ -1024,7 +1059,7 @@ def __init__( # Python can be expensive. To optimize this, we cache the block table # in numpy and only copy the actual input content at every iteration. # The shape of the cached block table will be - # (max batch size to capture, max context len to capture / block size). + # (max batch size to capture, max seq len to capture / block size). self.graph_block_tables = np.zeros( (self.max_batchsize_to_capture, self.get_max_block_per_batch()), dtype=np.int32) @@ -1084,13 +1119,7 @@ def __init__( def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: - self.model = get_model(model_config=self.model_config, - device_config=self.device_config, - load_config=self.load_config, - lora_config=self.lora_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config, - cache_config=self.cache_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", @@ -1160,10 +1189,9 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") - if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \ - and supports_dynamo(): - from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() or "eager" + if self.vllm_config.compilation_config.level ==\ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): + backend = self.vllm_config.compilation_config.init_backend() self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, @@ -1284,7 +1312,7 @@ def profile_run(self) -> None: (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, dummy_multi_modal_data = self.input_registry \ + dummy_data = self.input_registry \ .dummy_data_for_profiling(self.model_config, seq_len, self.mm_registry) @@ -1292,12 +1320,13 @@ def profile_run(self) -> None: seq = SequenceGroupMetadata( request_id=str(group_id), is_prompt=True, - seq_data={group_id: seq_data}, + seq_data={group_id: dummy_data.seq_data}, sampling_params=sampling_params, block_tables=None, lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_multi_modal_data, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders, ) seqs.append(seq) @@ -1324,14 +1353,7 @@ def profile_run(self) -> None: dtype=self.model_config.dtype, device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - if self.model_config.enforce_eager: - batch_size_capture_list = [] - with set_compile_context(batch_size_capture_list): - self.execute_model(model_input, kv_caches, intermediate_tensors) + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return @@ -1400,12 +1422,6 @@ def list_prompt_adapters(self) -> Set[int]: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1421,22 +1437,22 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: per sequence in the batch. """ assert not self.model_config.enforce_eager - logger.info("Capturing the model for CUDA graphs. This may lead to " + logger.info("Capturing cudagraphs for decoding. This may lead to " "unexpected consequences if the model is not static. To " "run the model in eager mode, set 'enforce_eager=True' or " "use '--enforce-eager' in the CLI.") - logger.info("CUDA graphs can take additional 1~3 GiB memory per GPU. " - "If you are running out of memory, consider decreasing " - "`gpu_memory_utilization` or enforcing eager mode. " - "You can also reduce the `max_num_seqs` as needed " - "to decrease memory usage.") + logger.info("If out-of-memory error occurs during cudagraph capture," + " consider decreasing `gpu_memory_utilization` or " + "switching to eager mode. You can also reduce the " + "`max_num_seqs` as needed to decrease memory usage.") start_time = time.perf_counter() + start_free_gpu_memory = torch.cuda.mem_get_info()[0] # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() - if self.model_is_mrope: + if self.model_config.uses_mrope: input_positions = torch.tile(input_positions, (3, 1)) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. @@ -1456,23 +1472,19 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: dtype=self.model_config.dtype, device=self.device) - graph_batch_size = self.max_batchsize_to_capture - batch_size_capture_list = [ - bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size - ] - with self.attn_state.graph_capture( max_batch_size), graph_capture() as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. for virtual_engine in range( self.parallel_config.pipeline_parallel_size): - for batch_size in reversed(batch_size_capture_list): + for batch_size in \ + self.vllm_config.compilation_config.capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, is_encoder_decoder_model=self.model_config. - is_encoder_decoder_model)) + is_encoder_decoder)) if self.lora_config: lora_mapping = LoRAMapping( @@ -1491,7 +1503,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder_model) + self.model_config.is_encoder_decoder) capture_inputs = { "input_ids": @@ -1522,22 +1534,25 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model.get_seqlen_agnostic_capture_inputs( batch_size) }) - if self.model_config.is_encoder_decoder_model: + if self.model_config.is_encoder_decoder: # add the additional inputs to capture for # encoder-decoder models. self._update_inputs_to_capture_for_enc_dec_model( capture_inputs) - with set_forward_context(attn_metadata): + with set_forward_context(attn_metadata, self.vllm_config): graph_runner.capture(**capture_inputs) self.graph_memory_pool = graph_runner.graph.pool() self.graph_runners[virtual_engine][batch_size] = ( graph_runner) end_time = time.perf_counter() + end_free_gpu_memory = torch.cuda.mem_get_info()[0] elapsed_time = end_time - start_time + cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory # This usually takes < 10 seconds. - logger.info("Graph capturing finished in %.0f secs.", elapsed_time) + logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", + elapsed_time, cuda_graph_size / GiB_bytes) def _update_inputs_to_capture_for_enc_dec_model(self, capture_inputs: Dict[str, @@ -1660,6 +1675,24 @@ def execute_model( else: model_executable = self.model + # Receive KV cache in distributed KV cache transfer setting + # In disagg prefill setting, it will also recv hidden states and bypass + # model forwarding + # In KV cache database setting, it will change the model input so that + # we can skip prefilling on tokens that successfully received KV caches + # NOTE: The receive operation is blocking + bypass_model_exec = False + if self.need_recv_kv(model_input, kv_caches): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + get_kv_transfer_group().recv_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. + model_executable, + model_input, + kv_caches=kv_caches + ) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, @@ -1671,21 +1704,36 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - with set_forward_context(model_input.attn_metadata): - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + if not bypass_model_exec: + with set_forward_context(model_input.attn_metadata, + self.vllm_config): + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() + # Sending KV cache in distributed KV cache transfer setting + # NOTE: the send operation is non-blocking + if self.need_send_kv(model_input, kv_caches): + get_kv_transfer_group().send_kv_caches_and_hidden_states( + # model_executable is used to know which layer the current + # worker is working on, so that we can send KV for only those + # layers. + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) + # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: if (self.is_driver_worker @@ -1753,6 +1801,56 @@ def execute_model( return [output] + def need_recv_kv(self, model_input, kv_caches) -> bool: + """Check if we need to receive kv-cache from the other worker. + We need to receive KV when + 1. current vLLM instance is KV cache consumer/decode vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_consumer and ( + not is_profile_run) and is_prefill_run + + def need_send_kv(self, model_input, kv_caches) -> bool: + """Check if we need to send kv-cache to the other worker. + We need to send KV when + 1. current vLLM instance is KV cache producer/prefill vLLM instance + 2. this batch is not a profiling run + 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches[0].numel() == 0) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + if self.vllm_config.kv_transfer_config is None: + return False + + return self.vllm_config.kv_transfer_config.is_kv_producer and ( + not is_profile_run) and is_prefill_run + # NOTE: this is nn.Module so the profiler can properly capture/group # kernels calls made within the graph @@ -1791,7 +1889,7 @@ def capture( # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the # kernel launches for initial benchmarking (e.g., Triton autotune). - # Note one iteration is not enough for torch.jit.script + # Note one iteration is not enough for torch.compile for _ in range(_NUM_WARMUP_ITERS): self.model( input_ids=input_ids, @@ -1904,37 +2002,3 @@ def forward( return self.output_buffers["hidden_states"] return self.output_buffers - - -def _get_graph_batch_size(batch_size: int) -> int: - """Returns the padded batch size given actual batch size. - - Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT, - 2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT... - """ - if batch_size <= 2: - return batch_size - elif batch_size <= 4: - return 4 - else: - return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) // - _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT) - - -def _get_max_graph_batch_size(max_num_seqs: int) -> int: - """ - max_num_seqs: Maximum number of sequences in a batch. - _BATCH_SIZES_TO_CAPTURE: all the sizes that we want to capture. - - pad the max_num_seqs if necessary by calling _get_graph_batch_size, - which will deal with some edge cases like 1, 2, 4. - - if the padded size is in _BATCH_SIZES_TO_CAPTURE, return the padded size. - if not, it means the padded size is larger than the largest size in - _BATCH_SIZES_TO_CAPTURE, return the largest size in _BATCH_SIZES_TO_CAPTURE. - """ - padded_size = _get_graph_batch_size(max_num_seqs) - if padded_size in _BATCH_SIZES_TO_CAPTURE: - return padded_size - assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] diff --git a/flagscale/inference/core/preprocess.py b/flagscale/inference/core/preprocess.py index c91d70424..1f93a2920 100644 --- a/flagscale/inference/core/preprocess.py +++ b/flagscale/inference/core/preprocess.py @@ -1,37 +1,26 @@ # This file is modified from 'FlagScale/vllm/vllm/inputs/preprocess.py' import asyncio -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import List, Mapping, Optional, Union from typing_extensions import assert_never from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.utils import print_warning_once +from vllm.utils import print_info_once, print_warning_once # --- FLAGSCALE MODIFICATION BEG --- -from vllm.inputs.data import (DecoderOnlyInputs, EncoderDecoderInputs, PromptType, - SingletonPrompt) +from vllm.inputs.data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, + PromptType, SingletonInputs, SingletonPrompt, token_inputs) from vllm.inputs.parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt # --- FLAGSCALE MODIFICATION END --- -if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict - logger = init_logger(__name__) -# --- FLAGSCALE MODIFICATION BEG --- -PromptComponents = Tuple[Optional[str], List[int], - Optional["MultiModalDataDict"], - Optional[Dict[str, Any]], - Optional[None], Optional[None]] -DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]], - Optional["MultiModalDataDict"], - Optional[Dict[str, Any]]] -# --- FLAGSCALE MODIFICATION END --- - class InputPreprocessor: @@ -39,11 +28,13 @@ def __init__( self, model_config: ModelConfig, tokenizer: Optional[BaseTokenizerGroup], + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer + self.mm_registry = mm_registry def get_tokenizer_group(self) -> BaseTokenizerGroup: if self.tokenizer is None: @@ -79,7 +70,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: model config is unavailable. ''' - if not self.is_encoder_decoder_model(): + if not self.model_config.is_encoder_decoder: print_warning_once("Using None for decoder start token id because " "this is not an encoder/decoder model.") return None @@ -121,7 +112,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: "default" decoder prompt be . However, it is possible that in the future - other models may have different or more + other models may have different or more complex logic for the default decoder prompt. This motivates having a special helper method for default decoder prompts. @@ -138,7 +129,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: def _prepare_decoder_input_ids_for_generation( self, decoder_input_ids: Optional[List[int]], - force_bos: bool = True, ) -> List[int]: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. @@ -168,8 +158,8 @@ def _prepare_decoder_input_ids_for_generation( # use decoder_start_token_id as decoder_input_ids decoder_input_ids = self._get_default_enc_dec_decoder_prompt() - if force_bos and (len(decoder_input_ids) == 0 - or decoder_input_ids[0] != decoder_start_token_id): + if (len(decoder_input_ids) == 0 + or decoder_input_ids[0] != decoder_start_token_id): decoder_input_ids = [decoder_start_token_id] + decoder_input_ids return decoder_input_ids @@ -215,14 +205,79 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) - def _extract_prompt_components( + def _can_process_multimodal(self) -> bool: + model_config = self.model_config + + if not model_config.is_multimodal_model: + raise ValueError("Your model does not support multi-modal inputs") + + # Interim measure so we can handle models that have yet to be + # updated to use the new multi-modal processor + can_process_multimodal = self.mm_registry.has_processor(model_config) + if not can_process_multimodal: + print_info_once( + "Your model uses the legacy input pipeline instead of the new " + "multi-modal processor. Please note that the legacy pipeline " + "will be removed in a future release. For more details, see: " + "https://github.com/vllm-project/vllm/issues/10114") + + return can_process_multimodal + + def _process_multimodal( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """ + Apply the model's multi-modal processor to a multi-modal prompt, + returning the corresponding token IDs and metadata. + """ + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + async def _process_multimodal_async( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """Async version of :meth:`_process_multimodal`.""" + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request + ) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + if isinstance(prompt, list): + logger.warning("Passing `multi_modal_data` in TokensPrompt is" + "deprecated and will be removed in a future update") + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: - ''' - Extract the components of any single encoder or decoder input prompt. + ) -> SingletonInputs: + """ + Extract the singleton inputs from a prompt. Arguments: @@ -232,12 +287,8 @@ def _extract_prompt_components( Returns: - * prompt - * prompt_token_ids - * multi_modal_data - * mm_processor_kwargs (request-level input processor/mapper overrides) - ''' - + * :class:`SingletonInputs` instance + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": @@ -247,29 +298,60 @@ def _extract_prompt_components( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - negative_prompt_text = negative_prompt_token_ids = None # --- FLAGSCALE MODIFICATION --- - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - # --- FLAGSCALE MODIFICATION BEG --- - negative_prompt_text = None - negative_prompt_token_ids = parsed["content"].get("negative_prompt_token_ids") - # --- FLAGSCALE MODIFICATION END --- - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + token_type_ids = tokens_content.get("token_type_ids") + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + negative_prompt_token_ids = tokens_content.get("negative_prompt_token_ids") # --- FLAGSCALE MODIFICATION --- + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + + return token_inputs( + prompt_token_ids=prompt_token_ids, + token_type_ids=token_type_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + negative_prompt_token_ids=negative_prompt_token_ids, # --- FLAGSCALE MODIFICATION --- + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + negative_prompt_text = tokens_content.get("negative_prompt") # --- FLAGSCALE MODIFICATION --- + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + # --- FLAGSCALE MODIFICATION BEG --- - negative_prompt_text = parsed["content"].get("negative_prompt") negative_prompt_token_ids = None if negative_prompt_text: negative_prompt_token_ids = self._tokenize_prompt( @@ -278,19 +360,26 @@ def _extract_prompt_components( lora_request=lora_request, ) # --- FLAGSCALE MODIFICATION END --- - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs, - negative_prompt_text, negative_prompt_token_ids) # --- FLAGSCALE MODIFICATION --- + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + # --- FLAGSCALE MODIFICATION BEG --- + negative_prompt_text=negative_prompt_text, + negative_prompt_token_ids=negative_prompt_token_ids, + # --- FLAGSCALE MODIFICATION END --- + ) + + assert_never(parsed) - async def _extract_prompt_components_async( + async def _prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -301,29 +390,58 @@ async def _extract_prompt_components_async( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - negative_prompt_text = negative_prompt_token_ids = None # --- FLAGSCALE MODIFICATION --- - elif parsed["type"] == "tokens": - prompt_text = None - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - # --- FLAGSCALE MODIFICATION BEG --- - negative_prompt_text = None - negative_prompt_token_ids = parsed["content"].get("negative_prompt_token_ids") - # --- FLAGSCALE MODIFICATION END --- - elif parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": + tokens_content = parsed["content"] + + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + negative_prompt_token_ids = tokens_content.get("negative_prompt_token_ids") # --- FLAGSCALE MODIFICATION --- + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + negative_prompt_token_ids=negative_prompt_token_ids, # --- FLAGSCALE MODIFICATION --- + ) + + if parsed["type"] == "text": + text_content = parsed["content"] + + prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + negative_prompt_text = tokens_content.get("negative_prompt") # --- FLAGSCALE MODIFICATION --- + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + # --- FLAGSCALE MODIFICATION BEG --- - negative_prompt_text = parsed["content"].get("negative_prompt") negative_prompt_token_ids = None if negative_prompt_text: negative_prompt_token_ids = await self._tokenize_prompt_async( @@ -332,44 +450,50 @@ async def _extract_prompt_components_async( lora_request=lora_request, ) # --- FLAGSCALE MODIFICATION END --- - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs, - negative_prompt_text, negative_prompt_token_ids) # --- FLAGSCALE MODIFICATION --- + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + # --- FLAGSCALE MODIFICATION BEG --- + negative_prompt_text=negative_prompt_text, + negative_prompt_token_ids=negative_prompt_token_ids + # --- FLAGSCALE MODIFICATION END --- + ) + + assert_never(parsed) def _build_enc_dec_llm_inputs( self, - encoder_comps: PromptComponents, - decoder_comps: DecoderPromptComponents, - mm_processor_kwargs: Dict[str, Any], + encoder_inputs: SingletonInputs, + decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps - - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if decoder_mm_data is not None: - raise ValueError( - "Multi-modality decoder inputs of encoder-decoder models are " - "not supported yet") - - # For Multi-Modal models (e.g., mllama), the text input can be - # <|image|><|begin_of_text|>hello world. And we should not add - # another <|begin_of_text|> to the beginning. - decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation( - decoder_prompt_ids, - force_bos=(encoder_mm_data is None and decoder_mm_data is None))) + if (encoder_inputs["type"] == "token" + or encoder_inputs["type"] == "multimodal"): + pass + else: + assert_never(encoder_inputs) + + if decoder_inputs is None: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) + decoder_inputs = token_inputs(dec_token_ids) + elif (decoder_inputs["type"] == "token" + or decoder_inputs["type"] == "multimodal"): + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + decoder_inputs["prompt_token_ids"]) + decoder_inputs["prompt_token_ids"] = dec_token_ids + + if "multi_modal_data" in decoder_inputs: + raise ValueError("Multi-modal decoder inputs of encoder-" + "decoder models are not supported yet") + else: + assert_never(encoder_inputs) return EncoderDecoderInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - multi_modal_data=decoder_mm_data, - mm_processor_kwargs=mm_processor_kwargs, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - encoder_multi_modal_data=encoder_mm_data, + encoder=encoder_inputs, + decoder=decoder_inputs, ) def _process_encoder_decoder_prompt( @@ -377,10 +501,9 @@ def _process_encoder_decoder_prompt( prompt: PromptType, request_id: str, ) -> EncoderDecoderInputs: - ''' + """ For encoder/decoder models only: - Process an input prompt into an - :class:`EncoderDecoderInputs` instance. + Process an input prompt into an :class:`EncoderDecoderInputs` instance. There are two types of input prompts: singleton prompts which carry only the @@ -399,7 +522,7 @@ def _process_encoder_decoder_prompt( have any possible singleton type; thus this method relies on helper functions to obtain token ids for the sub-prompts. - + Arguments: * prompt: an input prompt @@ -408,42 +531,32 @@ def _process_encoder_decoder_prompt( Returns: * :class:`EncoderDecoderInputs` instance - ''' - - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + """ + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - decoder_comps = None, None, None, None + decoder_inputs = None else: - decoder_comps = self._extract_prompt_components( + decoder_inputs = self._prompt_to_llm_inputs( decoder_input, request_id=request_id, ) - # Handle this carefully in case it was directly initialized by user - mm_processor_kwargs = prompt.get("mm_processor_kwargs", {}) else: - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) async def _process_encoder_decoder_prompt_async( self, @@ -451,63 +564,51 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_task = self._extract_prompt_components_async( + encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - encoder_comps = await encoder_task - decoder_comps = None, None, None, None + encoder_inputs = await encoder_task + decoder_inputs = None else: - decoder_task = self._extract_prompt_components_async( + decoder_task = self._prompt_to_llm_inputs_async( decoder_input, request_id=request_id, ) - encoder_comps, decoder_comps = await asyncio.gather( + encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) - mm_processor_kwargs = prompt["mm_processor_kwargs"] else: - encoder_comps = await self._extract_prompt_components_async( + encoder_inputs = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) def _build_decoder_only_llm_inputs( self, - prompt_comps: PromptComponents, + prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - # --- FLAGSCALE MODIFICATION BEG --- - (prompt, prompt_token_ids, multi_modal_data, - mm_processor_kwargs, negative_prompt, negative_prompt_token_ids) = prompt_comps - # --- FLAGSCALE MODIFICATION END --- - - prompt_token_ids = self._apply_prompt_adapter( - prompt_token_ids, prompt_adapter_request=prompt_adapter_request) + if (prompt_inputs["type"] == "token" + or prompt_inputs["type"] == "multimodal"): + prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( + prompt_inputs["prompt_token_ids"], + prompt_adapter_request=prompt_adapter_request, + ) + else: + assert_never(prompt_inputs) - return DecoderOnlyInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs, - negative_prompt_token_ids=negative_prompt_token_ids, # --- FLAGSCALE MODIFICATION --- - negative_prompt=negative_prompt) # --- FLAGSCALE MODIFICATION --- + return prompt_inputs def _process_decoder_only_prompt( self, @@ -516,7 +617,7 @@ def _process_decoder_only_prompt( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: - ''' + """ For decoder-only models: Process an input prompt into an :class:`DecoderOnlyInputs` instance. @@ -530,9 +631,9 @@ def _process_decoder_only_prompt( Returns: * :class:`DecoderOnlyInputs` instance - ''' + """ - prompt_comps = self._extract_prompt_components( + prompt_comps = self._prompt_to_llm_inputs( prompt, request_id=request_id, lora_request=lora_request, @@ -551,7 +652,7 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" - prompt_comps = await self._extract_prompt_components_async( + prompt_comps = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, lora_request=lora_request, @@ -568,9 +669,9 @@ def preprocess( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Preprocess the input prompt.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return self._process_encoder_decoder_prompt( @@ -596,9 +697,9 @@ async def preprocess_async( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return await self._process_encoder_decoder_prompt_async( @@ -617,6 +718,3 @@ async def preprocess_async( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - - def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model diff --git a/flagscale/inference/core/sampling_metadata.py b/flagscale/inference/core/sampling_metadata.py index 9762ddb30..f86c9cae2 100644 --- a/flagscale/inference/core/sampling_metadata.py +++ b/flagscale/inference/core/sampling_metadata.py @@ -301,6 +301,7 @@ def _prepare_seq_groups( negative_prompt_logprob_len = (negative_query_len - num_prefill_sample if do_sample else negative_query_len) negative_sample_len = num_prefill_sample if do_sample else 0 + # --- FLAGSCALE MODIFICATION END --- else: query_len, seq_len = query_lens[i], seq_lens[i] # If we need sampling, exclude num_prefill_sample tokens from @@ -308,22 +309,26 @@ def _prepare_seq_groups( prompt_logprob_len = (query_len - num_prefill_sample if do_sample else query_len) sample_len = num_prefill_sample if do_sample else 0 - # --- FLAGSCALE MODIFICATION END --- else: # Decode # --- FLAGSCALE MODIFICATION BEG --- if has_negative: + # positive prompt_logprob_len = 0 - query_len = query_lens[::2][i] if query_lens is not None else 1 + query_len = query_lens[::2][i] if query_lens is not None and len( + query_lens[::2]) > 0 else 1 sample_len = len(seq_ids) * query_len if do_sample else 0 + # negative negative_prompt_logprob_len = 0 - negative_query_len = query_lens[1::2][i] if query_lens is not None else 1 + negative_query_len = query_lens[1::2][i] if query_lens is not None and len( + query_lens[1::2]) > 0 else 1 negative_sample_len = len(seq_ids) * negative_query_len if do_sample else 0 + # --- FLAGSCALE MODIFICATION END --- else: prompt_logprob_len = 0 - query_len = query_lens[i] if query_lens is not None else 1 + query_len = query_lens[i] if query_lens is not None and len( + query_lens) > 0 else 1 sample_len = len(seq_ids) * query_len if do_sample else 0 - # --- FLAGSCALE MODIFICATION END --- if sampling_params.seed is not None and generators is not None: generator = generators.get(seq_group_metadata.request_id) @@ -515,6 +520,7 @@ def from_sampling_metadata( if do_penalties: for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids + sampling_params = seq_group.sampling_params if (seq_group.is_prompt and sampling_params.prompt_logprobs is not None): prefill_len = len(seq_group.prompt_logprob_indices) diff --git a/flagscale/inference/core/sampling_params.py b/flagscale/inference/core/sampling_params.py index 4cd943a23..fbe33c808 100644 --- a/flagscale/inference/core/sampling_params.py +++ b/flagscale/inference/core/sampling_params.py @@ -199,9 +199,7 @@ class SamplingParams( include_stop_str_in_output: bool = False truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=1)]] = None output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE - # --- FLAGSCALE MODIFICATION BEG --- - guidance_scale: Optional[float] = None - # --- FLAGSCALE MODIFICATION END --- + guidance_scale: Optional[float] = None # --- FLAGSCALE MODIFICATION --- # The below fields are not supposed to be used as an input. # They are set in post_init. @@ -297,8 +295,9 @@ def __post_init__(self) -> None: raise ValueError( f"best_of must be greater than or equal to n, " f"got n={self.n} and best_of={self.best_of}.") - self._real_n = self.n - self.n = self.best_of + if not self._real_n: + self._real_n = self.n + self.n = self.best_of if 0 < self.temperature < _MAX_TEMP: logger.warning( @@ -490,10 +489,8 @@ def __repr__(self) -> str: "spaces_between_special_tokens=" f"{self.spaces_between_special_tokens}, " f"truncate_prompt_tokens={self.truncate_prompt_tokens}, " - f"guided_decoding={self.guided_decoding}, " - # --- FLAGSCALE MODIFICATION BEG --- - f"guidance_scale={self.guidance_scale})") - # --- FLAGSCALE MODIFICATION END --- + f"guided_decoding={self.guided_decoding}, " + f"guidance_scale={self.guidance_scale})") # --- FLAGSCALE MODIFICATION --- class BeamSearchParams( @@ -507,3 +504,4 @@ class BeamSearchParams( ignore_eos: bool = False temperature: float = 0.0 length_penalty: float = 1.0 + include_stop_str_in_output: bool = False diff --git a/flagscale/inference/core/scheduler.py b/flagscale/inference/core/scheduler.py index df0a010a8..f2e5c1c10 100644 --- a/flagscale/inference/core/scheduler.py +++ b/flagscale/inference/core/scheduler.py @@ -3,6 +3,7 @@ import os import random import time +import contextlib # --- FLAGSCALE MODIFICATION --- from collections import deque from dataclasses import dataclass, field from typing import Callable, Deque, Dict, Iterable, List, Optional @@ -57,11 +58,16 @@ class SchedulingBudget: max_num_seqs: int _request_ids_num_batched_tokens: Set[str] = field(default_factory=set) _request_ids_num_curr_seqs: Set[str] = field(default_factory=set) + # Number of cached tokens in the batch. + _num_cached_tokens: int = 0 + # Number of actual non-cached tokens in the batch. _num_batched_tokens: int = 0 _num_curr_seqs: int = 0 def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): - assert num_new_tokens != 0 + # We allow num_new_tokens to be 0 when the entire sequence has + # been cached. + assert num_new_tokens >= 0 assert num_new_seqs != 0 return (self.num_batched_tokens + num_new_tokens <= self.token_budget and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs) @@ -69,12 +75,18 @@ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int): def remaining_token_budget(self): return self.token_budget - self.num_batched_tokens - def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int): + def add_num_batched_tokens(self, + req_id: str, + num_batched_tokens: int, + num_cached_tokens: int = 0): if req_id in self._request_ids_num_batched_tokens: return + assert num_cached_tokens >= 0 + assert num_batched_tokens >= 0 self._request_ids_num_batched_tokens.add(req_id) self._num_batched_tokens += num_batched_tokens + self._num_cached_tokens += num_cached_tokens def subtract_num_batched_tokens(self, req_id: str, num_batched_tokens: int): @@ -102,6 +114,10 @@ def num_batched_tokens(self): def num_curr_seqs(self): return self._num_curr_seqs + @property + def num_cached_tokens(self): + return self._num_cached_tokens + @dataclass class ScheduledSequenceGroup: @@ -111,9 +127,7 @@ class ScheduledSequenceGroup: # 1 for decoding. Same as prompt tokens for prefill, but if prefill is # chunked, it can be smaller than that. token_chunk_size: int - # --- FLAGSCALE MODIFICATION BEG --- - negative_token_chunk_size: int = 0 - # --- FLAGSCALE MODIFICATION END --- + negative_token_chunk_size: int = 0 # --- FLAGSCALE MODIFICATION --- @dataclass @@ -295,11 +309,55 @@ def scheduler_running_outputs_builder(): def scheduled_seq_group_builder(): return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup), - token_chunk_size=0, - negative_token_chunk_size=0) # --- FLAGSCALE MODIFICATION --- + token_chunk_size=0, negative_token_chunk_size=0) # --- FLAGSCALE MODIFICATION --- # return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0) +# --- FLAGSCALE MODIFICATION BEG --- +@contextlib.contextmanager +def _switch_seq_group(seq_group: SequenceGroup, new_seqs: Sequence): + + if new_seqs is None: + yield + else: + origin_seq = seq_group.seqs + origin_negative_seq = seq_group.negative_seqs + + for idx, new_seq in enumerate(new_seqs): + new_seq.status = origin_seq[idx].status + + seq_group.seqs = new_seqs + seq_group.negative_seqs = None + try: + yield + finally: + seq_group.seqs = origin_seq + seq_group.negative_seqs = origin_negative_seq + + +def _update_num_new_tokens( + budget: SchedulingBudget, + num_new_tokens: int = 0, + num_new_tokens_negative: int = 0, +): + remaining_token_budget = budget.remaining_token_budget() + if num_new_tokens + num_new_tokens_negative >= remaining_token_budget: + min_num_new_tokens = min(num_new_tokens, remaining_token_budget // 2) + min_num_new_tokens_negative = min(num_new_tokens_negative, remaining_token_budget // 2) + + if min_num_new_tokens == num_new_tokens and \ + min_num_new_tokens_negative != num_new_tokens_negative: + return 0, 0 + elif min_num_new_tokens != num_new_tokens and \ + min_num_new_tokens_negative == num_new_tokens_negative: + return 0, 0 + else: + return min_num_new_tokens, min_num_new_tokens_negative + + return num_new_tokens, num_new_tokens_negative +# --- FLAGSCALE MODIFICATION END --- + + class Scheduler: def __init__( @@ -546,10 +604,34 @@ def _schedule_running( assert len(self._async_stopped) == 0 while running_queue: seq_group = running_queue[0] - num_running_tokens, negative_num_running_tokens = self._get_num_new_tokens( # --- FLAGSCALE MODIFICATION --- - seq_group, SequenceStatus.RUNNING, enable_chunking, budget) + # We discard the cached tokens info here because we don't need it + # for running sequence: + # 1. If a sequence is running with chunked prefill, the cached + # tokens info was already used for the first prefill. + # 2. If a sequence is running with non-chunked prefill, then + # there it's a decoding sequence, and the cached tokens info is + # irrelevant. + num_uncached_new_tokens, _ = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.RUNNING, enable_chunking, + budget)) + + # --- FLAGSCALE MODIFICATION BEG --- + with _switch_seq_group(seq_group, seq_group.negative_seqs): + num_uncached_new_tokens_negative, _ = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.RUNNING, enable_chunking, + budget)) + # --- FLAGSCALE MODIFICATION END --- - if num_running_tokens + negative_num_running_tokens == 0: # --- FLAGSCALE MODIFICATION --- + num_running_tokens = num_uncached_new_tokens + # --- FLAGSCALE MODIFICATION BEG --- + num_running_tokens_negative = num_uncached_new_tokens_negative + num_running_tokens, num_running_tokens_negative = _update_num_new_tokens( + budget, num_running_tokens, num_running_tokens_negative) + # --- FLAGSCALE MODIFICATION END --- + + if num_running_tokens + num_running_tokens_negative == 0: # --- FLAGSCALE MODIFICATION --- # No budget => Stop break @@ -568,7 +650,7 @@ def _schedule_running( # slot to keep all the sequence groups in the RUNNING state. while not self._can_append_slots(seq_group, enable_chunking): budget.subtract_num_batched_tokens(seq_group.request_id, - num_running_tokens + negative_num_running_tokens) # --- FLAGSCALE MODIFICATION --- + num_running_tokens + num_running_tokens_negative) # --- FLAGSCALE MODIFICATION --- num_running_seqs = seq_group.get_max_num_running_seqs() budget.subtract_num_seqs(seq_group.request_id, num_running_seqs) @@ -624,7 +706,7 @@ def _schedule_running( scheduled_seq_group.seq_group = seq_group if is_prefill: scheduled_seq_group.token_chunk_size = num_running_tokens - scheduled_seq_group.negative_token_chunk_size = negative_num_running_tokens # --- FLAGSCALE MODIFICATION --- + scheduled_seq_group.negative_token_chunk_size = num_running_tokens_negative # --- FLAGSCALE MODIFICATION --- prefill_seq_groups.append(scheduled_seq_group) ret.prefill_seq_groups_list.append(seq_group) else: @@ -634,7 +716,7 @@ def _schedule_running( ret.decode_seq_groups_list.append(seq_group) budget.add_num_batched_tokens(seq_group.request_id, - num_running_tokens + negative_num_running_tokens) # --- FLAGSCALE MODIFICATION --- + num_running_tokens + num_running_tokens_negative) # --- FLAGSCALE MODIFICATION --- # OPTIMIZATION: Note that get_max_num_running_seqs is # expensive. For the default scheduling chase where # enable_chunking is False, num_seqs are updated before running @@ -722,14 +804,24 @@ def _schedule_swapped( # The total number of sequences in the RUNNING state should not # exceed the maximum number of sequences. num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens, negative_num_new_tokens = self._get_num_new_tokens(seq_group, # --- FLAGSCALE MODIFICATION --- - SequenceStatus.SWAPPED, - enable_chunking, budget) + num_new_tokens_uncached, num_new_tokens_cached = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.SWAPPED, enable_chunking, + budget)) - if (num_new_tokens + negative_num_new_tokens == 0 - or not budget.can_schedule(num_new_tokens=num_new_tokens + negative_num_new_tokens, # --- FLAGSCALE MODIFICATION --- - num_new_seqs=num_new_seqs)): + # --- FLAGSCALE MODIFICATION BEG --- + with _switch_seq_group(seq_group, seq_group.negative_seqs): + num_new_tokens_uncached_negative, num_new_tokens_cached_negative = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.SWAPPED, enable_chunking, + budget)) + if num_new_tokens_uncached + num_new_tokens_uncached_negative == 0 \ + or not budget.can_schedule( + num_new_tokens=num_new_tokens_uncached + num_new_tokens_uncached_negative, + num_new_seqs=num_new_seqs, + ): break + # --- FLAGSCALE MODIFICATION END --- if lora_int_id > 0 and curr_loras is not None: curr_loras.add(lora_int_id) @@ -739,13 +831,24 @@ def _schedule_swapped( is_prefill = seq_group.is_prefill() if is_prefill: prefill_seq_groups.append( - ScheduledSequenceGroup(seq_group, - token_chunk_size=num_new_tokens, - negative_token_chunk_size=negative_num_new_tokens)) # --- FLAGSCALE MODIFICATION --- + ScheduledSequenceGroup( + seq_group, + token_chunk_size=num_new_tokens_uncached + + num_new_tokens_cached, + negative_token_chunk_size=num_new_tokens_uncached_negative + + num_new_tokens_cached_negative # --- FLAGSCALE MODIFICATION --- + )) else: decode_seq_groups.append( - ScheduledSequenceGroup(seq_group, token_chunk_size=1, negative_token_chunk_size=1 if seq_group.has_negative_seqs() else 0)) # --- FLAGSCALE MODIFICATION --- - budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens + negative_num_new_tokens) # --- FLAGSCALE MODIFICATION --- + ScheduledSequenceGroup( + seq_group, + token_chunk_size=1, + negative_token_chunk_size=1 if seq_group.has_negative_seqs() else 0)) # --- FLAGSCALE MODIFICATION --- + budget.add_num_batched_tokens( + seq_group.request_id, + num_batched_tokens=num_new_tokens_uncached + num_new_tokens_uncached_negative, # --- FLAGSCALE MODIFICATION --- + num_cached_tokens=num_new_tokens_cached + num_new_tokens_cached_negative, # --- FLAGSCALE MODIFICATION --- + ) budget.add_num_seqs(seq_group.request_id, num_new_seqs) swapped_queue.extendleft(leftover_swapped) @@ -811,33 +914,36 @@ def _schedule_priority_preemption( if waiting_queue: seq_group = waiting_queue.popleft() num_new_seqs = seq_group.get_max_num_running_seqs() - num_new_tokens = self._get_num_new_tokens(seq_group, - SequenceStatus.WAITING, - False, budget) + num_new_tokens_uncached, _ = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.WAITING, False, budget)) #Only preempt if priority inversion exists while running_queue and self._get_priority( running_queue[-1]) > self._get_priority(seq_group): #Only preempt if waiting sequence cannot be allocated can_allocate = self.block_manager.can_allocate(seq_group) - if (num_new_tokens and can_allocate == AllocStatus.OK - and budget.can_schedule(num_new_tokens=num_new_tokens, - num_new_seqs=num_new_seqs)): + if (num_new_tokens_uncached > 0 + and can_allocate == AllocStatus.OK + and budget.can_schedule( + num_new_tokens=num_new_tokens_uncached, + num_new_seqs=num_new_seqs, + )): break #Adjust budget to remove the victim sequence group vseq_group = running_queue.pop() - num_running_tokens = self._get_num_new_tokens( - vseq_group, SequenceStatus.RUNNING, False, budget) - budget.subtract_num_batched_tokens(vseq_group.request_id, - num_running_tokens) + num_running_tokens_uncached, _ = ( + self._get_num_new_uncached_and_cached_tokens( + vseq_group, SequenceStatus.RUNNING, False, budget)) + budget.subtract_num_batched_tokens( + vseq_group.request_id, num_running_tokens_uncached) num_running_seqs = vseq_group.get_max_num_running_seqs() budget.subtract_num_seqs(vseq_group.request_id, num_running_seqs) #Preempt out the victim sequence group - self._preempt(vseq_group, blocks_to_swap_out, - PreemptionMode.RECOMPUTE) + self._preempt(vseq_group, blocks_to_swap_out) waiting_queue.appendleft(vseq_group) force_preemption_count += 1 #Put the sequence back into the waiting queue @@ -891,18 +997,39 @@ def _schedule_prefills( assert len(waiting_seqs) == 1, ( "Waiting sequence group should have only one prompt " "sequence.") - num_new_tokens, negative_num_new_tokens = self._get_num_new_tokens(seq_group, # --- FLAGSCALE MODIFICATION --- - SequenceStatus.WAITING, - enable_chunking, budget) + num_new_tokens_uncached, num_new_tokens_cached = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.WAITING, enable_chunking, + budget)) + + # --- FLAGSCALE MODIFICATION BEG --- + with _switch_seq_group(seq_group, seq_group.negative_seqs): + num_new_tokens_uncached_negative, num_new_tokens_cached_negative = ( + self._get_num_new_uncached_and_cached_tokens( + seq_group, SequenceStatus.WAITING, enable_chunking, + budget)) + num_new_tokens_uncached, num_new_tokens_uncached_negative = _update_num_new_tokens( + budget, num_new_tokens_uncached, num_new_tokens_uncached_negative) + + num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached + num_new_tokens_negative = num_new_tokens_uncached_negative + num_new_tokens_cached_negative + # --- FLAGSCALE MODIFICATION END --- + if not enable_chunking: num_prompt_tokens = waiting_seqs[0].get_len() assert num_new_tokens == num_prompt_tokens + # --- FLAGSCALE MODIFICATION BEG --- + if num_new_tokens_negative > 0: + assert self.scheduler_config.is_multi_step is False + assert self.cache_config.enable_prefix_caching is False + # --- FLAGSCALE MODIFICATION END --- + prompt_limit = self._get_prompt_limit(seq_group) - if num_new_tokens + negative_num_new_tokens > prompt_limit: # --- FLAGSCALE MODIFICATION --- + if num_new_tokens + num_new_tokens_negative > prompt_limit: # --- FLAGSCALE MODIFICATION --- logger.warning( "Input prompt (%d tokens) (%d negative tokens) is too long" - " and exceeds limit of %d", num_new_tokens, negative_num_new_tokens, prompt_limit) # --- FLAGSCALE MODIFICATION --- + " and exceeds limit of %d", num_new_tokens, num_new_tokens_negative, prompt_limit) # --- FLAGSCALE MODIFICATION --- for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) @@ -921,9 +1048,9 @@ def _schedule_prefills( break elif can_allocate == AllocStatus.NEVER: logger.warning( - "Input prompt (%d tokens) (%d negative tokens) is too long" - " and exceeds the capacity of block_manager", - num_new_tokens, negative_num_new_tokens) # --- FLAGSCALE MODIFICATION --- + "Input prompt (%d tokens) (%d negative tokens) + lookahead slots (%d) is " + "too long and exceeds the capacity of block_manager", + num_new_tokens, num_new_tokens_negative, num_lookahead_slots) # --- FLAGSCALE MODIFICATION --- for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) @@ -944,10 +1071,19 @@ def _schedule_prefills( waiting_queue.popleft() continue + if (budget.num_batched_tokens >= + self.scheduler_config.max_num_batched_tokens): + # We've reached the budget limit - since there might be + # continuous prefills in the running queue, we should break + # to avoid scheduling any new prefills. + break + num_new_seqs = seq_group.get_max_num_running_seqs() - if (num_new_tokens + negative_num_new_tokens == 0 # --- FLAGSCALE MODIFICATION --- - or not budget.can_schedule(num_new_tokens=num_new_tokens + negative_num_new_tokens, # --- FLAGSCALE MODIFICATION --- - num_new_seqs=num_new_seqs)): + if num_new_tokens_uncached + num_new_tokens_uncached_negative == 0 \ + or not budget.can_schedule( + num_new_tokens=num_new_tokens_uncached + num_new_tokens_uncached_negative, + num_new_seqs=num_new_seqs, + ): # --- FLAGSCALE MODIFICATION --- break # Can schedule this request. @@ -976,8 +1112,12 @@ def _schedule_prefills( seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens, - negative_token_chunk_size=negative_num_new_tokens)) # --- FLAGSCALE MODIFICATION --- - budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens + negative_num_new_tokens) # --- FLAGSCALE MODIFICATION --- + negative_token_chunk_size=num_new_tokens_negative)) # --- FLAGSCALE MODIFICATION --- + budget.add_num_batched_tokens( + seq_group.request_id, + num_batched_tokens=num_new_tokens_uncached + num_new_tokens_uncached_negative, # --- FLAGSCALE MODIFICATION --- + num_cached_tokens=num_new_tokens_cached + num_new_tokens_cached_negative, # --- FLAGSCALE MODIFICATION --- + ) budget.add_num_seqs(seq_group.request_id, num_new_seqs) # Queue requests that couldn't be scheduled. @@ -1085,7 +1225,8 @@ def _schedule_default(self) -> SchedulerOutputs: return SchedulerOutputs( scheduled_seq_groups=scheduled_seq_groups, num_prefill_groups=num_prefill_groups, - num_batched_tokens=budget.num_batched_tokens, + num_batched_tokens=budget.num_batched_tokens + + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, blocks_to_swap_out=running_scheduled.blocks_to_swap_out, blocks_to_copy=blocks_to_copy, @@ -1129,7 +1270,6 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - # Schedule new prefills. prefills = self._schedule_prefills(budget, curr_loras, enable_chunking=True) @@ -1157,23 +1297,35 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) + # Put prefills first due to Attention backend ordering assumption. + scheduled_seq_groups = (prefills.seq_groups + + running_scheduled.prefill_seq_groups + + swapped_in.prefill_seq_groups + + running_scheduled.decode_seq_groups + + swapped_in.decode_seq_groups) + num_prefill_groups = (len(prefills.seq_groups) + + len(swapped_in.prefill_seq_groups) + + len(running_scheduled.prefill_seq_groups)) + # If all prompts, then we set num_lookahead_slots to 0 + # this allows us to go through the `no_spec` path in + # `spec_decode_worker.py` + all_prefills = (len(scheduled_seq_groups) == num_prefill_groups) + num_lookahead_slots = (0 if + (all_prefills + and not self.scheduler_config.is_multi_step) + else running_scheduled.num_lookahead_slots) return SchedulerOutputs( - scheduled_seq_groups=(prefills.seq_groups + - running_scheduled.prefill_seq_groups + - swapped_in.prefill_seq_groups + - running_scheduled.decode_seq_groups + - swapped_in.decode_seq_groups), - num_prefill_groups=(len(prefills.seq_groups) + - len(swapped_in.prefill_seq_groups) + - len(running_scheduled.prefill_seq_groups)), - num_batched_tokens=budget.num_batched_tokens, + scheduled_seq_groups=scheduled_seq_groups, + num_prefill_groups=num_prefill_groups, + num_batched_tokens=budget.num_batched_tokens + + budget.num_cached_tokens, blocks_to_swap_in=swapped_in.blocks_to_swap_in, blocks_to_swap_out=running_scheduled.blocks_to_swap_out, blocks_to_copy=running_scheduled.blocks_to_copy + swapped_in.blocks_to_copy, ignored_seq_groups=prefills.ignored_seq_groups + swapped_in.infeasible_seq_groups, - num_lookahead_slots=running_scheduled.num_lookahead_slots, + num_lookahead_slots=num_lookahead_slots, running_queue_size=len(self.running), preempted=(len(running_scheduled.preempted) + len(running_scheduled.swapped_out)), @@ -1344,12 +1496,15 @@ def schedule( negative_seq_data=negative_seq_data, # --- FLAGSCALE MODIFICATION --- negative_block_tables=negative_block_tables, # --- FLAGSCALE MODIFICATION --- state=seq_group.state, + token_type_ids=seq_group.token_type_ids, # `multi_modal_data` will only be present for the 1st comm # between engine and worker. # the subsequent comms can still use delta, but # `multi_modal_data` will be None. multi_modal_data=seq_group.multi_modal_data if scheduler_outputs.num_prefill_groups > 0 else None, + multi_modal_placeholders=seq_group.multi_modal_placeholders + if scheduler_outputs.num_prefill_groups > 0 else None, mm_processor_kwargs=seq_group.mm_processor_kwargs, prompt_adapter_request=seq_group.prompt_adapter_request, ) @@ -1493,12 +1648,8 @@ def _append_slots(self, assert not seq_group.has_negative_seqs() # --- FLAGSCALE MODIFICATION --- blocks_to_copy.extend(cows) - def _preempt( - self, - seq_group: SequenceGroup, - blocks_to_swap_out: List[Tuple[int, int]], - preemption_mode: Optional[PreemptionMode] = None, - ) -> PreemptionMode: + def _preempt(self, seq_group: SequenceGroup, + blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than # swapping. However, when the sequence group has multiple sequences @@ -1531,6 +1682,11 @@ def _preempt( preemption_mode, self.num_cumulative_preemption + 1) self.num_cumulative_preemption += 1 + # --- FLAGSCALE MODIFICATION BEG --- + if seq_group.has_negative_seqs() and preemption_mode != PreemptionMode.RECOMPUTE: + raise NotImplementedError("Swap in/out does not support in CFG.") + # --- FLAGSCALE MODIFICATION END --- + if preemption_mode == PreemptionMode.RECOMPUTE: self._preempt_by_recompute(seq_group) elif preemption_mode == PreemptionMode.SWAP: @@ -1634,89 +1790,178 @@ def _get_num_lookahead_slots(self, is_prefill: bool, return self.scheduler_config.num_lookahead_slots - def _get_num_new_tokens(self, seq_group: SequenceGroup, - status: SequenceStatus, enable_chunking: bool, - budget: SchedulingBudget) -> int: - """Get the next new tokens to compute for a given sequence group - that's in a given `status`. + def _get_num_new_uncached_and_cached_tokens( + self, + seq_group: SequenceGroup, + status: SequenceStatus, + enable_chunking: bool, + budget: SchedulingBudget, + ) -> Tuple[int, int]: + """ + Returns the number of new uncached and cached tokens to schedule for a + given sequence group that's in a given `status`. The API could chunk the number of tokens to compute based on `budget` if `enable_chunking` is True. If a sequence group has multiple sequences (e.g., running beam search), it means it is in decoding phase, so chunking doesn't happen. - Returns 0 if the new token cannot be computed due to token budget. + Returns (0, 0) if the new token cannot be computed due to token budget. + + The cached tokens's blocks are already computed, and the attention + backend will reuse the cached blocks rather than recomputing them. So + the scheduler could schedule these cached tokens "for free". + + Args: + seq_group: The sequence group to get the number of new tokens to + schedule. + status: The status of the sequences to get the number of new tokens + to schedule. + enable_chunking: Whether to chunk the number of tokens to compute. + budget: The budget to chunk the number of tokens to compute. + + + Returns: + A tuple of two ints. The first int is the number of new uncached + tokens to schedule. The second int is the number of cached tokens. + If no more new tokens can be scheduled, returns (0, 0). """ - # --- FLAGSCALE MODIFICATION BEG --- - num_new_tokens = 0 - negative_num_new_tokens = 0 + num_cached_new_tokens = 0 + num_uncached_new_tokens = 0 + seqs = seq_group.get_seqs(status=status) + # Compute the number of new uncached and cached tokens for + # each sequence. for seq in seqs: - num_new_tokens += seq.get_num_new_tokens() - if seq_group.has_negative_seqs(): - negative_seq = seq_group.negative_seqs_dict[seq.seq_id] - negative_num_new_tokens += negative_seq.get_num_new_tokens() - assert num_new_tokens + negative_num_new_tokens > 0 - - if seq_group.has_negative_seqs(): - if enable_chunking and len(seqs) == 1: - assert not self.cache_config.enable_prefix_caching, "Prefix caching is not supported with negative sequences." - remaining_token_budget = budget.remaining_token_budget() - if num_new_tokens + negative_num_new_tokens < remaining_token_budget: - return num_new_tokens, negative_num_new_tokens - else: - rt_num_new_tokens = min(num_new_tokens, remaining_token_budget // 2) - rt_negative_num_new_tokens = min(negative_num_new_tokens, remaining_token_budget // 2) + if not seq.is_prefill(): + # Decode sequences should always just have 1 uncached token + # TODO(rickyx): Actually is this still correct for multi-step? + num_uncached_new_tokens += 1 + continue - if rt_num_new_tokens == num_new_tokens and rt_negative_num_new_tokens != negative_num_new_tokens: - return 0, 0 - elif rt_num_new_tokens != num_new_tokens and rt_negative_num_new_tokens == negative_num_new_tokens: - return 0, 0 - else: - return rt_num_new_tokens, rt_negative_num_new_tokens - return num_new_tokens, negative_num_new_tokens - # --- FLAGSCALE MODIFICATION END --- + num_computed_tokens_seq = seq.get_num_computed_tokens() + all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq + if not self.cache_config.enable_prefix_caching: + # If prefix caching is not enabled, all new tokens are uncached. + num_uncached_new_tokens += all_num_new_tokens_seq + continue + + # NOTE: the cache token might be currently in a block that's in an + # evictor meaning that it's not yet allocated. However, we don't + # exclude such tokens in the cache count because it will be + # guaranteed to be allocated later if the sequence can be allocated. + num_cached_tokens_seq = self.block_manager.get_num_cached_tokens( + seq) + + # Sanity check. + if num_cached_tokens_seq < num_computed_tokens_seq: + # This should only happen with chunked prefill, and + # the seq is still in prefill. The `num_cached_tokens_seq` + # is the value we calculated on scheduling the first prefill. + # For subsequent continuous prefill steps, we cached the + # number of cache tokens for the sequence so the cached token + # count could be less than the number of computed tokens. + # See comments on `ComputedBlocksTracker` for more details. + assert ( + seq.is_prefill() and seq.status == SequenceStatus.RUNNING + and self.scheduler_config.chunked_prefill_enabled + ), ("Number of cached tokens should not be less than the " + "number of computed tokens for a sequence that's still " + f"in prefill. But there are {num_cached_tokens_seq} cached " + f"tokens and {num_computed_tokens_seq} computed tokens " + f"for sequence {seq.seq_id}.") + + num_cached_new_tokens_seq = max( + 0, num_cached_tokens_seq - num_computed_tokens_seq) + num_uncached_new_tokens_seq = (all_num_new_tokens_seq - + num_cached_new_tokens_seq) + + num_uncached_new_tokens += num_uncached_new_tokens_seq + num_cached_new_tokens += num_cached_new_tokens_seq + + if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0: + # For a fully cached hit sequence, we actually need to recompute the + # last token. So we need at least 1 uncached token to schedule. + # See ModelRunner._compute_for_prefix_cache_hit for more details. + num_uncached_new_tokens = 1 + num_cached_new_tokens -= 1 - # Chunk if a running request cannot fit in the given budget. - # If number of seq > 1, it means it is doing beam search - # in a decode phase. Do not chunk. if enable_chunking and len(seqs) == 1: - remaining_token_budget = budget.remaining_token_budget() - if self.scheduler_config.is_multi_step: - # The current multi-step + chunked prefill capability does - # not actually support chunking prompts. - # - # Therefore, `num_new_tokens` is computed in the same fashion - # for both multi-step+chunked-prefill & - # multi-step+chunked-prefill+APC - # - # Prompts with more tokens than the current remaining budget - # are postponed to future scheduler steps - if num_new_tokens > self._get_prompt_limit(seq_group): - # If the seq_group is in prompt-stage, pass the - # num_new_tokens as-is so the caller can ignore - # the sequence. - pass - else: - num_new_tokens = 0 \ - if num_new_tokens > remaining_token_budget \ - else num_new_tokens - elif self.cache_config.enable_prefix_caching: - # When prefix caching is enabled, we always allocate - # the number of new tokens that is dividable by the block - # size to avoid partial block matching. - block_size = self.cache_config.block_size - remainder = budget.token_budget % block_size - if remainder != 0: - raise ValueError("When enabling chunked prefill and " - "prefix caching, max_num_batched_tokens " - "(chunk size) must be dividable by " - "block size, but got chunk_size " - f"({budget.token_budget}) % block_size " - f"({block_size}) = {remainder}") - if remaining_token_budget < num_new_tokens: - num_new_tokens = (remaining_token_budget // - block_size) * block_size - else: - num_new_tokens = min(num_new_tokens, remaining_token_budget) - return num_new_tokens, negative_num_new_tokens # --- FLAGSCALE MODIFICATION --- + # Chunk if a running request cannot fit in the given budget. + # If number of seq > 1, it means it is doing beam search + # in a decode phase. Do not chunk. + num_uncached_new_tokens = self._chunk_new_tokens_to_schedule( + self.scheduler_config, + self.cache_config, + budget, + self._get_prompt_limit(seq_group), + num_uncached_new_tokens, + ) + + return num_uncached_new_tokens, num_cached_new_tokens + + @staticmethod + def _chunk_new_tokens_to_schedule( + scheduler_config: SchedulerConfig, + cache_config: CacheConfig, + budget: SchedulingBudget, + prompt_limit: int, + num_new_tokens: int, + ) -> int: + """ + Chunks the number of new tokens to schedule based on the budget when + chunked prefill is enabled. + + Args: + scheduler_config: The scheduler config. + cache_config: The cache config. + budget: The budget to chunk the number of tokens to compute. + prompt_limit: The maximum number of tokens allowed in a prompt. + num_new_tokens: The number of new tokens to schedule. + + Returns: + The number of new tokens to schedule after chunking. + """ + remaining_token_budget = budget.remaining_token_budget() + if scheduler_config.is_multi_step: + # The current multi-step + chunked prefill capability does + # not actually support chunking prompts. + # + # Therefore, `num_new_tokens` is computed in the same fashion + # for both multi-step+chunked-prefill & + # multi-step+chunked-prefill+APC + # + # Prompts with more tokens than the current remaining budget + # are postponed to future scheduler steps + if num_new_tokens > prompt_limit: + # If the seq_group is in prompt-stage, pass the + # num_new_tokens as-is so the caller can ignore + # the sequence. + return num_new_tokens + + return (0 if num_new_tokens > remaining_token_budget else + num_new_tokens) + + if cache_config.enable_prefix_caching: + # Adjust the remaining token budget to be divisible by the block + # size when prefix caching is enabled. + + # When prefix caching is enabled, we always allocate + # the number of new tokens that is dividable by the block + # size to avoid partial block matching. + block_size = cache_config.block_size + remainder = budget.token_budget % block_size + if remainder != 0: + raise ValueError("When enabling chunked prefill and " + "prefix caching, max_num_batched_tokens " + "(chunk size) must be dividable by " + "block size, but got chunk_size " + f"({budget.token_budget}) % block_size " + f"({block_size}) = {remainder}") + # Round down to block size. + remaining_token_budget = (remaining_token_budget // block_size * + block_size) + + num_new_tokens = min(num_new_tokens, remaining_token_budget) + + return num_new_tokens diff --git a/flagscale/inference/core/sequence.py b/flagscale/inference/core/sequence.py index 996a3a33b..614987178 100644 --- a/flagscale/inference/core/sequence.py +++ b/flagscale/inference/core/sequence.py @@ -6,24 +6,21 @@ from array import array from collections import defaultdict from dataclasses import dataclass, field -from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional +from functools import reduce +from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union, cast +from typing import Set, Tuple, Union import msgspec import torch -from vllm.inputs.parse import is_encoder_decoder_inputs +from vllm.inputs import SingletonInputs, SingletonInputsAdapter from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams -if TYPE_CHECKING: - from vllm.inputs import SingletonInputs - from vllm.multimodal.base import MultiModalDataDict - VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 @@ -167,6 +164,8 @@ class SequenceData(msgspec.Struct, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 + # The number of tokens with prefix cache hit. + _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) @@ -257,7 +256,8 @@ def output_token_ids(self) -> Tuple[int, ...]: return tuple(self._output_token_ids) @output_token_ids.setter - def output_token_ids(self, new_output_token_ids: List[int]) -> None: + def output_token_ids(self, + new_output_token_ids: GenericSequence[int]) -> None: self._output_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, new_output_token_ids) self._update_cached_all_tokens() @@ -322,6 +322,14 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int): if self.get_num_uncomputed_tokens() == 0: self._stage = SequenceStage.DECODE + def get_num_cached_tokens(self) -> int: + """Return the number of tokens with prefix cache hit.""" + return self._num_cached_tokens + + def update_num_cached_tokens(self, num_cached_tokens: int): + """Update the number of tokens with prefix cache hit.""" + self._num_cached_tokens = num_cached_tokens + def reset_state_for_recompute(self) -> None: """Reset the number of computed tokens from this sequence. It is supposed to be called when a sequence needs to be started from @@ -379,14 +387,9 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - The sequence is constructed from the :code:`SingletonInputs` instance - passed in through the :code:`inputs` constructor argument. - - For encoder/decoder models, SingletonInputs encapsulates both a - decoder and encoder prompt, creating an ambiguity about which - prompt to construct the sequence from. The `from_decoder_prompt` - constructor argument signals whether to construct the Sequence - from the SingletonInputs decoder prompt, or encoder prompt. + The sequence is constructed from the :data:`DecoderOnlyInputs` + (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the :code:`inputs` constructor argument. Args: seq_id: The ID of the sequence. @@ -396,59 +399,23 @@ class Sequence: eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. lora_request: LoRA request. prompt_adapter_request: Prompt Adapter request. - from_decoder_prompt: Construct Sequence from SingletonInputs decoder - prompt (True) or encoder prompt (False.) Must be - True for decoder-only model. - """ def __init__( self, seq_id: int, - inputs: "SingletonInputs", + inputs: SingletonInputs, block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - from_decoder_prompt: bool = True, - from_negative_prompt: bool = False, # --- FLAGSCALE MODIFICATION --- ) -> None: self.seq_id = seq_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.from_decoder_prompt = from_decoder_prompt - # --- FLAGSCALE MODIFICATION BEG --- - self.from_negative_prompt = from_negative_prompt - # --- FLAGSCALE MODIFICATION END --- - - # For decoder-only models, a Sequence is constructed - # from an DecoderOnlyInputs instance (the `inputs` arg.) - # - # For encoder/decoder models the same `inputs` - # instance could be utilized to construct either an - # encoder sequence or a decoder sequence, because - # `DecoderOnlyInputs` has both decoder- and encoder-oriented - # member variables (i.e. it encapsulates both an encoder - # and a decoder prompt.) The decision of which type of sequence - # to generate is determined by the `from_decoder_prompt` argument. - # - # When constructing a encoder sequence - # (`from_decoder_prompt` False) it matters that - # the `DecoderOnlyInputs` instance stored in `inputs` is valid - # in the sense that its encoder-related member variables are - # populated; below, an exception is raised if this is - # not the case. - # - # When constructing a decoder sequence (`from_decoder_prompt` True) - # it does not matter whether `inputs` has its encoder-related - # member variables populated. - if not (from_decoder_prompt or is_encoder_decoder_inputs(inputs)): - raise ValueError("Cannot extract encoder input prompt from " - f"invalid input {inputs}; did you forget the " - "encoder input prompt fields?") self.data = SequenceData.from_seqs(self.prompt_token_ids) self.output_logprobs: SampleLogprobs = [] @@ -471,55 +438,33 @@ def __init__( def n_blocks(self) -> int: return (self.get_len() + self.block_size - 1) // self.block_size - @cached_property + @property def prompt(self) -> Optional[str]: - # Select decoder or encoder input prompt str, as appropriate - prompt_key: str = ("prompt" - if self.from_decoder_prompt else "encoder_prompt") - - # --- FLAGSCALE MODIFICATION BEG --- - if self.from_negative_prompt: - assert self.from_decoder_prompt is True, "negative prompt is only supported for decoder" - prompt_key: str = "negative_prompt" - # --- FLAGSCALE MODIFICATION END --- + return self.inputs.prompt - return cast(Optional[str], self.inputs.get(prompt_key)) - - @cached_property + @property def prompt_token_ids(self) -> List[int]: - # Select decoder or encoder input prompt token ids, as appropriate - prompt_token_ids_key: str = ("prompt_token_ids" - if self.from_decoder_prompt else - "encoder_prompt_token_ids") + return self.inputs.prompt_token_ids - # --- FLAGSCALE MODIFICATION BEG --- - if self.from_negative_prompt: - assert self.from_decoder_prompt is True, "negative prompt is only supported for decoder" - prompt_token_ids_key: str = "negative_prompt_token_ids" - # --- FLAGSCALE MODIFICATION END --- + @property + def prompt_embeds(self) -> Optional[torch.Tensor]: + return self.inputs.prompt_embeds - # Cache computed prompt token ids - return cast(List[int], self.inputs.get(prompt_token_ids_key)) + @property + def token_type_ids(self) -> List[int]: + return self.inputs.token_type_ids @property def multi_modal_data(self) -> "MultiModalDataDict": - inputs = self.inputs + return self.inputs.multi_modal_data - if (inputs.get("multi_modal_data") - and inputs.get("encoder_multi_modal_data")): - raise ValueError( - "Multi-modal data in both encoder and decoder is not supported." - ) - - return cast( - "MultiModalDataDict", - (inputs.get("multi_modal_data") - or inputs.get("encoder_multi_modal_data") or {}), - ) + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.inputs.multi_modal_placeholders @property def mm_processor_kwargs(self) -> Dict[str, Any]: - return self.inputs.get("mm_processor_kwargs") or {} + return self.inputs.mm_processor_kwargs @property def lora_int_id(self) -> int: @@ -639,6 +584,9 @@ def get_num_new_tokens(self) -> int: return 1 return self.data.get_num_uncomputed_tokens() + def get_num_computed_tokens(self) -> int: + return self.data.get_num_computed_tokens() + def is_prefill(self) -> bool: return self.data.stage == SequenceStage.PREFILL @@ -723,6 +671,7 @@ def __init__( # --- FLAGSCALE MODIFICATION BEG --- self.negative_seqs = negative_seqs + self.negative_seqs_dict = {} if negative_seqs: self.negative_seqs_dict = {seq.seq_id: seq for seq in negative_seqs} assert self.is_single_seq is True @@ -754,9 +703,17 @@ def encoder_prompt_token_ids(self) -> Optional[List[int]]: if self.encoder_seq is not None else None) @property - def multi_modal_data(self) -> "MultiModalDataDict": + def token_type_ids(self) -> Optional[List[int]]: + return self.first_seq.token_type_ids + + @property + def multi_modal_data(self) -> MultiModalDataDict: return self.first_seq.multi_modal_data + @property + def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: + return self.first_seq.multi_modal_placeholders + @property def mm_processor_kwargs(self) -> Dict[str, Any]: return self.first_seq.mm_processor_kwargs @@ -969,7 +926,7 @@ class SequenceGroupMetadata( multi_modal_data: Multi modal data. mm_processor_kwargs: Multimodal input processor / mapper overrides. encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None + (SequenceGroup.encoder_seq). Should be None unless you are working with an encoder/decoder model. cross_block_table: Optional cross-attention block table associated @@ -993,7 +950,9 @@ class SequenceGroupMetadata( default_factory=lambda: SequenceGroupState()) # "MultiModalDataDict" types. We have to use Any due to msgspec # doesn't allow to have union of 2 different dicts. + token_type_ids: Optional[List[int]] = None multi_modal_data: Optional[Any] = None + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None encoder_seq_data: Optional[SequenceData] = None cross_block_table: Optional[List[int]] = None @@ -1227,7 +1186,7 @@ def get_all_seq_ids_and_request_ids( sequence ids. """ seq_ids: List[int] = [] - request_id_seq_ids_mapping: Dict[str, Set[int]] = defaultdict(set) + request_id_seq_ids_mapping: DefaultDict[str, Set[int]] = defaultdict(set) for sg in seq_group_metadata_list: for seq_id in sg.seq_data: seq_ids.append(seq_id) diff --git a/flagscale/runner/runner_inference.py b/flagscale/runner/runner_inference.py index 376696a22..a3c64550c 100644 --- a/flagscale/runner/runner_inference.py +++ b/flagscale/runner/runner_inference.py @@ -81,6 +81,7 @@ def _generate_run_script_inference( root_dir = os.path.dirname( os.path.dirname(os.path.dirname(os.path.abspath(__file__))) ) + vllm_dir = os.path.join(root_dir, "vllm") cmds_config = config.experiment.get("cmds", None) if cmds_config: before_start = cmds_config.get("before_start", "") @@ -94,7 +95,7 @@ def _generate_run_script_inference( f.write(f"\n") f.write(f"cd {root_dir}\n") f.write(f"\n") - f.write(f"export PYTHONPATH={root_dir}\n") + f.write(f"export PYTHONPATH={vllm_dir}:{root_dir}\n") f.write(f"\n") f.write(f'cmd="{cmd}"\n') f.write(f"\n") diff --git a/flagscale/runner/runner_serve.py b/flagscale/runner/runner_serve.py new file mode 100644 index 000000000..1955e8d95 --- /dev/null +++ b/flagscale/runner/runner_serve.py @@ -0,0 +1,306 @@ +import os +import shlex + +import hydra +from hydra.core.hydra_config import HydraConfig +from omegaconf import DictConfig, OmegaConf + +from flagscale.runner.runner_base import RunnerBase +from flagscale.runner.runner_utils import ( + get_free_port, + get_nnodes, + get_nproc_per_node, + logger, + parse_hostfile, + run_local_command, + run_scp_command, + run_ssh_command, +) + + +def _get_args_vllm(config: DictConfig): + # see the following link for more details + # https://github.com/facebookresearch/hydra/discussions/2750 + OmegaConf.set_struct(config, False) + + hydra_config = HydraConfig.get() + output_dir = hydra_config.runtime.output_dir + output_subdir = hydra_config.output_subdir + config_path = os.path.join(output_dir, f"{output_subdir}/config.yaml") + config_path = hydra.utils.to_absolute_path(config_path) + + args = [] + args.append(f"--config-path={config_path}") + + return args + + +def _update_config_serve(config: DictConfig): + exp_dir = os.path.abspath(config.experiment.exp_dir) + if not os.path.isdir(exp_dir): + os.makedirs(exp_dir) + assert os.path.isdir(exp_dir), f"Directory {exp_dir} does not exist." + + OmegaConf.set_struct(config, False) + + if config.get("logging", None) is None: + config.serve.logging = DictConfig({}) + + log_dir = os.path.join(exp_dir, f"serve_logs") + scripts_dir = os.path.join(log_dir, "scripts") + pids_dir = os.path.join(log_dir, "pids") + + config.serve.logging.log_dir = log_dir + config.serve.logging.scripts_dir = scripts_dir + config.serve.logging.pids_dir = pids_dir + + OmegaConf.set_struct(config, True) + + +def _generate_run_script_serve( + config, host, node_rank, cmd, background=True, with_test=False +): + logging_config = config.serve.logging + + no_shared_fs = config.experiment.runner.get("no_shared_fs", False) + if no_shared_fs: + host_output_file = os.path.join(logging_config.log_dir, f"host.output") + else: + host_output_file = os.path.join( + logging_config.log_dir, f"host_{node_rank}_{host}.output" + ) + host_run_script_file = os.path.join( + logging_config.scripts_dir, f"host_{node_rank}_{host}_run.sh" + ) + host_pid_file = os.path.join( + logging_config.pids_dir, f"host_{node_rank}_{host}.pid" + ) + + os.makedirs(logging_config.scripts_dir, exist_ok=True) + + root_dir = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + cmds_config = config.experiment.get("cmds", None) + if cmds_config: + before_start = cmds_config.get("before_start", "") + else: + before_start = "" + cmd += f" --log-dir={logging_config.log_dir}" + with open(host_run_script_file, "w") as f: + f.write("#!/bin/bash\n\n") + f.write("set -x\n") + f.write(f"{before_start}\n") + f.write(f"mkdir -p {logging_config.log_dir}\n") + f.write(f"mkdir -p {logging_config.pids_dir}\n") + f.write(f"\n") + f.write(f"cd {root_dir}\n") + f.write(f"\n") + f.write(f"export PYTHONPATH={root_dir}\n") + f.write(f"\n") + f.write(f'cmd="{cmd}"\n') + f.write(f"\n") + if with_test: + f.write(f'bash -c "$cmd; sync" \n') + else: + # TODO: need a option to control whether to append or overwrite the output file + # Now, it always appends to the output file + if background: + f.write( + f'nohup bash -c "$cmd; sync" >> {host_output_file} 2>&1 & echo $! > {host_pid_file}\n' + ) + else: + f.write(f'bash -c "$cmd; sync" >> {host_output_file} 2>&1\n') + f.write("\n") + f.flush() + os.fsync(f.fileno()) + os.chmod(host_run_script_file, 0o755) + + return host_run_script_file + + +def _generate_stop_script(config, host, node_rank): + logging_config = config.serve.logging + + host_stop_script_file = os.path.join( + logging_config.scripts_dir, f"host_{node_rank}_{host}_stop.sh" + ) + + host_pid_file = os.path.join( + logging_config.pids_dir, f"host_{node_rank}_{host}.pid" + ) + + os.makedirs(logging_config.scripts_dir, exist_ok=True) + + cmds_config = config.experiment.get("cmds", None) + if cmds_config: + after_stop = cmds_config.get("after_stop", "") + else: + after_stop = "" + with open(host_stop_script_file, "w") as f: + f.write("#!/bin/bash\n\n") + f.write("pkill -f 'python'\n") + f.write(f"{after_stop}\n") + f.flush() + os.fsync(f.fileno()) + os.chmod(host_stop_script_file, 0o755) + + return host_stop_script_file + + +class SSHServeRunner(RunnerBase): + def __init__(self, config: DictConfig): + super().__init__(config) + self.task_type = getattr(self.config.experiment.task, "type", None) + assert self.task_type == "serve", f"Unsupported task type: {self.task_type}" + self.command_line_mode = getattr( + self.config.serve.deploy, "command-line-mode", None + ) + self._prepare() + + def _prepare(self): + _update_config_serve(self.config) + self.user_args = _get_args_vllm(self.config) + self.user_envs = self.config.experiment.get("envs", {}) + if self.command_line_mode: + self.user_script = "flagscale/serve/run_vllm.py" + else: + self.user_script = self.config.experiment.task.entrypoint + self.resources = parse_hostfile( + self.config.experiment.runner.get("hostfile", None) + ) + logger.info("\n************** configuration **************") + logger.info(f"\n{OmegaConf.to_yaml(self.config)}") + + def _run_each( + self, + host, + master_addr, + master_port, + nnodes, + node_rank, + nproc_per_node, + with_test=False, + dryrun=False, + ): + export_cmd = [] + for k, v in self.user_envs.items(): + export_cmd += [f"{k}={v}"] + + cmd = shlex.join(export_cmd + ["python"] + [self.user_script] + self.user_args) + + logging_config = self.config.serve.logging + host_run_script_file = _generate_run_script_serve( + self.config, host, node_rank, cmd, background=True, with_test=with_test + ) + + if host != "localhost": + ssh_port = self.config.experiment.runner.get("ssh_port", 22) + # Step 1: make sure the scripts_dir exists on the remote host + run_ssh_command( + host, f"mkdir -p {logging_config.scripts_dir}", ssh_port, dryrun + ) + + # Step 2: copy the host_run_script_file to the remote host + no_shared_fs = self.config.experiment.runner.get("no_shared_fs", False) + if no_shared_fs: + run_scp_command( + host, + host_run_script_file, + logging_config.scripts_dir, + ssh_port, + dryrun, + ) + + # Step 3: run the host_run_script_file on the remote host + run_ssh_command(host, f"bash {host_run_script_file}", ssh_port, dryrun) + else: + run_local_command(f"bash {host_run_script_file}", dryrun) + + def run(self, with_test=False, dryrun=False): + num_visible_devices = None + visible_devices = self.user_envs.get("CUDA_VISIBLE_DEVICES", None) + if visible_devices is not None and isinstance(visible_devices, str): + visible_devices = visible_devices.split(",") + num_visible_devices = len(visible_devices) + + runner_config = self.config.experiment.runner + + # If hostfile is provided, use the resources from the hostfile + if self.resources is not None: + nnodes_from_hostfile = len(self.resources.keys()) + nnodes_from_args = runner_config.get("nnodes", None) + nnodes = get_nnodes(nnodes_from_hostfile, nnodes_from_args) + available_ip = list(self.resources.keys())[0] + available_port = get_free_port() + for node_rank, (host, resource_info) in enumerate(self.resources.items()): + if node_rank >= nnodes: + break + nproc_from_hostfile = resource_info["slots"] + nproc_from_args = runner_config.get("nproc_per_node", None) + nproc_per_node = get_nproc_per_node( + nproc_from_hostfile, nproc_from_args, num_visible_devices + ) + master_addr = runner_config.get("master_addr", available_ip) + master_port = runner_config.get("master_port", available_port) + self._run_each( + host, + master_addr, + master_port, + nnodes, + node_rank, + nproc_per_node, + with_test=with_test, + dryrun=dryrun, + ) + else: + # If hostfile is not provided, run the job on localhost + nproc_from_args = runner_config.get("nproc_per_node", None) + nproc_per_node = get_nproc_per_node( + None, nproc_from_args, num_visible_devices + ) + available_addr = runner_config.get("master_addr", "localhost") + available_port = runner_config.get("master_port", get_free_port()) + self._run_each( + "localhost", + available_addr, + available_port, + 1, + 0, + nproc_per_node, + with_test=with_test, + dryrun=dryrun, + ) + + def _stop_each(self, host, node_rank): + host_stop_script_file = _generate_stop_script(self.config, host, node_rank) + logging_config = self.config.serve.logging + + if host != "localhost": + ssh_port = self.config.experiment.runner.get("ssh_port", 22) + # Step 1: make sure the scripts_dir exists on the remote host + run_ssh_command(host, f"mkdir -p {logging_config.scripts_dir}", ssh_port) + # Step 2: copy the host_run_script_file to the remote host + no_shared_fs = self.config.experiment.runner.get("no_shared_fs", False) + if no_shared_fs: + run_scp_command( + host, host_stop_script_file, logging_config.scripts_dir, ssh_port + ) + # Step 3: run the host_run_script_file on the remote host + run_ssh_command(host, f"bash {host_stop_script_file}", ssh_port) + else: + run_local_command(f"bash {host_stop_script_file}") + + def stop(self): + if self.resources is None: + self._stop_each("localhost", 0) + return + + nnodes = get_nnodes( + len(self.resources), self.config.experiment.runner.get("nnodes", None) + ) + + for node_rank, (host, _) in enumerate(self.resources.items()): + if node_rank >= nnodes: + break + self._stop_each(host, node_rank) diff --git a/flagscale/runner/runner_train.py b/flagscale/runner/runner_train.py index 8efead5a4..745da2c52 100644 --- a/flagscale/runner/runner_train.py +++ b/flagscale/runner/runner_train.py @@ -7,6 +7,7 @@ from flagscale.runner.runner_base import JobStatus, RunnerBase from flagscale.runner.runner_utils import ( + add_decive_extra_config, flatten_dict_to_args, get_free_port, get_host_name_or_ip, @@ -290,6 +291,7 @@ def _prepare(self): self.user_args = _get_args_megatron(self.config) self.rdzv_id = datetime.now().strftime("%Y%m%d_%H%M%S.%f") self.user_envs = self.config.experiment.get("envs", {}) + self.cur_envs = None # current node envs self.user_script = self.config.experiment.task.entrypoint self.resources = parse_hostfile( self.config.experiment.runner.get("hostfile", None) @@ -305,11 +307,13 @@ def _run_each( nnodes, node_rank, nproc_per_node, + device_type=None, with_test=False, dryrun=False, ): export_cmd = [] - for k, v in self.user_envs.items(): + + for k, v in self.cur_envs.items(): export_cmd += [f"{k}={v}"] runner_cmd = _get_runner_cmd_train( @@ -321,6 +325,13 @@ def _run_each( nproc_per_node, self.config, ) + # update hetero-current-device-type according to the device_type in hostfile + if device_type is not None: + if "--hetero-current-device-type" in self.user_args: + idx = self.user_args.index("--hetero-current-device-type") + self.user_args[idx + 1] = device_type + else: + self.user_args += ["--hetero-current-device-type", device_type] cmd = shlex.join(export_cmd + runner_cmd + [self.user_script] + self.user_args) @@ -355,11 +366,6 @@ def _run_each( def run(self, with_test=False, dryrun=False, monitor=False, interval=10): num_visible_devices = None - visible_devices = self.user_envs.get("CUDA_VISIBLE_DEVICES", None) - if visible_devices is not None and isinstance(visible_devices, str): - visible_devices = visible_devices.split(",") - num_visible_devices = len(visible_devices) - runner_config = self.config.experiment.runner # If hostfile is provided, use the resources from the hostfile @@ -372,6 +378,13 @@ def run(self, with_test=False, dryrun=False, monitor=False, interval=10): for node_rank, (host, resource_info) in enumerate(self.resources.items()): if node_rank >= nnodes: break + self.cur_envs = add_decive_extra_config( + self.user_envs, resource_info["type"] + ) + visible_devices = self.cur_envs.get("CUDA_VISIBLE_DEVICES", None) + if visible_devices is not None and isinstance(visible_devices, str): + visible_devices = visible_devices.split(",") + num_visible_devices = len(visible_devices) nproc_from_hostfile = resource_info["slots"] nproc_from_args = runner_config.get("nproc_per_node", None) nproc_per_node = get_nproc_per_node( @@ -386,11 +399,17 @@ def run(self, with_test=False, dryrun=False, monitor=False, interval=10): nnodes, node_rank, nproc_per_node, + device_type=resource_info["type"], with_test=with_test, dryrun=dryrun, ) else: # If hostfile is not provided, run the job on localhost + self.cur_envs = self.user_envs + visible_devices = self.cur_envs.get("CUDA_VISIBLE_DEVICES", None) + if visible_devices is not None and isinstance(visible_devices, str): + visible_devices = visible_devices.split(",") + num_visible_devices = len(visible_devices) nproc_from_args = runner_config.get("nproc_per_node", None) nproc_per_node = get_nproc_per_node( None, nproc_from_args, num_visible_devices diff --git a/flagscale/runner/runner_utils.py b/flagscale/runner/runner_utils.py index 37a27f105..8e030648c 100644 --- a/flagscale/runner/runner_utils.py +++ b/flagscale/runner/runner_utils.py @@ -3,6 +3,9 @@ import re import socket import subprocess +import sys + +from omegaconf import DictConfig, OmegaConf from flagscale.logger import logger @@ -45,6 +48,10 @@ def parse_hostfile(hostfile_path): else: log_and_raise_error(f"Invalid entry in hostfile: {line}.") + assert all(info["type"] == None for _, info in resources.items()) or all( + info["type"] != None for _, info in resources.items() + ), "All hosts must have the a machine type or no machine type specified." + if len(resources) == 0: log_and_raise_error( "Hostfile is empty or not formatted correctly. Please check the hostfile." @@ -84,12 +91,24 @@ def run_local_command(cmd, dryrun=False, query=False): return if query: result = subprocess.run( - cmd, shell=True, check=True, capture_output=True, text=True + cmd, + shell=True, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", ) return result else: result = subprocess.run( - cmd, shell=True, check=True, capture_output=True, text=True + cmd, + shell=True, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", ) if result.returncode != 0: print(f"Command {cmd} failed with return code {result.returncode}.") @@ -107,13 +126,22 @@ def run_ssh_command(host, cmd, port=None, dryrun=False, query=False): logger.info(f"Running the ssh command: {ssh_cmd}") if dryrun: return + result = subprocess.run( + ssh_cmd, + shell=True, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + if result.returncode != 0: + print(f"SSH command {ssh_cmd} failed with return code {result.returncode}.") + print(f"Output: {result.stdout}") + print(f"Error: {result.stderr}") + sys.exit(result.returncode) if query: - result = subprocess.run( - ssh_cmd, shell=True, check=True, text=True, stdout=subprocess.PIPE - ) return result - else: - subprocess.run(ssh_cmd, shell=True, check=True) def run_scp_command(host, src, dst, port=None, dryrun=False): @@ -124,7 +152,20 @@ def run_scp_command(host, src, dst, port=None, dryrun=False): logger.info(f"Run the scp command: {scp_cmd}") if dryrun: return - subprocess.run(scp_cmd, shell=True, check=True) + result = subprocess.run( + scp_cmd, + shell=True, + check=True, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + if result.returncode != 0: + print(f"SCP command {scp_cmd} failed with return code {result.returncode}.") + print(f"Output: {result.stdout}") + print(f"Error: {result.stderr}") + sys.exit(result.returncode) def flatten_dict_to_args(config_dict, ignore_keys=[]): @@ -188,3 +229,26 @@ def get_nproc_per_node( return num_visible_devices else: return 1 + + +def add_decive_extra_config(config, device_type): + if device_type is None: + logger.warning( + f"type in hostfile is not specified. All the nodes use the same arguments inlucding evnironment variables." + ) + return OmegaConf.to_container(config, resolve=True) + cur_node_config = {} + temp_dict = {} + if isinstance(config, DictConfig): + temp_dict = OmegaConf.to_container(config, resolve=True) + else: + temp_dict = config + for key, value in temp_dict.items(): + if isinstance(value, dict): + if key == device_type: + cur_node_config.update(value) + else: + continue + else: + cur_node_config[key] = value + return cur_node_config diff --git a/flagscale/serve/README.md b/flagscale/serve/README.md new file mode 100644 index 000000000..8a312f1b5 --- /dev/null +++ b/flagscale/serve/README.md @@ -0,0 +1,93 @@ +## Introduce + +We introduce support for deploying large models with FlagScale, leveraging the Ray framework for efficient orchestration and scalability. Currently, this implementation supports the Qwen model, enabling users to easily deploy and manage large-scale machine learning services. + +Future Key features include: + +- Easy distributed Serve on base of eamless integration with Ray. +- Optimized resource management for large model inference. +- Simplified deployment process for the LLM and Multimodal models. + +This enhancement will significantly improve the usability of FlagScale for large model deployment scenarios. + +## Setup + +[Install vLLM](../../README.md#setup) + +## Prepare Model + +[Prepare Qwen data](https://www.modelscope.cn/models/Qwen/Qwen2.5-7B-Instruct/summary) + +```shell +pip install modelscope +modelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir /models/ +``` + +## Serve run + +```shell +cd FlagScale +python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=run +``` + +## Serve call + +```shell +curl http://127.0.0.1:4567/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "/models/Qwen2.5-7B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Introduce Bruce Lee in details."} + ] + }' +``` + +## Serve stop + +```shell +cd FlagScale +python run.py --config-path ./examples/qwen/conf --config-name config_qwen2.5_7b action=stop +``` + +## logs + +Since serve is the distributed mode, the logs are stored separately. \ +The default logs of are loacated in `/tmp/ray/session_latest/logs`.\ +The log of each work is named as `worker-[worker_id]-[job_id]-[pid].[out|err]`. + +## Config Template + +Flagscale.serve will support multiple scenarios. For better performance and usage, Flagscale.serve will optimize for specific scenarios, and these optimizations can be applied through different configurations. + +### Command Line Mode with vLLM + +If origin model is excuted in command line mode with vLLM, we can use Flagscale.serve to deploy it easily. + +```shell +vllm serve /models/Qwen2.5-7B-Instruct --tensor-parallel-size=1 --gpu-memory-utilization=0.9 --max-model-len=32768 --max-num-seqs=256 --port=4567 --trust-remote-code --enable-chunked-prefill +``` + +All the args remain the same as vLLM. Note that action args without value, like trust-remote-code and enable-chunked-prefill, are located in **action-args** block in config file. + +```YAML +model_args: + vllm_model: + model-tag: /models/Qwen2.5-7B-Instruct + tensor-parallel-size: 1 + gpu-memory-utilization: 0.9 + max-model-len: 32768 + max-num-seqs: 256 + port: 4567 + action-args: + - trust-remote-code + - enable-chunked-prefill + +deploy: + command-line-mode: true + models: + vllm_model: + num_gpus: 1 +``` + +### How to config serve parameters +***deploy*** block is used to specify the parameters of serve. The ***models*** block is used to specify the parameters of each model decorated by "serve.remote". diff --git a/flagscale/serve/__init__.py b/flagscale/serve/__init__.py new file mode 100644 index 000000000..f9aecd94b --- /dev/null +++ b/flagscale/serve/__init__.py @@ -0,0 +1,6 @@ +from .utils import init, run, stop, prepare, remote, task_config + + +__all__ = ["init", "run", "stop", "prepare", "remote", "task_config"] + +prepare() diff --git a/flagscale/serve/run_vllm.py b/flagscale/serve/run_vllm.py new file mode 100644 index 000000000..171239cbc --- /dev/null +++ b/flagscale/serve/run_vllm.py @@ -0,0 +1,73 @@ +import os +import sys +import datetime +import inspect +import subprocess +import argparse +import logging as logger +from omegaconf import OmegaConf +import ray +from flagscale import serve + + +timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + + +@serve.remote(name="vllm_model") +def vllm_model(args): + + vllm_args = args["serve"]["model_args"]["vllm_model"] + + command = ["vllm", "serve"] + command.append(vllm_args["model-tag"]) + for item in vllm_args: + if item not in {"model-tag", "action-args"}: + command.append(f"--{item}={vllm_args[item]}") + for arg in vllm_args["action-args"]: + command.append(f"--{arg}") + + # Start the subprocess + logger.info(f"[Serve]: Starting vllm serve with command: {' '.join(command)}") + runtime_context = ray.get_runtime_context() + worker_id = runtime_context.get_worker_id() + job_id = runtime_context.get_job_id() + logger.info( + f"[Serve]: Current Job ID: {job_id} , \n[Serve]: ******** Worker ID: {worker_id} ********\n\n" + ) + link_dir = os.path.join( + args.log_dir, f"session_latest_{timestamp}", f"worker-{worker_id}-" + ) + logger.info( + f"\n\n[Serve]: ********************** {inspect.currentframe().f_code.co_name} Worker log path\ + ********************** \n[Serve]: {link_dir} \n\n" + ) + + process = subprocess.Popen(command, stdout=sys.stdout, stderr=sys.stderr) + pid = os.getpid() + logger.info(f"[Serve]: Current vLLM PID: {pid} ") + + stdout, stderr = process.communicate() + logger.info(f"[Serve]: Standard Output: {stdout}") + logger.info(f"[Serve]: Standard Error: {stderr}") + + return process.returncode + + +def main(): + # Note: Custom log dir here may cause "OSError: AF_UNIX path length cannot exceed 107 bytes:" + ray.init( + log_to_driver=True, + logging_config=ray.LoggingConfig(encoding="TEXT", log_level="INFO"), + ) + link_dir = os.path.join(serve.task_config.log_dir, f"session_latest_{timestamp}") + tar_dir = ray._private.worker.global_worker.node._logs_dir + os.symlink(tar_dir, link_dir) + + result = vllm_model.remote(serve.task_config) + return_code = ray.get(result) + + logger.info(f"[Serve]: vLLM serve exited with return code: {return_code}") + + +if __name__ == "__main__": + main() diff --git a/flagscale/serve/utils.py b/flagscale/serve/utils.py new file mode 100644 index 000000000..d4061e57b --- /dev/null +++ b/flagscale/serve/utils.py @@ -0,0 +1,67 @@ +from omegaconf import OmegaConf +import argparse +import ray + + +task_config = OmegaConf.create() + + +class TaskManager: + def __init__(self): + pass + + +def init(): + ray.init(address="auto") + + +def run(): + ray.run() + + +def stop(): + ray.shutdown() + + +def remote(*args, **kwargs): + """Transform a function into a Ray task""" + _load() + def _merge_kwargs(func_name, **kwargs): + new_kwargs = kwargs.copy() + models = task_config.serve.deploy.models + + if func_name in models: + new_kwargs.update(models[func_name]) + if "model_name" not in kwargs: + new_kwargs.pop("model_name", None) + + return new_kwargs + + new_kwargs = _merge_kwargs(kwargs["name"], **kwargs) + + return ray.remote(*args, **new_kwargs) + + +def _load() -> None: + """Load configuration for cluster init""" + parser = argparse.ArgumentParser(description="Start vllm serve with Ray") + + parser.add_argument( + "--config-path", type=str, required=True, help="Path to the model" + ) + parser.add_argument("--log-dir", type=str, default="outputs", help="Path to the model") + args = parser.parse_args() + + config = OmegaConf.load(args.config_path) + + global task_config + task_config.update(config) + task_config.update({"log_dir": args.log_dir}) + + return + + +def prepare() -> None: + # Load config + # _load() + return diff --git a/flagscale/train/hetero/p2p_communication.py b/flagscale/train/hetero/p2p_communication.py index 43af7ff18..590d59c69 100644 --- a/flagscale/train/hetero/p2p_communication.py +++ b/flagscale/train/hetero/p2p_communication.py @@ -65,7 +65,8 @@ def warm_up_comm_group_hetero(config: ModelParallelConfig): for pp_group in pp_groups: group_ranks = torch.distributed.get_process_group_ranks(pp_group) - if rank == group_ranks[0]: + pipeline_rank = get_pipeline_model_parallel_rank() + if pipeline_rank == 0: _communicate( tensor_send_next=to_send_tensor, tensor_send_prev=None, @@ -75,7 +76,7 @@ def warm_up_comm_group_hetero(config: ModelParallelConfig): config=config, group=pp_group, ) - elif rank == group_ranks[-1]: + elif pipeline_rank == len(group_ranks) - 1: _communicate( tensor_send_next=None, tensor_send_prev=None, diff --git a/flagscale/train/hetero/parallel_context.py b/flagscale/train/hetero/parallel_context.py index 788bd35e6..6303be272 100644 --- a/flagscale/train/hetero/parallel_context.py +++ b/flagscale/train/hetero/parallel_context.py @@ -130,6 +130,7 @@ def __init__( ): assert torch.distributed.is_initialized() self._args = args + self._distributed_backend = args.distributed_backend self._rank = torch.distributed.get_rank() self._world_size = tensor_model_parallel_size * pipeline_model_parallel_size * context_parallel_size * data_parallel_size self._offset = offset @@ -205,7 +206,7 @@ def build_process_group( ranks = self._rank_mapper.to_physical_ranks(logical_ranks) group = torch.distributed.new_group( ranks, - backend="nccl", + backend=self._distributed_backend, timeout=self._timeout, pg_options=pg_options, ) @@ -996,7 +997,11 @@ def get_pipeline_model_parallel_rank(self, group=None): return rank if group is None: group = self.get_pipeline_model_parallel_group()[0] - return torch.distributed.get_rank(group=group) + if group not in self._process_group_to_ranks: # local pipeline group + return torch.distributed.get_rank(group=group) + else: + ranks = self._process_group_to_ranks[group] + return ranks.index(self._rank) def get_pipeline_model_parallel_split_rank(self): """Return pipeline model parallel split rank.""" @@ -1153,11 +1158,14 @@ def get_pipeline_model_parallel_next_rank(self, group=None): if group is None: group = self.get_pipeline_model_parallel_group()[0] ranks = self._process_group_to_ranks.get(group, None) - if ranks is None: + if ranks is None: # local pipeline group current_process_mesh = self._process_meshes[self._current_process_mesh_index] ranks = current_process_mesh.get_process_group_ranks(token="pp", independent_ep=False, check_initialized=True) - assert ranks is not None, "Pipeline parallel group is not initialized" - rank_in_pipeline = self.get_pipeline_model_parallel_rank(group) + assert ranks is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = torch.distributed.get_rank(group=group) + world_size = self.get_pipeline_model_parallel_world_size(group) + return ranks[(rank_in_pipeline + 1) % world_size] + rank_in_pipeline = ranks.index(self._rank) world_size = self.get_pipeline_model_parallel_world_size(group) return ranks[(rank_in_pipeline + 1) % world_size] @@ -1166,11 +1174,14 @@ def get_pipeline_model_parallel_prev_rank(self, group=None): if group is None: group = self.get_pipeline_model_parallel_group()[0] ranks = self._process_group_to_ranks.get(group, None) - if ranks is None: + if ranks is None: # local pipeline group current_process_mesh = self._process_meshes[self._current_process_mesh_index] ranks = current_process_mesh.get_process_group_ranks(token="pp", independent_ep=False, check_initialized=True) - assert ranks is not None, "Pipeline parallel group is not initialized" - rank_in_pipeline = self.get_pipeline_model_parallel_rank(group) + assert ranks is not None, "Pipeline parallel group is not initialized" + rank_in_pipeline = torch.distributed.get_rank(group=group) + world_size = self.get_pipeline_model_parallel_world_size(group) + return ranks[(rank_in_pipeline - 1) % world_size] + rank_in_pipeline = ranks.index(self._rank) world_size = self.get_pipeline_model_parallel_world_size(group) return ranks[(rank_in_pipeline - 1) % world_size] diff --git a/flagscale/train/train.py b/flagscale/train/train.py index 7e5dc158e..f100b911b 100644 --- a/flagscale/train/train.py +++ b/flagscale/train/train.py @@ -1588,6 +1588,7 @@ def get_e2e_base_metrics(): torch.distributed.get_rank() in args.profile_ranks: if args.use_pytorch_profiler: prof.stop() + print(prof.key_averages(group_by_stack_n=10 ,group_by_input_shape=True).table(sort_by="cpu_time_total")) else: torch.cuda.cudart().cudaProfilerStop() diff --git a/flagscale/train/train_aquila_sft.py b/flagscale/train/train_aquila_sft.py new file mode 100644 index 000000000..3085ce573 --- /dev/null +++ b/flagscale/train/train_aquila_sft.py @@ -0,0 +1,335 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +"""Pretrain GPT.""" + +import os +import sys +from flagscale.utils import CustomModuleFinder +sys.path.append(os.path.dirname( + os.path.dirname(os.path.abspath(__file__)))) +sys.meta_path.insert(0, CustomModuleFinder()) + +import torch +from functools import partial +from contextlib import nullcontext +import inspect + +from typing import Union +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.core import mpu +from megatron.core.enums import ModelType +from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder +from megatron.core.datasets.utils import get_blend_from_list +from megatron.core.datasets.gpt_dataset import GPTDatasetConfig +from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset +import megatron.legacy.model +from megatron.core.models.gpt import GPTModel + +from megatron.core.utils import StragglerDetector +from megatron.core.transformer.spec_utils import import_module +from megatron.training.utils import ( + get_batch_on_this_ulysses_sp_rank, + get_batch_on_this_cp_rank, + get_batch_on_this_tp_rank, +) +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml +from megatron.core.models.gpt.gpt_layer_specs import ( + get_gpt_layer_local_spec, + get_gpt_layer_with_transformer_engine_spec, +) +from flagscale.datasets.sft_dataset import SFTDatasetConfig, SFTDataset +from flagscale.train.extra_valid import extra_valid_dataset_provider +from flagscale.train.train import pretrain +from flagscale.train.global_vars import get_parallel_context + + +stimer = StragglerDetector() + +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: + """Builds the model. + + If you set the use_legacy_models to True, it will return the legacy GPT model and if not the mcore GPT model. + + Args: + pre_process (bool, optional): Set to true if you need to compute embedings. Defaults to True. + post_process (bool, optional): Set to true if you need to want to compute output logits/loss. Defaults to True. + + + Returns: + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model + """ + args = get_args() + use_te = args.transformer_impl == "transformer_engine" + + print_rank_0('building GPT model ...') + # Experimental loading arguments from yaml + config = None + if args.yaml_cfg is not None: + config = core_transformer_config_from_yaml(args, "language_model") + else: + para_ctx = get_parallel_context() + if para_ctx is not None: + config = para_ctx.get_transformer_config() + + if config is None: + config = core_transformer_config_from_args(args) + + if args.use_legacy_models: + model = megatron.legacy.model.GPTModel( + config, + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + ) + else: # using core models + if args.spec is not None: + transformer_layer_spec = import_module(args.spec) + else: + if use_te: + transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention, args.fp8) + else: + transformer_layer_spec = get_gpt_layer_local_spec(args.num_experts, args.moe_grouped_gemm, args.qk_layernorm, args.multi_latent_attention) + + build_model_context = nullcontext + build_model_context_args = {} + if args.fp8_param_gather: + try: + from transformer_engine.pytorch import fp8_model_init + + build_model_context = fp8_model_init + build_model_context_args["enabled"] = True + + # Check if fp8_model_init supports preserve_high_precision_init_val + if "preserve_high_precision_init_val" in inspect.signature(fp8_model_init).parameters: + build_model_context_args["preserve_high_precision_init_val"] = True + except: + raise RuntimeError("--fp8-param-gather requires `fp8_model_init` from TransformerEngine, but not found.") + + with build_model_context(**build_model_context_args): + model = GPTModel( + config=config, + transformer_layer_spec=transformer_layer_spec, + vocab_size=args.padded_vocab_size, + max_sequence_length=args.max_position_embeddings, + pre_process=pre_process, + post_process=post_process, + fp16_lm_cross_entropy=args.fp16_lm_cross_entropy, + parallel_output=True, + share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights, + position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + rotary_base=args.rotary_base, + rope_scaling=args.use_rope_scaling + ) + + return model + + +def get_batch(data_iterator): + """Generate a batch.""" + + # TODO: this is pretty hacky, find a better way + if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): + return None, None, None, None, None + + # get batches based on the TP rank you are on + batch = get_batch_on_this_tp_rank(data_iterator) + + # slice batch along sequence dimension for context parallelism + batch = get_batch_on_this_cp_rank(batch) + + # slice batch along sequence dimension for ulysses sequence parallelism + batch = get_batch_on_this_ulysses_sp_rank(batch) + + return batch.values() + + +def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor): + """Loss function. + + Args: + loss_mask (torch.Tensor): Used to mask out some portions of the loss + output_tensor (torch.Tensor): The tensor with the losses + + Returns: + the loss scalar for this micro-batch + the number of non-padded tokens in this microbatch + a dict containing reporting metrics on the loss and number of tokens across + the data parallel ranks + """ + + args = get_args() + + losses = output_tensor.float() + loss_mask = loss_mask.view(-1).float() + total_tokens = loss_mask.sum() + loss = torch.cat([torch.sum(torch.masked_select(losses.view(-1) , loss_mask==1)).view(1), total_tokens.view(1)]) + + if args.ulysses_sp_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_ulysses_sp_parallel_group()) + + if args.context_parallel_size > 1: + torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group()) + + # Check individual rank losses are not NaN prior to DP all-reduce. + if args.check_for_nan_in_loss_and_grad: + global_rank = torch.distributed.get_rank() + assert not loss[0].isnan(), ( + f'Rank {global_rank}: found NaN in local forward loss calculation. ' + f'Device: {torch.cuda.current_device()}, node: {os.uname()[1]}' + ) + + # Reduce loss for logging. + reporting_loss = loss.clone().detach() + torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group()) + + local_num_tokens = loss[1].clone().detach().to(torch.int) + return ( + loss[0] * args.context_parallel_size * args.ulysses_sp_parallel_size, + local_num_tokens, + {'lm loss': (reporting_loss[0], reporting_loss[1])}, + ) + +def forward_step(data_iterator, model: GPTModel): + """Forward training step. + + Args: + data_iterator : Input data iterator + model (GPTModel): The GPT Model + """ + args = get_args() + timers = get_timers() + + # Get the batch. + timers('batch-generator', log_level=2).start() + global stimer + with stimer(bdata=True): + tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + data_iterator) + timers('batch-generator').stop() + + with stimer: + output_tensor = model(tokens, position_ids, attention_mask, + labels=labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def is_dataset_built_on_rank(): + return ( + mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage() + ) and mpu.get_tensor_model_parallel_rank() == 0 + + +def core_gpt_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return GPTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + s3_cache_path = args.s3_cache_path, + ) + + +def core_sft_dataset_config_from_args(args): + tokenizer = get_tokenizer() + + return SFTDatasetConfig( + random_seed=args.seed, + sequence_length=args.seq_length, + blend=get_blend_from_list(args.data_path), + blend_per_split=[ + get_blend_from_list(args.train_data_path), + get_blend_from_list(args.valid_data_path), + get_blend_from_list(args.test_data_path) + ], + renormalize_blend_weights=args.renormalize_blend_weights, + split=args.split, + num_dataset_builder_threads=args.num_dataset_builder_threads, + path_to_cache=args.data_cache_path, + mmap_bin_files=args.mmap_bin_files, + tokenizer=tokenizer, + reset_position_ids=args.reset_position_ids, + reset_attention_mask=args.reset_attention_mask, + eod_mask_loss=args.eod_mask_loss, + create_attention_mask=args.create_attention_mask_in_dataloader, + apply_sft_dataset_separated_loss_mask_if_existed=args.apply_sft_dataset_separated_loss_mask_if_existed, + ) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build the train test and validation datasets. + + Args: + train_val_test_num_samples : A list containing the number of samples in train test and validation. + """ + args = get_args() + + config = None + para_ctx = get_parallel_context() + if para_ctx is not None: + config = para_ctx.get_dataset_config() + + if config is None: + if args.apply_sft_dataset_separated_loss_mask_if_existed: + config = core_sft_dataset_config_from_args(args) + else: + config = core_gpt_dataset_config_from_args(args) + + if args.mock_data: + dataset_type = MockGPTDataset + elif args.apply_sft_dataset_separated_loss_mask_if_existed: + dataset_type = SFTDataset + else: + dataset_type = GPTDataset + + print_rank_0("> building train, validation, and test datasets for GPT ...") + + train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder( + dataset_type, + train_val_test_num_samples, + is_dataset_built_on_rank, + config + ).build() + + print_rank_0("> finished creating GPT datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + # Temporary for transition to core datasets + train_valid_test_datasets_provider.is_distributed = True + + extra_valid_dataset_provider.is_distributed = True + + pretrain( + train_valid_test_datasets_provider, + model_provider, + ModelType.encoder_or_decoder, + forward_step, + args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}, + get_batch_fn=get_batch, + extra_valid_dataset_provider=extra_valid_dataset_provider + ) diff --git a/hardware/kunlunxin_R300p/237377e9/237377e9.patch b/hardware/kunlunxin_R300p/237377e9/237377e9.patch new file mode 100644 index 000000000..f404971ec --- /dev/null +++ b/hardware/kunlunxin_R300p/237377e9/237377e9.patch @@ -0,0 +1,117 @@ +From 115b26cc46200236cccfe072cf0049b39853b168 Mon Sep 17 00:00:00 2001 +From: brianlcy123 +Date: Sun, 24 Nov 2024 19:12:03 +0800 +Subject: [PATCH] [kunlunxin] add patch for mixtral + +--- + .../megatron/core/dist_checkpointing/strategies/base.py | 4 ++-- + .../megatron/core/distributed/param_and_grad_buffer.py | 7 ++++++- + megatron/megatron/core/transformer/moe/moe_utils.py | 6 +++--- + megatron/megatron/core/transformer/moe/token_dispatcher.py | 4 ++-- + megatron/megatron/training/checkpointing.py | 3 ++- + 5 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/megatron/megatron/core/dist_checkpointing/strategies/base.py b/megatron/megatron/core/dist_checkpointing/strategies/base.py +index cc1c83b9..125779a0 100644 +--- a/megatron/megatron/core/dist_checkpointing/strategies/base.py ++++ b/megatron/megatron/core/dist_checkpointing/strategies/base.py +@@ -6,7 +6,7 @@ from abc import ABC, abstractmethod + from collections import defaultdict + from enum import Enum + from pathlib import Path +-from typing import Any, DefaultDict ++from typing import Any, DefaultDict, Dict, Tuple + + from ..mapping import CheckpointingException, ShardedStateDict, StateDict + from .async_utils import AsyncCallsQueue, AsyncRequest +@@ -20,7 +20,7 @@ class StrategyAction(Enum): + + + _import_trigger = None +-default_strategies: DefaultDict[str, dict[tuple, Any]] = defaultdict(dict) ++default_strategies: DefaultDict[str, Dict[Tuple, Any]] = defaultdict(dict) + + async_calls = AsyncCallsQueue() + +diff --git a/megatron/megatron/core/distributed/param_and_grad_buffer.py b/megatron/megatron/core/distributed/param_and_grad_buffer.py +index 77ecd7be..c2761c6e 100644 +--- a/megatron/megatron/core/distributed/param_and_grad_buffer.py ++++ b/megatron/megatron/core/distributed/param_and_grad_buffer.py +@@ -248,6 +248,11 @@ class ParamAndGradBuffer: + def _pad(number_to_be_padded: int, divisor: int) -> int: + return int(math.ceil(number_to_be_padded / divisor) * divisor) + ++ import math ++ ++ def _lcm(a, b): ++ return abs(a * b) // math.gcd(a, b) ++ + def _pad_end_of_bucket_if_needed(bucket_end_index: int) -> int: + """ + Pads end index of bucket if using distributed optimizer (to ensure uniform sharding). +@@ -257,7 +262,7 @@ class ParamAndGradBuffer: + # This also helps cuBLAS pick more efficient algorithms for GEMMs. + # We now ensure that all buckets start at a memory address that is 256-byte + # aligned (128 values since params and grads use >= 16-bit precision). +- return _pad(bucket_end_index, math.lcm(self.data_parallel_world_size, 128)) ++ return _pad(bucket_end_index, _lcm(self.data_parallel_world_size, 128)) + return bucket_end_index + + def _pad_start_of_param_if_needed(param_start_index: int) -> int: +diff --git a/megatron/megatron/core/transformer/moe/moe_utils.py b/megatron/megatron/core/transformer/moe/moe_utils.py +index ee4bb690..a3c1fd69 100644 +--- a/megatron/megatron/core/transformer/moe/moe_utils.py ++++ b/megatron/megatron/core/transformer/moe/moe_utils.py +@@ -366,8 +366,8 @@ def topk_softmax_with_capacity( + + if capacity_factor is None: + # TopK without capacity +- tokens_per_expert = torch.bincount(top_indices.view(-1), minlength=num_experts) +- return probs, top_indices, tokens_per_expert ++ tokens_per_expert = torch.bincount(top_indices.cpu().view(-1), minlength=num_experts) ++ return probs, top_indices, tokens_per_expert.cuda() + else: + # TopK with capacity + expert_capacity = get_capacity( +@@ -380,7 +380,7 @@ def topk_softmax_with_capacity( + # Maskout exceeded tokens + if drop_policy == "probs": + capacity_probs, capacity_indices = torch.topk( +- topk_masked_gates, k=expert_capacity, dim=0, sorted=False ++ topk_masked_gates, k=expert_capacity, dim=0, sorted=True #mod by zh + ) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + elif drop_policy == "position": +diff --git a/megatron/megatron/core/transformer/moe/token_dispatcher.py b/megatron/megatron/core/transformer/moe/token_dispatcher.py +index 84f3d450..6a0b4a28 100644 +--- a/megatron/megatron/core/transformer/moe/token_dispatcher.py ++++ b/megatron/megatron/core/transformer/moe/token_dispatcher.py +@@ -179,10 +179,10 @@ class MoEAllGatherTokenDispatcher(MoETokenDispatcher): + + with torch.no_grad(): + tokens_per_expert = torch.bincount( +- local_indices.view(-1), minlength=self.config.num_moe_experts ++ local_indices.cpu().view(-1), minlength=self.config.num_moe_experts + ) + if self.num_local_experts < self.config.num_moe_experts: +- tokens_per_expert = tokens_per_expert[ ++ tokens_per_expert = tokens_per_expert.cuda()[ + self.local_expert_indices[0] : self.local_expert_indices[-1] + 1 + ] + tokens_per_expert = tokens_per_expert.cpu().to(torch.long) +diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py +index 6e58b317..6c650c4e 100644 +--- a/megatron/megatron/training/checkpointing.py ++++ b/megatron/megatron/training/checkpointing.py +@@ -1057,7 +1057,8 @@ def load_checkpoint(model, optimizer, opt_param_scheduler, load_arg='load', stri + restore_modelopt_state(model, state_dict) + + # Model. +- strict = False if args.retro_add_retriever else strict ++ # strict = False if args.retro_add_retriever else strict ++ strict = False + if len(model) == 1: + model[0].load_state_dict(state_dict['model'], strict=strict) + else: +-- +2.25.1 diff --git a/hardware/kunlunxin_R300p/a44556c0/a44556c0.patch b/hardware/kunlunxin_R300p/a44556c0/a44556c0.patch new file mode 100644 index 000000000..77c365928 --- /dev/null +++ b/hardware/kunlunxin_R300p/a44556c0/a44556c0.patch @@ -0,0 +1,107 @@ +From 9289f099424ba4d0dec83fb5715d4d2561f4c4d8 Mon Sep 17 00:00:00 2001 +From: brianlcy123 +Date: Thu, 21 Nov 2024 15:46:54 +0800 +Subject: [PATCH] [kunlunxin] add updated llama3 70b patch + +--- + examples/llama/conf/config.yaml | 40 ++++++++++++++++++++- + megatron/megatron/training/arguments.py | 18 +++++----- + megatron/megatron/training/checkpointing.py | 5 +-- + 3 files changed, 51 insertions(+), 12 deletions(-) + +diff --git a/examples/llama/conf/config.yaml b/examples/llama/conf/config.yaml +index 592c45bf..27fb83ae 100644 +--- a/examples/llama/conf/config.yaml ++++ b/examples/llama/conf/config.yaml +@@ -46,6 +46,44 @@ experiment: + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_APPLY_QK_LAYER_SCALING: 0 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 ++ ALLGATHER_ASYNC: false ++ ALLREDUCE_ASYNC: false ++ ALLREDUCE_FUSION: 0 ++ BKCL_CCIX_BUFFER_GM: 1 ++ BKCL_CCIX_RING: 1 ++ BKCL_ENABLE_XDR: 1 ++ BKCL_FLAT_RING: 1 ++ BKCL_KL3_TURBO_MODE: 1 ++ BKCL_RDMA_FORCE_TREE: 1 ++ BKCL_RDMA_NICS: ens11np0,ens11np0,ens13np0,ens13np0,ens15np0,ens15np0,ens17np0,ens17np0 ++ BKCL_RDMA_PROXY_DISABLE: 1 ++ BKCL_RING_BUFFER_GM: 1 ++ BKCL_TIMEOUT: 360000 ++ BKCL_TRANS_UNSUPPORTED_DATATYPE: 8 ++ BKCL_TREE_THRESHOLD: 1 ++ BKCL_XLINK_C2C: 1 ++ BKCL_XLINK_D2D: 0 ++ BKCL_XLINK_ETH: 0 ++ CUDART_DUMMY_REGISTER: 1 ++ FAST_SWIGLU_ENABLE: 1 ++ USE_FAST_BF16_FC: true ++ USE_L3: 1 ++ XDNN_USE_FAST_SWISH: true ++ XPU_ZEBU_MODE: 1 ++ XPU_FORCE_USERMODE_LAUNCH: 1 ++ DIST_MULTI_STREAM: true ++ XMLIR_DIST_SINGLETON_STREAM: true ++ XMLIR_FA_GEMM_TYPE: float ++ XBLAS_FC_HBM_VERSION: 40 ++ XMLIR_PARALLEL_SAVE_MEMORY: false ++ XMLIR_DISABLE_CUDA_ALLOCATOR: true ++ XMLIR_XDNN_PYTORCH_CHECK_ENABLE_FALLBACK_BOOL: 0 ++ XMLIR_ENABLE_FALLBACK_TO_CPU_BOOL: False ++ XMLIR_DUMP_FALLBACK_OP_LIST_BOOL: true ++ XMLIR_BATCH_PARALLEL: true ++ DIST_MULTI_STREAM: true ++ CUDA_DEVICE_MAX_CONNECTIONS: 8 ++ XMLIR_DIST_ASYNC_ISEND_IRECV: 1 + action: run + + hydra: +diff --git a/megatron/megatron/training/arguments.py b/megatron/megatron/training/arguments.py +index e20f178b..7e79da2a 100644 +--- a/megatron/megatron/training/arguments.py ++++ b/megatron/megatron/training/arguments.py +@@ -652,15 +652,15 @@ def validate_args(args, defaults={}): + if args.sequence_parallel: + args.async_tensor_model_parallel_allreduce = False + +- if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": +- if args.sequence_parallel: +- raise RuntimeError( +- "Using sequence parallelism requires setting the environment variable " +- "CUDA_DEVICE_MAX_CONNECTIONS to 1") +- if args.async_tensor_model_parallel_allreduce: +- raise RuntimeError( +- "Using async gradient all reduce requires setting the environment " +- "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") ++ # if os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1": ++ # if args.sequence_parallel: ++ # raise RuntimeError( ++ # "Using sequence parallelism requires setting the environment variable " ++ # "CUDA_DEVICE_MAX_CONNECTIONS to 1") ++ # if args.async_tensor_model_parallel_allreduce: ++ # raise RuntimeError( ++ # "Using async gradient all reduce requires setting the environment " ++ # "variable CUDA_DEVICE_MAX_CONNECTIONS to 1") + + # Disable bias gelu fusion if we are disabling bias altogether + if not args.add_bias_linear: +diff --git a/megatron/megatron/training/checkpointing.py b/megatron/megatron/training/checkpointing.py +index 01425f36..80fa0254 100644 +--- a/megatron/megatron/training/checkpointing.py ++++ b/megatron/megatron/training/checkpointing.py +@@ -530,8 +530,9 @@ def save_dataloader_state(train_iterator, iteration, dataloader_save_path): + + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + +- if mpu.get_data_parallel_rank() == 0: +- ensure_directory_exists(data_state_save_path) ++ # if mpu.get_data_parallel_rank() == 0: ++ # ensure_directory_exists(data_state_save_path) ++ ensure_directory_exists(data_state_save_path) + + torch.distributed.barrier(group=mpu.get_data_parallel_group()) + +-- +2.25.1 diff --git a/megatron/megatron/core/dist_checkpointing/serialization.py b/megatron/megatron/core/dist_checkpointing/serialization.py index 5493c96bb..cf6f51e44 100644 --- a/megatron/megatron/core/dist_checkpointing/serialization.py +++ b/megatron/megatron/core/dist_checkpointing/serialization.py @@ -8,6 +8,7 @@ loading the sharded tensors. """ +import os import logging from pathlib import Path from typing import Dict, Optional, Set, Tuple, Union @@ -339,10 +340,17 @@ def save( f'Checkpoint destination directory does not exist: {checkpoint_dir}' ) - if next(checkpoint_dir.iterdir(), None) is not None: - raise CheckpointingException( - f'Checkpoint destination directory ({checkpoint_dir}) is not empty' - ) + # Skip this if the env var exists, otherwise default to False + single_file_per_tensor_ckpt = os.getenv('FS_SFPT_CKPT_SAVE', 'False').lower() in ( + 'true', + '1', + 't', + ) + if not single_file_per_tensor_ckpt: + if next(checkpoint_dir.iterdir(), None) is not None: + raise CheckpointingException( + f'Checkpoint destination directory ({checkpoint_dir}) is not empty' + ) if common_strategy is not None: raise NotImplementedError('The only supported common strategy is torch') diff --git a/megatron/megatron/core/dist_checkpointing/strategies/filesystem_async.py b/megatron/megatron/core/dist_checkpointing/strategies/filesystem_async.py index 9d0be4d6e..7d942e366 100644 --- a/megatron/megatron/core/dist_checkpointing/strategies/filesystem_async.py +++ b/megatron/megatron/core/dist_checkpointing/strategies/filesystem_async.py @@ -5,19 +5,21 @@ import logging import os import queue +import pickle from contextlib import contextmanager from itertools import chain from pathlib import Path from time import time -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union, cast import psutil import torch from torch import multiprocessing as mp from torch.distributed.checkpoint import FileSystemWriter -from torch.distributed.checkpoint.filesystem import DEFAULT_SUFFIX, _StoragePrefix, _write_item +from torch.distributed.checkpoint.filesystem import DEFAULT_SUFFIX, _StoragePrefix, _write_item, _metadata_fn from torch.distributed.checkpoint.planner import SavePlan, SavePlanner, WriteItem, WriteItemType from torch.distributed.checkpoint.storage import WriteResult +from torch.distributed.checkpoint.metadata import Metadata from torch.futures import Future logger = logging.getLogger(__name__) @@ -26,6 +28,40 @@ _results_queue = None +_GLOBAL_PREVIOUS_METADATA = None + +_GLOBAL_PREVIOUS_COUNT = 0 + + +def get_previous_metadata(): + """ + Get the metadata from the previous save. + """ + return _GLOBAL_PREVIOUS_METADATA + + +def set_previous_metadata(metadata): + """ + Set the metadata from the previous save. + """ + global _GLOBAL_PREVIOUS_METADATA + _GLOBAL_PREVIOUS_METADATA = metadata + + +def get_previous_count(): + """ + Get the count from the previous save. + """ + return _GLOBAL_PREVIOUS_COUNT + + +def set_previous_count(count): + """ + Set the count from the previous save. + """ + global _GLOBAL_PREVIOUS_COUNT + _GLOBAL_PREVIOUS_COUNT = count + def _get_write_results_queue(): global _results_queue @@ -80,6 +116,13 @@ def __init__(self, *args, **kwargs): self.write_buckets: Optional[List[WriteBucket]] = None self.results_queue: Optional[mp.Queue] = None + # Get the value from the environment variable if it exists, otherwise default to False + self.single_file_per_tensor_ckpt = os.getenv('FS_SFPT_CKPT_SAVE', 'False').lower() in ( + 'true', + '1', + 't', + ) + def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> None: """ First stage of async saving. Copy data to CPU and plan the local saving. @@ -99,12 +142,17 @@ def prepare_write_data(self, plan: SavePlan, planner: SavePlanner) -> None: start = time() # move tensors from GPU to CPU before starting async writing # We do D2H synchronously for now - file_count = 0 + if not self.single_file_per_tensor_ckpt: + file_count = 0 + else: + file_count = get_previous_count() def gen_file(): nonlocal file_count file_name = f"{storage_plan.prefix}{file_count}{DEFAULT_SUFFIX}" file_count += 1 + if self.single_file_per_tensor_ckpt: + set_previous_count(file_count) return file_name # Prepare bytes / tensor data in each bucket, which will be assigned to each writer process @@ -314,6 +362,48 @@ def retrieve_write_results(self) -> List[WriteResult]: ) return list(chain.from_iterable(write_results.values())) + def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None: + # Modify based on the original implementation from torch.distributed.checkpoint.filesystem.FileSystemWriter + # https://github.com/pytorch/pytorch/blob/625c24a7f98a645b6f8758a01d7163a842582ce0/torch/distributed/checkpoint/filesystem.py#L574 + + if not self.single_file_per_tensor_ckpt: + storage_md = {} + else: + if get_previous_count() == 1: + storage_md = {} + else: + # Get the metadata from the previous save + prev_metadata = get_previous_metadata() + prev_metadata.state_dict_metadata.update(metadata.state_dict_metadata) + metadata = prev_metadata + storage_md = metadata.storage_data + + for wr_list in results: + storage_md.update({wr.index: wr.storage_data for wr in wr_list}) + metadata.storage_data = storage_md + + if not self.single_file_per_tensor_ckpt or get_previous_count() == 1: + metadata.storage_meta = self.storage_meta() + + tmp_path = cast(Path, self.fs.concat_path(self.path, f"{_metadata_fn}.tmp")) + with self.fs.create_stream(tmp_path, "wb") as metadata_file: + pickle.dump(metadata, metadata_file) + if self.sync_files: + try: + os.fsync(metadata_file.fileno()) + except AttributeError: + os.sync() + + # delete in-case other checkpoints were present. + if self.fs.exists(self.metadata_path): + self.fs.rm_file(self.metadata_path) + + self.fs.rename(tmp_path, self.metadata_path) + + # Store the metadata for the next save + if self.single_file_per_tensor_ckpt: + set_previous_metadata(metadata) + def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[List[WriteItem]]: """ @@ -349,7 +439,6 @@ def _split_by_size_and_type(bins: int, items: List[WriteItem]) -> List[List[Writ idx = min(enumerate(bucket_sizes), key=lambda x: x[1])[0] buckets[idx].append(item) bucket_sizes[idx] += _item_size(item) - return buckets diff --git a/megatron/megatron/core/dist_checkpointing/strategies/torch.py b/megatron/megatron/core/dist_checkpointing/strategies/torch.py index 077d94eb7..8a41ad669 100644 --- a/megatron/megatron/core/dist_checkpointing/strategies/torch.py +++ b/megatron/megatron/core/dist_checkpointing/strategies/torch.py @@ -2,6 +2,7 @@ """ Strategies using PyTorch distributed.checkpoint as an underlying format. """ import io +import os from collections import ChainMap, defaultdict from dataclasses import dataclass from itertools import product @@ -404,7 +405,6 @@ def _replace_sharded_keys_with_state_dict_keys( assert len(tensors) == len(rename_mapping[k]) for ten, recovered_k in zip(tensors, rename_mapping[k]): recovered_sd[recovered_k] = ten - return unflatten_state_dict(recovered_sd, flat_mapping) @@ -734,6 +734,13 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St Returns: loaded state dict """ + # Get the value from the environment variable if it exists, otherwise default to True + single_file_per_tensor_ckpt = os.getenv('FS_SFPT_CKPT_LOAD', 'False').lower() in ( + 'true', + '1', + 't', + ) + # Apply N-D tensors resharding sharded_state_dict, formulation_restore_data = apply_nd_flattened_tensors_reformulation( sharded_state_dict, get_reformulation_metadata(sharded_state_dict, checkpoint_dir) @@ -752,13 +759,24 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St ) pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True) # Load PyT Distributed format - checkpoint.load_state_dict( - pyt_state_dict, - FileSystemReader(checkpoint_dir), - planner=MCoreLoadPlanner( - shapes_validation_sharded_tensors=flexible_shape_sharded_tensors - ), - ) + if not single_file_per_tensor_ckpt: + checkpoint.load_state_dict( + pyt_state_dict, + FileSystemReader(checkpoint_dir), + planner=MCoreLoadPlanner( + shapes_validation_sharded_tensors=flexible_shape_sharded_tensors + ), + ) + else: + checkpoint.load_state_dict( + pyt_state_dict, + FileSystemReader(checkpoint_dir), + planner=MCoreLoadPlanner( + shapes_validation_sharded_tensors=flexible_shape_sharded_tensors, + allow_partial_load=True, + ), + ) + pyt_state_dict = cast( Dict[str, Union[TorchShardedTensor, List[io.BytesIO]]], pyt_state_dict ) @@ -767,6 +785,13 @@ def load(self, sharded_state_dict: ShardedStateDict, checkpoint_dir: Path) -> St k: v if not isinstance(v, TorchShardedTensor) else _unwrap_pyt_sharded_tensor(v) for k, v in pyt_state_dict.items() } + + if single_file_per_tensor_ckpt: + mcore_state_dict = { + k: [None] if (not isinstance(v, list) and "_extra_state" in k) else v + for k, v in mcore_state_dict.items() + } + mcore_state_dict = _replace_sharded_keys_with_state_dict_keys( mcore_state_dict, flat_mapping, rename_mapping ) diff --git a/megatron/megatron/core/transformer/transformer_block.py b/megatron/megatron/core/transformer/transformer_block.py index aabd8558b..ca978e6f1 100755 --- a/megatron/megatron/core/transformer/transformer_block.py +++ b/megatron/megatron/core/transformer/transformer_block.py @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - +import os from contextlib import nullcontext from dataclasses import dataclass from typing import List, Optional, Union @@ -316,66 +316,6 @@ def checkpoint_handler(forward_func): rotary_pos_emb, ) - if self.config.recompute_method_per_stage_micro_batch != None: - if self.config.virtual_pipeline_model_parallel_size != None: - if ( - self.config.recompute_method_per_stage_micro_batch[ - parallel_state.get_virtual_pipeline_model_parallel_rank() - * self.config.pipeline_model_parallel_size - + parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - == 0 - ): - self.config.recompute_method = 'uniform' - elif ( - self.config.recompute_method_per_stage_micor_batch[ - parallel_state.get_virtual_pipeline_model_parallel_rank() - * self.config.pipeline_model_parallel_size - + parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - == 1 - ): - self.config.recompute_method = 'block' - else: - raise ValueError("the item of recompute_method_per_stage_micor_batch must be '0' or '1' ") - else: - if ( - self.config.recompute_method_per_stage_micro_batch[ - parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - == 0 - ): - self.config.recompute_method = 'uniform' - elif ( - self.config.recompute_method_per_stage_micro_batch[ - parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - == 1 - ): - self.config.recompute_method = 'block' - - if self.config.recompute_num_layers_per_stage_micro_batch != None: - if self.config.virtual_pipeline_model_parallel_size != None: - self.config.recompute_num_layers = self.config.recompute_num_layers_per_stage_micro_batch[ - parallel_state.get_virtual_pipeline_model_parallel_rank() - * self.config.pipeline_model_parallel_size - + parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - else: - self.config.recompute_num_layers = self.config.recompute_num_layers_per_stage_micro_batch[ - parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - - if ( - self.config.recompute_granularity_per_stage_micro_batch != None - and self.config.recompute_granularity_per_stage_micro_batch[ - parallel_state.get_pipeline_model_parallel_rank() - ][self.current_microbatch] - == 0 - ): - self.recompute_granularity = None - self.recompute_method = None - if self.config.recompute_method == 'uniform': # Uniformly divide the total number of Transformer layers and checkpoint # the input activation of each divided chunk. @@ -538,6 +478,71 @@ def forward( else: fp8_context = nullcontext() + if self.config.recompute_method_per_stage_micro_batch != None: + if self.config.virtual_pipeline_model_parallel_size != None: + if ( + self.config.recompute_method_per_stage_micro_batch[ + parallel_state.get_virtual_pipeline_model_parallel_rank() + * self.config.pipeline_model_parallel_size + + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + == 0 + ): + self.config.recompute_method = 'uniform' + elif ( + self.config.recompute_method_per_stage_micor_batch[ + parallel_state.get_virtual_pipeline_model_parallel_rank() + * self.config.pipeline_model_parallel_size + + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + == 1 + ): + self.config.recompute_method = 'block' + else: + raise ValueError("the item of recompute_method_per_stage_micor_batch must be '0' or '1' ") + else: + if ( + self.config.recompute_method_per_stage_micro_batch[ + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + == 0 + ): + self.config.recompute_method = 'uniform' + elif ( + self.config.recompute_method_per_stage_micro_batch[ + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + == 1 + ): + self.config.recompute_method = 'block' + else: + raise ValueError("the item of recompute_method_per_stage_micor_batch must be '0' or '1' ") + + if self.config.recompute_num_layers_per_stage_micro_batch != None: + if self.config.virtual_pipeline_model_parallel_size != None: + self.config.recompute_num_layers = self.config.recompute_num_layers_per_stage_micro_batch[ + parallel_state.get_virtual_pipeline_model_parallel_rank() + * self.config.pipeline_model_parallel_size + + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + else: + self.config.recompute_num_layers = self.config.recompute_num_layers_per_stage_micro_batch[ + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + if self.config.recompute_num_layers == 0: + self.config.recompute_method = None + self.config.recompute_granularity = None + + if ( + self.config.recompute_granularity_per_stage_micro_batch != None + and self.config.recompute_granularity_per_stage_micro_batch[ + parallel_state.get_pipeline_model_parallel_rank() + ][self.current_microbatch] + == 0 + ): + self.config.recompute_granularity = None + self.config.recompute_method = None + with rng_context and fp8_context: # Forward pass. if self.config.recompute_granularity == 'full' and self.training: @@ -625,6 +630,16 @@ def sharded_state_dict( non_homogeneous_layers = metadata is not None and metadata.get( 'non_homogeneous_layers', False ) + + # TODO: @aoyulong - This is a temporary solution to support single-file-per-tensor ckpt + non_homogeneous_layers_env = os.getenv('FS_NON_HOMOGENEOUS_LAYERS', 'False').lower() in ( + 'true', + '1', + 't', + ) + if non_homogeneous_layers_env: + non_homogeneous_layers = True + sharded_state_dict = {} layer_prefix = f'{prefix}layers.' diff --git a/run.py b/run.py index fba8ebb07..472f072ac 100644 --- a/run.py +++ b/run.py @@ -3,6 +3,7 @@ from flagscale.runner.runner_train import SSHTrainRunner, CloudTrainRunner from flagscale.runner.runner_inference import SSHInferenceRunner +from flagscale.runner.runner_serve import SSHServeRunner @hydra.main(version_base=None, config_name="config") @@ -47,6 +48,16 @@ def main(config: DictConfig) -> None: runner.stop() else: raise ValueError(f"Unknown action {config.action}") + elif task_type == "serve": + runner = SSHServeRunner(config) + if config.action == "run": + runner.run() + elif config.action == "test": + runner.run(with_test=True) + elif config.action == "stop": + runner.stop() + else: + raise ValueError(f"Unknown action {config.action}") else: raise ValueError(f"Unknown task type {task_type}") diff --git a/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml b/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml index b73534e02..b2967ee26 100644 --- a/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml +++ b/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2dp1pp1_tp2dp2pp1_tp1dp2pp1.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_aquila.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml b/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml index 4ac191c2f..1f7a16d7e 100644 --- a/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml +++ b/tests/functional_tests/test_cases/hetero_train/aquila/conf/tp2pp1_tp4pp1_tp2pp1.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_aquila.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/functional_tests/test_cases/train/aquila/conf/tp2_pp2.yaml b/tests/functional_tests/test_cases/train/aquila/conf/tp2_pp2.yaml index 78b108a27..03930cdf8 100644 --- a/tests/functional_tests/test_cases/train/aquila/conf/tp2_pp2.yaml +++ b/tests/functional_tests/test_cases/train/aquila/conf/tp2_pp2.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_aquila.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/functional_tests/test_cases/train/aquila/conf/tp4_pp2.yaml b/tests/functional_tests/test_cases/train/aquila/conf/tp4_pp2.yaml index 81f4e1e60..b8354557b 100644 --- a/tests/functional_tests/test_cases/train/aquila/conf/tp4_pp2.yaml +++ b/tests/functional_tests/test_cases/train/aquila/conf/tp4_pp2.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_aquila.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/functional_tests/test_cases/train/mixtral/conf/tp2_pp1_ep2.yaml b/tests/functional_tests/test_cases/train/mixtral/conf/tp2_pp1_ep2.yaml index c48c4ef16..dce9ff56a 100644 --- a/tests/functional_tests/test_cases/train/mixtral/conf/tp2_pp1_ep2.yaml +++ b/tests/functional_tests/test_cases/train/mixtral/conf/tp2_pp1_ep2.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_mixtral.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/functional_tests/test_cases/train/mixtral/conf/tp4_pp1_ep2.yaml b/tests/functional_tests/test_cases/train/mixtral/conf/tp4_pp1_ep2.yaml index 95387cb5d..ef744d84f 100644 --- a/tests/functional_tests/test_cases/train/mixtral/conf/tp4_pp1_ep2.yaml +++ b/tests/functional_tests/test_cases/train/mixtral/conf/tp4_pp1_ep2.yaml @@ -10,9 +10,9 @@ experiment: backend: megatron entrypoint: flagscale/train/train_mixtral.py runner: - backend: torchrun + backend: torchrun + ssh_port: null shell_cmds: null - ssh_port: null envs: HYDRA_FULL_ERROR: 1 CUDA_VISIBLE_DEVICES: "0,1,2,3,4,5,6,7" diff --git a/tests/scripts/format_tests/test_format.sh b/tests/scripts/format_tests/test_format.sh index 5ad2bbc3e..f01c67d25 100755 --- a/tests/scripts/format_tests/test_format.sh +++ b/tests/scripts/format_tests/test_format.sh @@ -13,7 +13,9 @@ flagscale/logger.py \ flagscale/patches_utils.py \ flagscale/datasets/sft_dataset.py \ flagscale/inference/inference_*.py \ -flagscale/inference/arguments.py" +flagscale/inference/arguments.py \ +tools/checkpoint/sfpt_ckpt/*.py \ +" # Function to run a command and continue even if it fails run_command() { diff --git a/tools/checkpoint/convert.py b/tools/checkpoint/convert.py index e5a0f2b72..dde8204b2 100644 --- a/tools/checkpoint/convert.py +++ b/tools/checkpoint/convert.py @@ -32,7 +32,7 @@ def main(): allow_abbrev=False, conflict_handler='resolve') # convert args parser.add_argument('--model-type', type=str, default=[], nargs="+", required=True, - choices=['mistral', 'mixtral', 'llama'], + choices=['aquila3_dense', 'aquila3_moe', 'mistral', 'mixtral', 'llama'], help='Type of the model.') parser.add_argument('--loader', type=str, default='mcore', choices=['mcore', 'transformers'], help='Module name to load checkpoint, should be on python path') @@ -44,6 +44,7 @@ def main(): help='Directory to save model checkpoint to') parser.add_argument('--max-queue-size', type=int, default=50, help='Maximum number of tensors in the queue') + extend_cases = [['mistral', 'mixtral'], ['aquila3_dense', 'aquila3_moe']] known_args, _ = parser.parse_known_args() loader = load_plugin('loader', known_args.loader) @@ -85,4 +86,4 @@ def main(): if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/tools/checkpoint/run.sh b/tools/checkpoint/run.sh index b433f7713..cfe7a0696 100755 --- a/tools/checkpoint/run.sh +++ b/tools/checkpoint/run.sh @@ -54,3 +54,30 @@ python convert.py \ --target-params-dtype bf16 \ --true-vocab-size 128256 \ --megatron-path + +python convert.py \ + --model-type aquila3_dense \ + --loader transformers \ + --saver mcore \ + --load-dir $loaddir \ + --save-dir $outputs \ + --target-tensor-parallel-size 1 \ + --target-pipeline-parallel-size 1 \ + --target-expert-parallel-size 1 \ + --target-params-dtype bf16 \ + --true-vocab-size 151665 \ + --megatron-path + +# megatron to huggingface +python convert.py \ + --model-type llama \ + --loader mcore \ + --saver transformers \ + --load-dir $loaddir \ + --save-dir $outputs \ + --target-tensor-parallel-size 1 \ + --target-pipeline-parallel-size 1 \ + --target-expert-parallel-size 1 \ + --target-params-dtype bf16 \ + --true-vocab-size 128256 \ + --megatron-path diff --git a/tools/checkpoint/sfpt_ckpt/README.md b/tools/checkpoint/sfpt_ckpt/README.md new file mode 100644 index 000000000..88530118b --- /dev/null +++ b/tools/checkpoint/sfpt_ckpt/README.md @@ -0,0 +1,73 @@ +# README + +This directory contains scripts for converting checkpoints between DCP (Distributed Checkpoint) and SFPT (Single File Per Tensor) formats. + +## Scripts + +- `dcp_to_sfpt.py` - Converts a DCP checkpoint to SFPT format. +- `sfpt_to_dcp.py` - Converts an SFPT checkpoint to DCP format. + +## Usage + +**Convert DCP to SFPT:** +1. Get the DCP checkpoint non-homogeneous layers from the training run. + * Add the environment variable to experiment-level configuration file: + ```yaml + envs: + FS_NON_HOMOGENEOUS_LAYERS: True + ``` + + * Add the following to the task-level configuration file: + ```yaml + use_dist_ckpt: True + ckpt_format: torch_dist + ckpt_fully_parallel_save: True + ckpt_fully_parallel_load: True + ``` + +2. Set the `PYTHONPATH` environment variable: + + ```bash + # FlagScale_ROOT is the root directory of the FlagScale repository + export PYTHONPATH=$FlagScale_ROOT/megatron:$FlagScale_ROOT + ``` + +3. Run the conversion script: + ```bash + torchrun --nnodes 1 --node_rank 0 --nproc_per_node 1 \ + --master_addr localhost --master_port 1234 \ + dcp_to_sfpt.py --input_dir /path/to/dcp_checkpoint --output_dir /path/to/output_sfpt_checkpoint + ``` + +**Convert SFPT to DCP:** + +1. Set the `PYTHONPATH` environment variable: + ```bash + # FlagScale_ROOT is the root directory of the FlagScale repository + export PYTHONPATH=$FlagScale_ROOT/megatron:$FlagScale_ROOT + ``` + +2. Run the conversion script: + ```bash + FS_SFPT_CKPT_SAVE=1 torchrun --nnodes 1 --node_rank 0 --nproc_per_node 1 \ + --master_addr localhost --master_port 1234 \ + sfpt_to_dcp.py --input_dir /path/to/sfpt_checkpoint --output_dir /path/to/output_dcp_checkpoint + ``` + +3. Use the DCP checkpoint for further fine-tuning. + * Add the environment variables to experiment-level configuration file: + ```yaml + envs: + FS_NON_HOMOGENEOUS_LAYERS: True + FS_SFPT_CKPT_LOAD: True + ``` + + * Add the following to the task-level configuration file: + ```yaml + use_dist_ckpt: True + ckpt_format: torch_dist + ckpt_fully_parallel_save: True + ckpt_fully_parallel_load: True + finetune: True + load: /path/to/output_dcp_checkpoint + ``` \ No newline at end of file diff --git a/tools/checkpoint/sfpt_ckpt/dcp_to_sfpt.py b/tools/checkpoint/sfpt_ckpt/dcp_to_sfpt.py new file mode 100644 index 000000000..c29831994 --- /dev/null +++ b/tools/checkpoint/sfpt_ckpt/dcp_to_sfpt.py @@ -0,0 +1,111 @@ +import argparse +import os +from datetime import timedelta + +import torch +from torch.distributed.checkpoint import ( + BytesStorageMetadata, + FileSystemReader, + Metadata, + TensorStorageMetadata, +) +from torch.distributed.checkpoint.metadata import Metadata + +from megatron.core.dist_checkpointing import ShardedTensor, load +from megatron.core.dist_checkpointing.mapping import ShardedObject + + +def build_tensor_shared_state_dict(key, metadata: Metadata = None): + # Based on load_tensors_metadata from FlagScale/megatron/megatron/core/dist_checkpointing/strategies/torch.py + mcore_data = getattr(metadata, "mcore_data", {}) + sharded_state_dict = {} + tp = metadata.state_dict_metadata[key] + + nd_orig_global_shape = mcore_data.get(key, {}).get( + "nd_reformulated_orig_global_shape" + ) + if nd_orig_global_shape is None: + # Regular tensor + sharded_state_dict[key] = ShardedTensor.from_rank_offsets( + key, torch.empty(tp.size, **tp.properties.__dict__, device="cpu") + ) + else: + # N-D flattened tensor + unflat_ten = torch.empty( + nd_orig_global_shape, **tp.properties.__dict__, device="cpu" + ) + flat_ten = unflat_ten.flatten() + sharded_state_dict[key] = ShardedTensor.from_rank_offsets_flat( + key, + flat_ten, + unflat_ten.shape, + flattened_range=slice(0, unflat_ten.numel()), # whole slice + ) + + return sharded_state_dict + + +def build_sharded_state_dict(metadata_key, metadata): + # Based on load_sharded_metadata from FlagScale/megatron/megatron/core/dist_checkpointing/strategies/torch.py + storage_metadata = metadata.state_dict_metadata[metadata_key] + if isinstance(storage_metadata, BytesStorageMetadata): + sharded_state_dict = {} + sh_obj = ShardedObject.empty_from_unique_key(metadata_key) + sharded_state_dict[sh_obj.unique_key] = sh_obj + return sharded_state_dict + elif isinstance(storage_metadata, TensorStorageMetadata): + sharded_state_dict = build_tensor_shared_state_dict(metadata_key, metadata) + return sharded_state_dict + + +def convert_dist_ckpt_to_sfpt_ckpt(input_dir, output_dir): + # Distributed checkpoint loading requires the distributed environment to be initialized + rank = int(os.getenv("RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + print(f"Rank: {rank}, World size: {world_size}") + torch.distributed.init_process_group( + backend="gloo", world_size=world_size, rank=rank + ) + + fs_reader = FileSystemReader(input_dir) + metadata = fs_reader.read_metadata() + state_dict_metadata = metadata.state_dict_metadata + for metadata_key, storage_metadata in state_dict_metadata.items(): + # Skip optimizer state_dict + if "optimizer" not in metadata_key and isinstance( + storage_metadata, TensorStorageMetadata + ): + print(f"Processing {metadata_key}") + sharded_state_dict = build_sharded_state_dict(metadata_key, metadata) + loaded_state_dict = load(sharded_state_dict, input_dir) + sharded_tensor = loaded_state_dict[metadata_key] + unshared_tensor = sharded_tensor.data + path = os.path.join(output_dir, metadata_key) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + with open(f"{path}.pt", "wb") as f: + torch.save({metadata_key: unshared_tensor}, f) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert distributed checkpoint to single-file-per-tensor checkpoint." + ) + parser.add_argument( + "--input_dir", + type=str, + required=True, + help="Input directory containing the distributed checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Output directory to save the single-file-per-tensor checkpoint.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + convert_dist_ckpt_to_sfpt_ckpt(args.input_dir, args.output_dir) diff --git a/tools/checkpoint/sfpt_ckpt/sfpt_to_dcp.py b/tools/checkpoint/sfpt_ckpt/sfpt_to_dcp.py new file mode 100644 index 000000000..5c65bc8aa --- /dev/null +++ b/tools/checkpoint/sfpt_ckpt/sfpt_to_dcp.py @@ -0,0 +1,82 @@ +import argparse +import os +from argparse import Namespace +from pathlib import Path + +import torch + +from megatron.core.dist_checkpointing import ShardedTensor, save +from megatron.core.dist_checkpointing.serialization import ( + get_default_save_common_strategy, +) + + +def convert_sfpt_ckpt_to_dist_ckpt(input_dir, output_dir): + # Distributed checkpoint loading requires the distributed environment to be initialized + rank = int(os.getenv("RANK", "0")) + world_size = int(os.getenv("WORLD_SIZE", "1")) + print(f"Rank: {rank}, World size: {world_size}") + torch.distributed.init_process_group( + backend="gloo", world_size=world_size, rank=rank + ) + + input_ckpt_dir = os.path.join(input_dir) + if not os.path.isdir(input_ckpt_dir): + raise ValueError(f"Checkpoint directory {input_ckpt_dir} does not exist") + + ckpt_output_dir = os.path.join(output_dir, "iter_0000000") + if not os.path.exists(ckpt_output_dir): + os.makedirs(ckpt_output_dir) + + for root, dirs, files in os.walk(input_ckpt_dir): + for file in files: + file_path = os.path.join(root, file) + print(f"Processing file: {file_path}") + state_dict = torch.load(file_path) + assert len(state_dict) == 1 + key = list(state_dict.keys())[0] + tensor = state_dict[key] + sharded_state_dict = {} + sharded_state_dict[key] = ShardedTensor.from_rank_offsets( + key, + tensor, + ) + save(sharded_state_dict, ckpt_output_dir) + + # Fake the minimal args for the checkpoint loading processing + state_dict = {} + args = Namespace( + tensor_model_parallel_size=1, + pipeline_model_parallel_size=1, + ) + state_dict["args"] = args + common_strategy = get_default_save_common_strategy() + common_strategy.save_common(state_dict, Path(ckpt_output_dir)) + + # add the latest_checkpointed_iteration file + with open(os.path.join(output_dir, "latest_checkpointed_iteration.txt"), "w") as f: + f.write("0") + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Convert single-file-per-tensor checkpoint to distributed checkpoint." + ) + parser.add_argument( + "--input_dir", + type=str, + required=True, + help="Input directory containing the single-file-per-tensor checkpoint.", + ) + parser.add_argument( + "--output_dir", + type=str, + required=True, + help="Output directory to save the distributed checkpoint.", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + convert_sfpt_ckpt_to_dist_ckpt(args.input_dir, args.output_dir) diff --git a/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index b2e910e1b..a67fc89d5 100644 --- a/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do done lm_eval --model hf \ - --model_args pretrained=$MODEL,parallelize=True \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 4d32b49a4..65be3c5d9 100644 --- a/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/vllm/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do done lm_eval --model vllm \ - --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/vllm/.buildkite/lm-eval-harness/run-tests.sh b/vllm/.buildkite/lm-eval-harness/run-tests.sh index b4fdde6da..26f33b744 100644 --- a/vllm/.buildkite/lm-eval-harness/run-tests.sh +++ b/vllm/.buildkite/lm-eval-harness/run-tests.sh @@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do done # Parse list of configs. -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/vllm/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/vllm/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index eec2a51e2..64ba1b32f 100644 --- a/vllm/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/vllm/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -9,8 +9,11 @@ steps: - image: badouralix/curl-jq command: - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: A100 plugins: @@ -18,7 +21,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT command: - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh resources: @@ -41,20 +44,48 @@ steps: - name: devshm emptyDir: medium: Memory - # - label: "H100" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT - # command: - # - bash - # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh - # mount-buildkite-agent: true - # propagate-environment: true - # ipc: host - # gpus: all - # environment: - # - VLLM_USAGE_SOURCE - # - HF_TOKEN + - label: "H200" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H200 + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: 4,5,6,7 + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN + + - block: "Run H100 Benchmark" + key: block-h100 + depends_on: ~ + + - label: "H100" + # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" + agents: + queue: H100 + depends_on: block-h100 + plugins: + - docker#v5.12.0: + image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT + command: + - bash + - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh + mount-buildkite-agent: true + propagate-environment: true + ipc: host + gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used + volumes: + - /data/benchmark-hf-cache:/root/.cache/huggingface + environment: + - VLLM_USAGE_SOURCE + - HF_TOKEN diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/vllm/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index f90e46428..9d3646e2f 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/vllm/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -56,7 +56,7 @@ def read_markdown(file): if os.path.exists(file): - with open(file, "r") as f: + with open(file) as f: return f.read() + "\n" else: return f"{file} not found.\n" @@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving): # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) if "serving" in str(test_file): # this result is generated via `benchmark_serving.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_latency.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving): # this result is generated via `benchmark_throughput.py` # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) @@ -157,6 +157,18 @@ def results_to_json(latency, throughput, serving): throughput_results, serving_results) + for df in [latency_results, serving_results, throughput_results]: + if df.empty: + continue + + # Sort all dataframes by their respective "Test name" columns + df.sort_values(by="Test name", inplace=True) + + # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", + # we want to turn it into "8xGPUTYPE" + df["GPU"] = df["GPU"].apply( + lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}") + # get markdown tables latency_md_table = tabulate(latency_results, headers='keys', diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/vllm/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py index 6059588fe..052060c57 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py +++ b/vllm/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py @@ -72,7 +72,7 @@ def main(args): # collect results for test_file in results_folder.glob("*_nightly_results.json"): - with open(test_file, "r") as f: + with open(test_file) as f: results = results + json.loads(f.read()) # generate markdown table @@ -80,7 +80,7 @@ def main(args): md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) - with open(args.description, "r") as f: + with open(args.description) as f: description = f.read() description = description.format( diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/vllm/.buildkite/nightly-benchmarks/scripts/launch-server.sh index e9d7d6a8d..fb5063db8 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/vllm/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -50,31 +50,30 @@ launch_trt_server() { git clone https://github.com/triton-inference-server/tensorrtllm_backend.git git lfs install cd tensorrtllm_backend - git checkout $trt_llm_version - tensorrtllm_backend_dir=$(pwd) + git checkout "$trt_llm_version" git submodule update --init --recursive # build trtllm engine cd /tensorrtllm_backend - cd ./tensorrt_llm/examples/${model_type} + cd "./tensorrt_llm/examples/${model_type}" python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" trtllm-build \ - --checkpoint_dir ${trt_model_path} \ + --checkpoint_dir "${trt_model_path}" \ --use_fused_mlp \ --reduce_fusion disable \ --workers 8 \ - --gpt_attention_plugin ${model_dtype} \ - --gemm_plugin ${model_dtype} \ - --tp_size ${model_tp_size} \ - --max_batch_size ${max_batch_size} \ - --max_input_len ${max_input_len} \ - --max_seq_len ${max_seq_len} \ - --max_num_tokens ${max_num_tokens} \ - --output_dir ${trt_engine_path} + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" # handle triton protobuf files and launch triton server cd /tensorrtllm_backend @@ -82,15 +81,15 @@ launch_trt_server() { cp -r all_models/inflight_batcher_llm/* triton_model_repo/ cd triton_model_repo rm -rf ./tensorrt_llm/1/* - cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false - python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 - python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false - python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size - python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" cd /tensorrtllm_backend python3 scripts/launch_triton_server.py \ - --world_size=${model_tp_size} \ + --world_size="${model_tp_size}" \ --model_repo=/tensorrtllm_backend/triton_model_repo & } @@ -98,10 +97,7 @@ launch_trt_server() { launch_tgi_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -129,10 +125,7 @@ launch_tgi_server() { launch_lmdeploy_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") server_command="lmdeploy serve api_server $model \ @@ -149,10 +142,7 @@ launch_sglang_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -185,10 +175,7 @@ launch_vllm_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -217,19 +204,19 @@ launch_vllm_server() { main() { - if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then launch_trt_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then launch_tgi_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then launch_lmdeploy_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then launch_sglang_server fi diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/vllm/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index c6a1bbdeb..686f70dbe 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/vllm/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -16,10 +16,10 @@ main() { fi # initial annotation - description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" # download results - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" mkdir -p results/ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ ls @@ -30,15 +30,15 @@ main() { /workspace/buildkite-agent artifact upload "results.zip" # upload benchmarking scripts - cd $VLLM_SOURCE_CODE_LOC/ + cd "$VLLM_SOURCE_CODE_LOC/" zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # upload benchmarking pipeline /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md @@ -75,4 +75,4 @@ main() { # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } -main "$@" \ No newline at end of file +main "$@" diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/vllm/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index dd8c15e07..3f38cf513 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/vllm/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -12,7 +12,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" echo "GPU type is $gpu_type" } @@ -102,7 +102,7 @@ kill_gpu_processes() { pkill -f text-generation pkill -f lmdeploy - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done } @@ -119,8 +119,8 @@ wait_for_server() { ensure_installed() { # Ensure that the given command is installed by apt-get local cmd=$1 - if ! which $cmd >/dev/null; then - apt-get update && apt-get install -y $cmd + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" fi } @@ -173,13 +173,11 @@ run_serving_tests() { echo "Reuse previous server for test case $test_name" else kill_gpu_processes - bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ "$server_params" "$common_params" fi - wait_for_server - - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." else @@ -190,13 +188,13 @@ run_serving_tests() { # prepare tokenizer # this is required for lmdeploy. - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" rm -rf /tokenizer_cache mkdir /tokenizer_cache python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$model" \ --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" # change model name for lmdeploy (it will not follow standard hf name) @@ -307,11 +305,11 @@ run_serving_tests() { prepare_dataset() { # download sharegpt dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" echo "" > sonnet_4x.txt for _ in {1..4} do @@ -339,17 +337,17 @@ main() { prepare_dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # run the test - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" # upload benchmark results to buildkite python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" upload_to_buildkite } diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/vllm/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index a0b9a409b..0d16a8378 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/vllm/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -6,6 +6,7 @@ # Do not set -e, as the mixtral 8x22B model tends to crash occasionally # and we still want to see other benchmarking results even when mixtral crashes. +set -x set -o pipefail check_gpus() { @@ -17,7 +18,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') echo "GPU type is $gpu_type" } @@ -85,15 +86,11 @@ kill_gpu_processes() { ps -aux lsof -t -i:8000 | xargs -r kill -9 - pkill -f pt_main_thread - # this line doesn't work now - # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9 - pkill -f python3 - pkill -f /usr/bin/python3 + pgrep python3 | xargs -r kill -9 # wait until GPU memory usage smaller than 1GB - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done @@ -117,7 +114,7 @@ upload_to_buildkite() { fi # Use the determined command to annotate and upload artifacts - $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } @@ -150,7 +147,7 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -206,9 +203,9 @@ run_throughput_tests() { throughput_args=$(json2args "$throughput_params") # check if there is enough GPU to run the test - tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -270,7 +267,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -278,7 +275,7 @@ run_serving_tests() { server_model=$(echo "$server_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model') if [[ $server_model != "$client_model" ]]; then - echo "Server model and client model must be the same. Skip testcase $testname." + echo "Server model and client model must be the same. Skip testcase $test_name." continue fi @@ -289,12 +286,11 @@ run_serving_tests() { # run the server echo "Running test case $test_name" echo "Server command: $server_command" - eval "$server_command" & + bash -c "$server_command" & server_pid=$! # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "vllm server is up and running." else @@ -323,7 +319,7 @@ run_serving_tests() { echo "Running test case $test_name with qps $qps" echo "Client command: $client_command" - eval "$client_command" + bash -c "$client_command" # record the benchmarking commands jq_output=$(jq -n \ diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/vllm/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 4e4d4cd4c..92d6fad73 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/vllm/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -36,11 +36,11 @@ # collect results for test_file in results_folder.glob("*.json"): - with open(test_file, "r") as f: + with open(test_file) as f: raw_result = json.loads(f.read()) # attach the benchmarking command to raw_result - with open(test_file.with_suffix(".commands"), "r") as f: + with open(test_file.with_suffix(".commands")) as f: command = json.loads(f.read()) raw_result.update(command) diff --git a/vllm/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/vllm/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index f16862907..aa0f7ade8 100644 --- a/vllm/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/vllm/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -1,12 +1,12 @@ #!/bin/sh -TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) -URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT" TIMEOUT_SECONDS=10 retries=0 while [ $retries -lt 1000 ]; do - if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then exit 0 fi @@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do sleep 5 done -exit 1 \ No newline at end of file +exit 1 diff --git a/vllm/.buildkite/release-pipeline.yaml b/vllm/.buildkite/release-pipeline.yaml index 3b7fa0f2d..93e118fb3 100644 --- a/vllm/.buildkite/release-pipeline.yaml +++ b/vllm/.buildkite/release-pipeline.yaml @@ -1,33 +1,41 @@ steps: - label: "Build wheel - CUDA 12.1" agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 11.8 wheel" - key: block-build-cu118-wheel - + # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. + # However, this block can be uncommented to save some compute hours. + # - block: "Build CUDA 11.8 wheel" + # key: block-build-cu118-wheel + - label: "Build wheel - CUDA 11.8" - depends_on: block-build-cu118-wheel + # depends_on: block-build-cu118-wheel agents: - queue: cpu_queue + queue: cpu_queue_postmerge commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" + + - block: "Build release image" + depends_on: ~ + key: block-release-image-build + + - label: "Build release image" + depends_on: block-release-image-build + agents: + queue: cpu_queue_postmerge + commands: + - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ." + - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" diff --git a/vllm/.buildkite/run-amd-test.sh b/vllm/.buildkite/run-amd-test.sh index df201cdc7..3515ccd65 100755 --- a/vllm/.buildkite/run-amd-test.sh +++ b/vllm/.buildkite/run-amd-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script runs test inside the corresponding ROCm docker container. set -o pipefail @@ -31,8 +33,8 @@ cleanup_docker() { echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - # Remove unused volumes - docker volume prune -f + # Remove unused volumes / force the system prune for old images as well. + docker volume prune -f && docker system prune --force --filter "until=72h" --all echo "Docker images and volumes cleanup completed." else echo "Disk usage is below $threshold%. No cleanup needed." @@ -57,17 +59,17 @@ done echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" -docker pull ${image_name} +docker pull "${image_name}" remove_docker_container() { - docker rm -f ${container_name} || docker image rm -f ${image_name} || true + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true } trap remove_docker_container EXIT echo "--- Running container" HF_CACHE="$(realpath ~)/huggingface" -mkdir -p ${HF_CACHE} +mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" commands=$@ @@ -83,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flashinfer.py \ - --ignore=kernels/test_gguf.py \ --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \ @@ -107,35 +108,36 @@ fi PARALLEL_JOB_COUNT=8 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then + # assign job count as the number of shards used + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do - #replace shard arguments - commands=${commands//"--shard-id= "/"--shard-id=${GPU} "} - commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} - echo "Shard ${GPU} commands:$commands" + # assign shard-id for each shard + commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "} + echo "Shard ${GPU} commands:$commands_gpu" docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=${GPU} \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name}_${GPU} \ - ${image_name} \ - /bin/bash -c "${commands}" \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ + /bin/bash -c "${commands_gpu}" \ |& while read -r line; do echo ">>Shard $GPU: $line"; done & PIDS+=($!) done #wait for all processes to finish and collect exit codes - for pid in ${PIDS[@]}; do - wait ${pid} + for pid in "${PIDS[@]}"; do + wait "${pid}" STATUS+=($?) done - for st in ${STATUS[@]}; do + for st in "${STATUS[@]}"; do if [[ ${st} -ne 0 ]]; then echo "One of the processes failed with $st" - exit ${st} + exit "${st}" fi done else @@ -146,9 +148,9 @@ else --rm \ -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}" \ + "${image_name}" \ /bin/bash -c "${commands}" fi diff --git a/vllm/.buildkite/run-benchmarks.sh b/vllm/.buildkite/run-benchmarks.sh index cbf6dda67..1641c1faa 100644 --- a/vllm/.buildkite/run-benchmarks.sh +++ b/vllm/.buildkite/run-benchmarks.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script is run by buildkite to run the benchmarks and upload the results to buildkite set -ex diff --git a/vllm/.buildkite/run-cpu-test-ppc64le.sh b/vllm/.buildkite/run-cpu-test-ppc64le.sh index fd60f5b6a..bc06838d8 100755 --- a/vllm/.buildkite/run-cpu-test-ppc64le.sh +++ b/vllm/.buildkite/run-cpu-test-ppc64le.sh @@ -1,39 +1,14 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex -# Try building the docker image -docker build -t cpu-test -f Dockerfile.ppc64le . - # Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; } trap remove_docker_container EXIT remove_docker_container -# Run the image, setting --shm-size=4g for tensor parallel. -source /etc/environment -#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test - -# Run basic model test -docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" \ - --ignore=tests/models/test_embedding.py \ - --ignore=tests/models/test_oot_registration.py \ - --ignore=tests/models/test_registry.py \ - --ignore=tests/models/test_jamba.py \ - --ignore=tests/models/test_mamba.py \ - --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported +# Try building the docker image +docker build -t cpu-test -f Dockerfile.ppc64le . -# online inference -docker exec cpu-test bash -c " - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" diff --git a/vllm/.buildkite/run-cpu-test.sh b/vllm/.buildkite/run-cpu-test.sh index c331a9c49..4f1729d46 100644 --- a/vllm/.buildkite/run-cpu-test.sh +++ b/vllm/.buildkite/run-cpu-test.sh @@ -1,57 +1,85 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-48-95} +NUMA_NODE=${NUMA_NODE:-1} + # Try building the docker image -numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . -numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup -remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } +remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; } trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 - -# offline inference -docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" - -# Run basic model test -docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language \ - --ignore=tests/models/test_fp8.py \ - --ignore=tests/models/decoder_only/language/test_jamba.py \ - --ignore=tests/models/decoder_only/language/test_mamba.py \ - --ignore=tests/models/decoder_only/language/test_granitemoe.py \ - --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported - -# Run compressed-tensor test -docker exec cpu-test bash -c " - pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" - -# Run AWQ test -docker exec cpu-test bash -c " - pytest -s -v \ - tests/quantization/test_ipex_quant.py" - -# online inference -docker exec cpu-test bash -c " - export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=48-92 - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2 + +function cpu_tests() { + set -e + export NUMA_NODE=$2 + + # offline inference + docker exec cpu-test-avx2-"$NUMA_NODE" bash -c " + set -e + python3 examples/offline_inference.py" + + # Run basic model test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + + # Run compressed-tensor test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" + + # Run AWQ test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v \ + tests/quantization/test_ipex_quant.py" + + # Run chunked-prefill and prefix-cache test + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + pytest -s -v -k cpu_model \ + tests/basic_correctness/test_chunked_prefill.py" + + # online inference + docker exec cpu-test-"$NUMA_NODE" bash -c " + set -e + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=$1 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" +} + +# All of CPU tests are expected to be finished less than 25 mins. +export -f cpu_tests +timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE" diff --git a/vllm/.buildkite/run-hpu-test.sh b/vllm/.buildkite/run-hpu-test.sh new file mode 100644 index 000000000..fa4f74fca --- /dev/null +++ b/vllm/.buildkite/run-hpu-test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. +# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t hpu-test-env -f Dockerfile.hpu . + +# Setup cleanup +remove_docker_container() { docker rm -f hpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file diff --git a/vllm/.buildkite/run-multi-node-test.sh b/vllm/.buildkite/run-multi-node-test.sh index 7ac4dcc4c..530bf90a8 100755 --- a/vllm/.buildkite/run-multi-node-test.sh +++ b/vllm/.buildkite/run-multi-node-test.sh @@ -14,7 +14,7 @@ DOCKER_IMAGE=$4 shift 4 COMMANDS=("$@") -if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then echo "The number of commands must be equal to the number of nodes." echo "Number of nodes: $NUM_NODES" echo "Number of commands: ${#COMMANDS[@]}" @@ -23,7 +23,7 @@ fi echo "List of commands" for command in "${COMMANDS[@]}"; do - echo $command + echo "$command" done start_network() { @@ -36,7 +36,7 @@ start_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done @@ -49,17 +49,20 @@ start_nodes() { # 3. map the huggingface cache directory to the container # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # starting from 192.168.10.11) - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" # organize containers into a ray cluster - if [ $node -eq 0 ]; then + if [ "$node" -eq 0 ]; then # start the ray head node - docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" # wait for the head node to be ready sleep 10 else # start the ray worker nodes, and connect them to the head node - docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi done @@ -79,22 +82,22 @@ run_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -ne 0 ]; then - docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } cleanup() { for node in $(seq 0 $(($NUM_NODES-1))); do - docker stop node$node + docker stop "node$node" done docker network rm docker-net } diff --git a/vllm/.buildkite/run-neuron-test.sh b/vllm/.buildkite/run-neuron-test.sh index 252c0f7fe..9259391aa 100644 --- a/vllm/.buildkite/run-neuron-test.sh +++ b/vllm/.buildkite/run-neuron-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. set -e @@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then docker system prune -f - echo $current_time > /tmp/neuron-docker-build-timestamp + echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else - echo $(date +%s) > /tmp/neuron-docker-build-timestamp + date "+%s" > /tmp/neuron-docker-build-timestamp fi docker build -t neuron -f Dockerfile.neuron . @@ -34,7 +36,7 @@ wait_for_server_to_start() { timeout=300 counter=0 - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do + while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do sleep 1 counter=$((counter + 1)) if [ $counter -ge $timeout ]; then diff --git a/vllm/.buildkite/run-openvino-test.sh b/vllm/.buildkite/run-openvino-test.sh index 70e56596c..6b12f424f 100755 --- a/vllm/.buildkite/run-openvino-test.sh +++ b/vllm/.buildkite/run-openvino-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the OpenVINO docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex @@ -11,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py diff --git a/vllm/.buildkite/run-tpu-test.sh b/vllm/.buildkite/run-tpu-test.sh index 6989c94d4..770dad6ff 100644 --- a/vllm/.buildkite/run-tpu-test.sh +++ b/vllm/.buildkite/run-tpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e # Build the docker image. @@ -12,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/vllm/.buildkite/run-xpu-test.sh b/vllm/.buildkite/run-xpu-test.sh index 6ffa66d5e..e0a12afbe 100644 --- a/vllm/.buildkite/run-xpu-test.sh +++ b/vllm/.buildkite/run-xpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex @@ -10,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; } trap remove_docker_container EXIT remove_docker_container -# Run the image and launch offline inference -docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py +# Run the image and test offline inference/tensor parallel +docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c ' + python3 examples/offline_inference.py + python3 examples/offline_inference_cli.py -tp 2 +' diff --git a/vllm/.buildkite/test-pipeline.yaml b/vllm/.buildkite/test-pipeline.yaml index 8c98aa36a..bf0de3f69 100644 --- a/vllm/.buildkite/test-pipeline.yaml +++ b/vllm/.buildkite/test-pipeline.yaml @@ -9,7 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only -# optional(bool): never run this test by default (i.e. need to unblock manually) +# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run. # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] @@ -50,7 +50,9 @@ steps: - tests/multimodal - tests/test_utils - tests/worker + - tests/standalone_tests/lazy_torch_compile.py commands: + - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py @@ -59,6 +61,13 @@ steps: - pytest -v -s test_utils.py # Utils - pytest -v -s worker # Worker +- label: Python-only Installation Test + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + - label: Basic Correctness Test # 30min #mirror_hardwares: [amd] fast_check: true @@ -119,6 +128,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile commands: + - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -163,6 +173,14 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - VLLM_USE_V1=1 pytest -v -s v1 + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] @@ -219,7 +237,7 @@ steps: source_file_dependencies: - vllm/lora - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore lora/test_long_context.py lora/test_chatglm3_tp.py lora/test_llama_tp.py parallelism: 4 - label: "PyTorch Fullgraph Smoke Test" # 9min @@ -229,6 +247,9 @@ steps: - tests/compile commands: - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py - label: "PyTorch Fullgraph Test" # 18min source_file_dependencies: @@ -264,7 +285,6 @@ steps: source_file_dependencies: - benchmarks/ commands: - - pip install aiohttp - bash run-benchmarks.sh - label: Quantization Test # 33min @@ -301,55 +321,70 @@ steps: ##### models test ##### -- label: Basic Models Test # 3min +- label: Basic Models Test # 30min source_file_dependencies: - vllm/ - tests/models commands: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models/*.py --ignore=models/test_oot_registration.py + - pytest -v -s models/test_registry.py + - pytest -v -s models/test_initialization.py -- label: Decoder-only Language Models Test (Standard) # 35min +- label: Language Models Test (Standard) # 42min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - - pytest -v -s models/decoder_only/language/test_models.py - - pytest -v -s models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' + - pytest -v -s models/embedding/language -m core_model -- label: Decoder-only Language Models Test (Extended) # 1h20min - nightly: true +- label: Language Models Test (Extended) # 50min + optional: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py + - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' -- label: Decoder-only Multi-Modal Models Test # 1h31min +- label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - - pytest -v -s models/decoder_only/audio_language - - pytest -v -s models/decoder_only/vision_language + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/embedding/vision_language -m core_model + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Other Models Test # 6min - #mirror_hardwares: [amd] +- label: Multi-Modal Models Test (Extended) # 1h15m + optional: true source_file_dependencies: - vllm/ - - tests/models/embedding/language + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language - tests/models/embedding/vision_language - - tests/models/encoder_decoder/language - tests/models/encoder_decoder/vision_language commands: - - pytest -v -s models/embedding/language - - pytest -v -s models/embedding/vision_language - - pytest -v -s models/encoder_decoder/language - - pytest -v -s models/encoder_decoder/vision_language + - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + # HACK - run phi3v tests separately to sidestep this transformers bug + # https://github.com/huggingface/transformers/issues/34307 + - pytest -v -s models/decoder_only/vision_language/test_phi3v.py + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test @@ -402,6 +437,9 @@ steps: - vllm/model_executor/models/ - tests/distributed/ - vllm/compilation + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/model_runner.py commands: - pytest -v -s ./compile/test_basic_correctness.py - pytest -v -s ./compile/test_wrapper.py @@ -410,12 +448,12 @@ steps: # Avoid importing model tests that cause CUDA reinitialization error - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/decoder_only/vision_language/test_models.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" @@ -448,18 +486,22 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py -- label: LoRA Long Context (Distributed) # 11min - # This test runs llama 13B, so it is required to run on 4 GPUs. +- label: LoRA TP Test (Distributed) num_gpus: 4 - soft_fail: true source_file_dependencies: - vllm/lora - - tests/lora/test_long_context + - tests/lora commands: # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # This test runs llama 13B, so it is required to run on 4 GPUs. - pytest -v -s -x lora/test_long_context.py + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - label: Weight Loading Multiple GPU Test # 33min working_dir: "/vllm-workspace/tests" @@ -487,6 +529,7 @@ steps: - label: Distributed Tests (A100) # optional gpu: a100 + optional: true num_gpus: 4 source_file_dependencies: - vllm/ @@ -494,11 +537,13 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional gpu: a100 + optional: true num_gpus: 4 working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: diff --git a/vllm/.buildkite/upload-wheels.sh b/vllm/.buildkite/upload-wheels.sh new file mode 100644 index 000000000..7345dd4e6 --- /dev/null +++ b/vllm/.buildkite/upload-wheels.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -ex + +# Assume wheels are in artifacts/dist/*.whl +wheel_files=(artifacts/dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" + exit 1 +fi + +# Get the single wheel file +wheel="${wheel_files[0]}" + +# Rename 'linux' to 'manylinux1' in the wheel filename +new_wheel="${wheel/linux/manylinux1}" +mv -- "$wheel" "$new_wheel" +wheel="$new_wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version: $version" + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + suffix="${version##*.}" + if [[ $suffix == cu* ]]; then + new_version="1.0.0.dev+${suffix}" + else + new_version="1.0.0.dev" + fi + new_wheel="${wheel/$version/$new_version}" + mv -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi + +# Upload the wheel to S3 +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file diff --git a/vllm/.github/CODEOWNERS b/vllm/.github/CODEOWNERS index cd721971d..3cb91fc0f 100644 --- a/vllm/.github/CODEOWNERS +++ b/vllm/.github/CODEOWNERS @@ -3,13 +3,16 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -CMakeLists.txt @tlrmchlsmth @WoosukKwon +/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth + +# vLLM V1 +/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic # Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo diff --git a/vllm/.github/FUNDING.yml b/vllm/.github/FUNDING.yml index 71f4e5201..d1f6105a4 100644 --- a/vllm/.github/FUNDING.yml +++ b/vllm/.github/FUNDING.yml @@ -1,2 +1,2 @@ github: [vllm-project] -open_collective: [vllm] +open_collective: vllm diff --git a/vllm/.github/PULL_REQUEST_TEMPLATE.md b/vllm/.github/PULL_REQUEST_TEMPLATE.md index be0afc630..51a73c857 100644 --- a/vllm/.github/PULL_REQUEST_TEMPLATE.md +++ b/vllm/.github/PULL_REQUEST_TEMPLATE.md @@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** - ---- - -
- - PR Checklist (Click to Expand) - -

Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.

- -

PR Title and Classification

-

Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:

-
    -
  • [Bugfix] for bug fixes.
  • -
  • [CI/Build] for build or continuous integration improvements.
  • -
  • [Doc] for documentation fixes and improvements.
  • -
  • [Model] for adding a new model or improving an existing model. Model name should appear in the title.
  • -
  • [Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)
  • -
  • [Kernel] for changes affecting CUDA kernels or other compute kernels.
  • -
  • [Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)
  • -
  • [Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).
  • -
  • [Misc] for PRs that do not fit the above categories. Please use this sparingly.
  • -
-

Note: If the PR spans more than one category, please include all relevant prefixes.

- -

Code Quality

- -

The PR need to meet the following code quality standards:

- -
    -
  • We adhere to Google Python style guide and Google C++ style guide.
  • -
  • Pass all linter checks. Please use format.sh to format your code.
  • -
  • The code need to be well-documented to ensure future contributors can easily understand the code.
  • -
  • Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.
  • -
  • Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.
  • -
- -

Adding or changing kernels

-

Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.

-
    -
  • Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual
  • -
  • Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.
  • -
  • Use torch.libary.opcheck() to test the function registration and meta-function for any registered ops. See tests/kernels for examples.
  • -
  • When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.
  • -
  • If a new custom type is needed, see the following document: Custom Class Support in PT2. -
- -

Notes for Large Changes

-

Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not go through the PR.

- -

What to Expect for the Reviews

- -

The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:

- -
    -
  • After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.
  • -
  • After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.
  • -
  • After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.
  • -
  • Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. -
  • -
- -

Thank You

- -

Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!

- - -
- - +**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html ** diff --git a/vllm/.github/dependabot.yml b/vllm/.github/dependabot.yml index 6fddca0d6..683b70cd8 100644 --- a/vllm/.github/dependabot.yml +++ b/vllm/.github/dependabot.yml @@ -5,3 +5,27 @@ updates: directory: "/" schedule: interval: "weekly" + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "weekly" + labels: ["dependencies"] + open-pull-requests-limit: 5 + reviewers: ["khluu", "simon-mo"] + allow: + - dependency-type: "all" + ignore: + - dependency-name: "*" + update-types: ["version-update:semver-patch"] + - dependency-name: "torch" + - dependency-name: "torchvision" + - dependency-name: "xformers" + - dependency-name: "lm-format-enforcer" + - dependency-name: "gguf" + - dependency-name: "compressed-tensors" + - dependency-name: "ray[adag]" + - dependency-name: "lm-eval" + groups: + minor-update: + applies-to: version-updates + update-types: ["minor"] diff --git a/vllm/.github/mergify.yml b/vllm/.github/mergify.yml index 2a3dee7c6..ca4bd7ee2 100644 --- a/vllm/.github/mergify.yml +++ b/vllm/.github/mergify.yml @@ -13,13 +13,14 @@ pull_request_rules: - name: label-ci-build description: Automatically apply ci/build label conditions: - - files~=^\.github/ - - files~=\.buildkite/ - - files~=^cmake/ - - files=CMakeLists.txt - - files~=^Dockerfile - - files~=^requirements.*\.txt - - files=setup.py + - or: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py actions: label: add: @@ -45,7 +46,9 @@ pull_request_rules: comment: message: | This pull request has merge conflicts that must be resolved before it can be - merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + merged. Please rebase the PR, @{{author}}. + + https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork - name: remove 'needs-rebase' label when conflict is resolved conditions: diff --git a/vllm/.github/scripts/cleanup_pr_body.sh b/vllm/.github/scripts/cleanup_pr_body.sh new file mode 100755 index 000000000..3246c6f9b --- /dev/null +++ b/vllm/.github/scripts/cleanup_pr_body.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -eu + +# ensure 1 argument is passed +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +PR_NUMBER=$1 +OLD=/tmp/orig_pr_body.txt +NEW=/tmp/new_pr_body.txt + +gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" +cp "${OLD}" "${NEW}" + +# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" +sed -i '/FIX #xxxx.*$/d' "${NEW}" + +# Remove "FILL IN THE PR DESCRIPTION HERE" +sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" + +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}" + +# Remove HTML
section that includes text of "PR Checklist (Click to Expand)" +python3 - <.*?.*?PR Checklist \(Click to Expand\).*?.*?
', re.DOTALL) +content = re.sub(pattern, '', content) + +with open("${NEW}", "w") as file: + file.write(content) +EOF + +# Run this only if ${NEW} is different than ${OLD} +if ! cmp -s "${OLD}" "${NEW}"; then + gh pr edit --body-file "${NEW}" "${PR_NUMBER}" + echo + echo "Updated PR body:" + echo + cat "${NEW}" +else + echo "No changes needed" +fi diff --git a/vllm/.github/workflows/actionlint.yml b/vllm/.github/workflows/actionlint.yml index b80749aaa..0226cf0ca 100644 --- a/vllm/.github/workflows/actionlint.yml +++ b/vllm/.github/workflows/actionlint.yml @@ -6,12 +6,14 @@ on: paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' pull_request: branches: - "main" paths: - '.github/workflows/*.ya?ml' - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' env: LC_ALL: en_US.UTF-8 @@ -28,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/vllm/.github/workflows/clang-format.yml b/vllm/.github/workflows/clang-format.yml index 68d60d736..68149d2dc 100644 --- a/vllm/.github/workflows/clang-format.yml +++ b/vllm/.github/workflows/clang-format.yml @@ -6,9 +6,21 @@ on: push: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' pull_request: branches: - main + paths: + - '**/*.h' + - '**/*.cpp' + - '**/*.cu' + - '**/*.cuh' + - '.github/workflows/clang-format.yml' jobs: clang-format: @@ -17,9 +29,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/vllm/.github/workflows/cleanup_pr_body.yml b/vllm/.github/workflows/cleanup_pr_body.yml new file mode 100644 index 000000000..0085a1cc2 --- /dev/null +++ b/vllm/.github/workflows/cleanup_pr_body.yml @@ -0,0 +1,26 @@ +name: Cleanup PR Body + +on: + pull_request_target: + types: [opened, reopened, edited] + +permissions: + pull-requests: write + +jobs: + update-description: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Set up Python + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: '3.12' + + - name: Update PR description + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" diff --git a/vllm/.github/workflows/codespell.yml b/vllm/.github/workflows/codespell.yml new file mode 100644 index 000000000..68887adaa --- /dev/null +++ b/vllm/.github/workflows/codespell.yml @@ -0,0 +1,45 @@ +name: codespell + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + pull_request: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + +jobs: + codespell: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml diff --git a/vllm/.github/workflows/mypy.yaml b/vllm/.github/workflows/mypy.yaml index 5f1e5f8ee..73eeacf1f 100644 --- a/vllm/.github/workflows/mypy.yaml +++ b/vllm/.github/workflows/mypy.yaml @@ -6,20 +6,35 @@ on: push: branches: - main + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + - 'pyproject.toml' pull_request: branches: - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + #paths: + # - '**/*.py' + # - '.github/workflows/mypy.yaml' + # - 'tools/mypy.sh' + # - 'pyproject.toml' jobs: mypy: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -33,4 +48,4 @@ jobs: - name: Mypy run: | echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/vllm/.github/workflows/png-lint.yml b/vllm/.github/workflows/png-lint.yml new file mode 100644 index 000000000..4932af943 --- /dev/null +++ b/vllm/.github/workflows/png-lint.yml @@ -0,0 +1,37 @@ +name: Lint PNG exports from excalidraw +on: + push: + branches: + - "main" + paths: + - '*.excalidraw.png' + - '.github/workflows/png-lint.yml' + pull_request: + branches: + - "main" + paths: + - '*.excalidraw.png' + - '.github/workflows/png-lint.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run png-lint.sh to check excalidraw exported images" + run: | + tools/png-lint.sh diff --git a/vllm/.github/workflows/publish.yml b/vllm/.github/workflows/publish.yml index f959a1cac..c1051d10a 100644 --- a/vllm/.github/workflows/publish.yml +++ b/vllm/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Extract branch info shell: bash @@ -48,13 +48,13 @@ jobs: fail-fast: false matrix: os: ['ubuntu-20.04'] - python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12'] pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt. cuda-version: ['11.8', '12.1'] steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup ccache uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 @@ -68,7 +68,7 @@ jobs: bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} diff --git a/vllm/.github/workflows/ruff.yml b/vllm/.github/workflows/ruff.yml index 9cc8a9e91..7266cc378 100644 --- a/vllm/.github/workflows/ruff.yml +++ b/vllm/.github/workflows/ruff.yml @@ -6,33 +6,47 @@ on: push: branches: - main + paths: + - "**/*.py" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml pull_request: branches: - main + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + #paths: + # - "**/*.py" + # - pyproject.toml + # - requirements-lint.txt + # - .github/workflows/matchers/ruff.json + # - .github/workflows/ruff.yml jobs: ruff: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml - - name: Run isort - run: | - isort . --check-only + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/vllm/.github/workflows/scripts/cuda-install.sh b/vllm/.github/workflows/scripts/cuda-install.sh index 312c6e82f..3d0b7a1fe 100644 --- a/vllm/.github/workflows/scripts/cuda-install.sh +++ b/vllm/.github/workflows/scripts/cuda-install.sh @@ -1,16 +1,16 @@ #!/bin/bash # Replace '.' with '-' ex: 11.8 -> 11-8 -cuda_version=$(echo $1 | tr "." "-") +cuda_version=$(echo "$1" | tr "." "-") # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 -OS=$(echo $2 | tr -d ".\-") +OS=$(echo "$2" | tr -d ".\-") # Installs CUDA -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb +wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb sudo apt -qq update -sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} +sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" sudo apt clean # Test nvcc diff --git a/vllm/.github/workflows/scripts/pytorch-install.sh b/vllm/.github/workflows/scripts/pytorch-install.sh index dfc1851d7..e3cda7dad 100644 --- a/vllm/.github/workflows/scripts/pytorch-install.sh +++ b/vllm/.github/workflows/scripts/pytorch-install.sh @@ -6,7 +6,7 @@ cuda_version=$3 # Install torch $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya -$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} +$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" # Print version information $python_executable --version diff --git a/vllm/.github/workflows/shellcheck.yml b/vllm/.github/workflows/shellcheck.yml new file mode 100644 index 000000000..4b1587e37 --- /dev/null +++ b/vllm/.github/workflows/shellcheck.yml @@ -0,0 +1,37 @@ +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/vllm/.github/workflows/sphinx-lint.yml b/vllm/.github/workflows/sphinx-lint.yml new file mode 100644 index 000000000..e0bb24276 --- /dev/null +++ b/vllm/.github/workflows/sphinx-lint.yml @@ -0,0 +1,32 @@ +name: Lint documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + pull_request: + branches: + - main + paths: + - "docs/**" + +jobs: + sphinx-lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Linting docs + run: tools/sphinx-lint.sh diff --git a/vllm/.github/workflows/yapf.yml b/vllm/.github/workflows/yapf.yml index 9f06b35c1..ff441f944 100644 --- a/vllm/.github/workflows/yapf.yml +++ b/vllm/.github/workflows/yapf.yml @@ -6,26 +6,33 @@ on: push: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml pull_request: branches: - main + paths: + - "**/*.py" + - .github/workflows/yapf.yml + jobs: yapf: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + pip install toml==0.10.2 + - name: Running yapf + run: | + yapf --diff --recursive . diff --git a/vllm/.gitignore b/vllm/.gitignore index 1ea6e3419..ceef6a5fb 100644 --- a/vllm/.gitignore +++ b/vllm/.gitignore @@ -202,3 +202,4 @@ benchmarks/*.json # Linting actionlint +shellcheck*/ diff --git a/vllm/.readthedocs.yaml b/vllm/.readthedocs.yaml index 42cbf18a0..284196bc2 100644 --- a/vllm/.readthedocs.yaml +++ b/vllm/.readthedocs.yaml @@ -6,17 +6,16 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.12" sphinx: - configuration: docs/source/conf.py - fail_on_warning: true + configuration: docs/source/conf.py + fail_on_warning: true # If using Sphinx, optionally build your docs in additional formats such as PDF formats: [] # Optionally declare the Python requirements required to build your docs python: - install: - - requirements: docs/requirements-docs.txt - + install: + - requirements: docs/requirements-docs.txt diff --git a/vllm/.shellcheckrc b/vllm/.shellcheckrc new file mode 100644 index 000000000..f3b6eedf8 --- /dev/null +++ b/vllm/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. +# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/vllm/CMakeLists.txt b/vllm/CMakeLists.txt index 1a6a311e9..c78cdc77a 100644 --- a/vllm/CMakeLists.txt +++ b/vllm/CMakeLists.txt @@ -31,13 +31,13 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. -set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") +set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") # # Supported/expected torch versions for CUDA/ROCm. @@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0") -set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1") # # Try to find python package with an executable that exactly matches @@ -128,9 +128,9 @@ endif() if(VLLM_GPU_LANG STREQUAL "CUDA") # - # For cuda we want to be able to control which architectures we compile for on + # For cuda we want to be able to control which architectures we compile for on # a per-file basis in order to cut down on compile time. So here we extract - # the set of architectures we want to compile for and remove the from the + # the set of architectures we want to compile for and remove the from the # CMAKE_CUDA_FLAGS so that they are not applied globally. # clear_cuda_arches(CUDA_ARCH_FLAGS) @@ -138,7 +138,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "CUDA target architectures: ${CUDA_ARCHS}") # Filter the target architectures by the supported supported archs # since for some files we will build for all CUDA_ARCHS. - cuda_archs_loose_intersection(CUDA_ARCHS + cuda_archs_loose_intersection(CUDA_ARCHS "${CUDA_SUPPORTED_ARCHS}" "${CUDA_ARCHS}") message(STATUS "CUDA supported target architectures: ${CUDA_ARCHS}") else() @@ -187,13 +187,16 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") set(VLLM_EXT_SRC "csrc/cache_kernels.cu" - "csrc/attention/attention_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" + "csrc/layernorm_quant_kernels.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" + "csrc/quantization/gguf/gguf_kernel.cu" "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/torch_bindings.cpp") @@ -204,7 +207,19 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case. set(CUTLASS_REVISION "v3.5.1" CACHE STRING "CUTLASS revision to use") - FetchContent_Declare( + # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided + if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) + set(VLLM_CUTLASS_SRC_DIR $ENV{VLLM_CUTLASS_SRC_DIR}) + endif() + + if(VLLM_CUTLASS_SRC_DIR) + if(NOT IS_ABSOLUTE VLLM_CUTLASS_SRC_DIR) + get_filename_component(VLLM_CUTLASS_SRC_DIR "${VLLM_CUTLASS_SRC_DIR}" ABSOLUTE) + endif() + message(STATUS "The VLLM_CUTLASS_SRC_DIR is set, using ${VLLM_CUTLASS_SRC_DIR} for compilation") + FetchContent_Declare(cutlass SOURCE_DIR ${VLLM_CUTLASS_SRC_DIR}) + else() + FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git GIT_TAG v3.5.1 @@ -214,7 +229,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE GIT_SHALLOW TRUE - ) + ) + endif() FetchContent_MakeAvailable(cutlass) list(APPEND VLLM_EXT_SRC @@ -222,7 +238,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/mamba/causal_conv1d/causal_conv1d.cu" "csrc/quantization/aqlm/gemm_kernels.cu" "csrc/quantization/awq/gemm_kernels.cu" - "csrc/quantization/gguf/gguf_kernel.cu" "csrc/custom_all_reduce.cu" "csrc/permute_cols.cu" "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu") @@ -234,9 +249,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. - cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.9;9.0" ${CUDA_ARCHS}) + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS}) if (MARLIN_ARCHS) - set(MARLIN_SRCS + set(MARLIN_SRCS "csrc/quantization/fp8/fp8_marlin.cu" "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu" "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu" @@ -277,7 +292,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "in CUDA target architectures") endif() - # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't + # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't # build any 3x kernels set(SCALED_MM_3X_ARCHS) endif() @@ -286,7 +301,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -316,10 +331,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) # - # For the Machete kernels we automatically generate sources for various + # For the Machete kernels we automatically generate sources for various # preselected input type pairs and schedules. # Generate sources: - set(MACHETE_GEN_SCRIPT + set(MACHETE_GEN_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/machete/generate.py) file(MD5 ${MACHETE_GEN_SCRIPT} MACHETE_GEN_SCRIPT_HASH) @@ -329,8 +344,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT DEFINED CACHE{MACHETE_GEN_SCRIPT_HASH} OR NOT $CACHE{MACHETE_GEN_SCRIPT_HASH} STREQUAL ${MACHETE_GEN_SCRIPT_HASH}) execute_process( - COMMAND ${CMAKE_COMMAND} -E env - PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH + COMMAND ${CMAKE_COMMAND} -E env + PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH ${Python_EXECUTABLE} ${MACHETE_GEN_SCRIPT} RESULT_VARIABLE machete_generation_result OUTPUT_VARIABLE machete_generation_output @@ -340,11 +355,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") if (NOT machete_generation_result EQUAL 0) message(FATAL_ERROR "Machete generation failed." - " Result: \"${machete_generation_result}\"" + " Result: \"${machete_generation_result}\"" "\nCheck the log for details: " "${CMAKE_CURRENT_BINARY_DIR}/machete_generation.log") else() - set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} + set(MACHETE_GEN_SCRIPT_HASH ${MACHETE_GEN_SCRIPT_HASH} CACHE STRING "Last run machete generate script hash" FORCE) message(STATUS "Machete generation completed successfully.") endif() @@ -366,7 +381,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}") else() - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS) message(STATUS "Not building Machete kernels as CUDA Compiler version is " "not >= 12.0, we recommend upgrading to CUDA 12.0 or " @@ -392,8 +407,8 @@ define_gpu_extension_target( USE_SABI 3 WITH_SOABI) -# If CUTLASS is compiled on NVCC >= 12.5, it by default uses -# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the +# If CUTLASS is compiled on NVCC >= 12.5, it by default uses +# cudaGetDriverEntryPointByVersion as a wrapper to avoid directly calling the # driver API. This causes problems when linking with earlier versions of CUDA. # Setting this variable sidesteps the issue by calling the driver directly. target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) @@ -412,7 +427,7 @@ set_gencode_flags_for_srcs( CUDA_ARCHS "${CUDA_ARCHS}") if(VLLM_GPU_LANG STREQUAL "CUDA") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.9;9.0" "${CUDA_ARCHS}") + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) set(MARLIN_MOE_SRC "csrc/moe/marlin_kernels/marlin_moe_kernel.h" @@ -471,9 +486,9 @@ if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda") return() endif () -# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target -# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the -# arches in the CUDA case (and instead set the gencodes on a per file basis) +# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target +# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the +# arches in the CUDA case (and instead set the gencodes on a per file basis) # we need to manually set VLLM_GPU_ARCHES here. if(VLLM_GPU_LANG STREQUAL "CUDA") foreach(_ARCH ${CUDA_ARCHS}) @@ -507,7 +522,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 + GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/vllm/CONTRIBUTING.md b/vllm/CONTRIBUTING.md index 5f79356bd..6d46a6dca 100644 --- a/vllm/CONTRIBUTING.md +++ b/vllm/CONTRIBUTING.md @@ -1,50 +1,3 @@ # Contributing to vLLM -Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: - -- Identify and report any issues or bugs. -- Request or add support for a new model. -- Suggest or implement new features. -- Improve documentation or contribute a how-to guide. - -We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. - -Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! - - -## Developing - -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. - - -## Testing - -```bash -pip install -r requirements-dev.txt - -# linting and formatting -bash format.sh -# Static type checking -mypy -# Unit tests -pytest tests/ -``` -**Note:** Currently, the repository does not pass the ``mypy`` tests. - -## Contribution Guidelines - -### Issues - -If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. - -> [!IMPORTANT] -> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability). - -### Pull Requests & Code Reviews - -Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. - -### Thank You - -Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. -All of your contributions help make vLLM a great tool and community for everyone! +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). diff --git a/vllm/DCO b/vllm/DCO new file mode 100644 index 000000000..49b8cb054 --- /dev/null +++ b/vllm/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/vllm/Dockerfile b/vllm/Dockerfile index 0a562253c..682f046d4 100644 --- a/vllm/Dockerfile +++ b/vllm/Dockerfile @@ -191,6 +191,18 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install -e tests/vllm_test_utils + +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + # doc requires source code # we hide them inside `test_docs/` , so that this source code # will not be imported by other tests @@ -206,7 +218,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer 'modelscope!=1.15.0' bitsandbytes>=0.44.0 timm==0.9.10 + pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10 ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/vllm/Dockerfile.arm b/vllm/Dockerfile.arm new file mode 100644 index 000000000..093ee2209 --- /dev/null +++ b/vllm/Dockerfile.arm @@ -0,0 +1,62 @@ +# This vLLM Dockerfile is used to construct an image that can build and run vLLM on ARM CPU platform. + +FROM ubuntu:22.04 AS cpu-test-arm + +ENV CCACHE_DIR=/root/.cache/ccache + +ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get update -y \ + && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \ + && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ + && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# tcmalloc provides better memory allocation efficiency, e.g., holding memory in caches to speed up access of commonly-used objects. +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install py-cpuinfo # Use this to gather CPU info and optimize based on ARM Neoverse cores + +# Set LD_PRELOAD for tcmalloc on ARM +ENV LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libtcmalloc_minimal.so.4" + +RUN echo 'ulimit -c 0' >> ~/.bashrc + +WORKDIR /workspace + +ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" +ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \ + pip install --upgrade pip && \ + pip install -r requirements-build.txt + +FROM cpu-test-arm AS build + +WORKDIR /workspace/vllm + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \ + --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \ + pip install -v -r requirements-cpu.txt + +COPY . . +ARG GIT_REPO_CHECK=0 +RUN --mount=type=bind,source=.git,target=.git \ + if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi + +# Disabling AVX512 specific optimizations for ARM +ARG VLLM_CPU_DISABLE_AVX512="true" +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + +RUN --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/ccache \ + --mount=type=bind,source=.git,target=.git \ + VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \ + pip install dist/*.whl && \ + rm -rf dist + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] \ No newline at end of file diff --git a/vllm/Dockerfile.cpu b/vllm/Dockerfile.cpu index f1a21d6bd..ebe226cf6 100644 --- a/vllm/Dockerfile.cpu +++ b/vllm/Dockerfile.cpu @@ -16,13 +16,13 @@ RUN --mount=type=cache,target=/var/cache/apt \ # intel-openmp provides additional performance improvement vs. openmp # tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects. RUN --mount=type=cache,target=/root/.cache/pip \ - pip install intel-openmp + pip install intel-openmp==2025.0.1 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so" RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.4.0 +RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace @@ -62,4 +62,8 @@ WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +# install development dependencies (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install -e tests/vllm_test_utils + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm/Dockerfile.hpu b/vllm/Dockerfile.hpu new file mode 100644 index 000000000..87e0c1a6a --- /dev/null +++ b/vllm/Dockerfile.hpu @@ -0,0 +1,21 @@ +FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-hpu.txt + +ENV no_proxy=localhost,127.0.0.1 +ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true + +RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install + +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + +WORKDIR /workspace/ + +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm/Dockerfile.neuron b/vllm/Dockerfile.neuron index 3d9d8e7da..76dbd4c04 100644 --- a/vllm/Dockerfile.neuron +++ b/vllm/Dockerfile.neuron @@ -31,11 +31,14 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron RUN --mount=type=bind,source=.git,target=.git \ - pip install --no-build-isolation -v -e . \ + pip install --no-build-isolation -v -e . + +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils CMD ["/bin/bash"] diff --git a/vllm/Dockerfile.openvino b/vllm/Dockerfile.openvino index a05ff452c..8bd188ffd 100644 --- a/vllm/Dockerfile.openvino +++ b/vllm/Dockerfile.openvino @@ -22,4 +22,7 @@ RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVIC COPY examples/ /workspace/examples COPY benchmarks/ /workspace/benchmarks +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/vllm/Dockerfile.ppc64le b/vllm/Dockerfile.ppc64le index cd5fcf481..971248577 100644 --- a/vllm/Dockerfile.ppc64le +++ b/vllm/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 @@ -29,6 +29,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=bind,source=.git,target=.git \ VLLM_TARGET_DEVICE=cpu python3 setup.py install +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + WORKDIR /workspace/ RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks diff --git a/vllm/Dockerfile.rocm b/vllm/Dockerfile.rocm index d35889f05..e733994f8 100644 --- a/vllm/Dockerfile.rocm +++ b/vllm/Dockerfile.rocm @@ -51,9 +51,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \ *"rocm-6.2"*) \ python3 -m pip uninstall -y torch torchvision \ && python3 -m pip install --pre \ - torch==2.6.0.dev20240918 \ - setuptools-scm>=8 \ - torchvision==0.20.0.dev20240918 \ + torch==2.6.0.dev20241113+rocm6.2 \ + 'setuptools-scm>=8' \ + torchvision==0.20.0.dev20241113+rocm6.2 \ --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ *) ;; esac @@ -121,6 +121,8 @@ ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install --upgrade pip + # Package upgrades for useful functionality or to avoid dependency issues RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard @@ -166,4 +168,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ if ls libs/*.whl; then \ python3 -m pip install libs/*.whl; fi +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/vllm/Dockerfile.tpu b/vllm/Dockerfile.tpu index bdfab3f61..b617932a8 100644 --- a/vllm/Dockerfile.tpu +++ b/vllm/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20240828" +ARG NIGHTLY_DATE="20241017" ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE @@ -9,12 +9,6 @@ RUN apt-get update && apt-get install -y \ git \ ffmpeg libsm6 libxext6 libgl1 -# Install the TPU and Pallas dependencies. -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html - # Build vLLM. COPY . . ARG GIT_REPO_CHECK=0 @@ -25,8 +19,10 @@ ENV VLLM_TARGET_DEVICE="tpu" RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ python3 -m pip install \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ -r requirements-tpu.txt RUN python3 setup.py develop +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils + CMD ["/bin/bash"] diff --git a/vllm/Dockerfile.xpu b/vllm/Dockerfile.xpu index 0ecb46df6..a374f20d7 100644 --- a/vllm/Dockerfile.xpu +++ b/vllm/Dockerfile.xpu @@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ -r requirements-xpu.txt +RUN git clone https://github.com/intel/pti-gpu && \ + cd pti-gpu/sdk && \ + git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ + make -j && \ + cmake --install . --config Release --prefix "/usr/local" + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" + COPY . . ARG GIT_REPO_CHECK RUN --mount=type=bind,source=.git,target=.git \ @@ -54,5 +64,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 - +# install development dependencies (for testing) +RUN python3 -m pip install -e tests/vllm_test_utils ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/vllm/README.md b/vllm/README.md index 0836d8723..cfeb24cbb 100644 --- a/vllm/README.md +++ b/vllm/README.md @@ -13,10 +13,12 @@ Easy, fast, and cheap LLM serving for everyone | Documentation | Blog | Paper | Discord | Twitter/X | Developer Slack |

+--- *Latest News* 🔥 -- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! -- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). +- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! +- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). - [2024/07] We hosted [the fifth vLLM meetup](https://lu.ma/lp0gyjqr) with AWS! Please find the meetup slides [here](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing). - [2024/07] In partnership with Meta, vLLM officially supports Llama 3.1 with FP8 quantization and pipeline parallelism! Please check out our blog post [here](https://blog.vllm.ai/2024/07/23/llama31.html). @@ -42,7 +44,7 @@ vLLM is fast with: - Speculative decoding - Chunked prefill -**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. +**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script. vLLM is flexible and easy to use with: @@ -98,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/vllm/benchmarks/README.md b/vllm/benchmarks/README.md index 192d6c402..2aa4a2850 100644 --- a/vllm/benchmarks/README.md +++ b/vllm/benchmarks/README.md @@ -6,3 +6,14 @@ You can download the dataset by running: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json ``` + +## Downloading the ShareGPT4V dataset + +The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts +will ignore a datapoint if the referred image is missing. +```bash +wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json +mkdir coco -p +wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip +unzip coco/train2017.zip -d coco/ +``` diff --git a/vllm/benchmarks/backend_request_func.py b/vllm/benchmarks/backend_request_func.py index 4813fde27..b67849038 100644 --- a/vllm/benchmarks/backend_request_func.py +++ b/vllm/benchmarks/backend_request_func.py @@ -24,6 +24,7 @@ class RequestFuncInput: model: str best_of: int = 1 logprobs: Optional[int] = None + extra_body: Optional[dict] = None multi_modal_content: Optional[dict] = None ignore_eos: bool = False @@ -36,6 +37,7 @@ class RequestFuncOutput: ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies + tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 error: str = "" @@ -54,6 +56,7 @@ async def async_request_tgi( "do_sample": True, "temperature": 0.01, # TGI does not accept 0.0 temperature. "top_p": 0.99, # TGI does not accept 1.0 top_p. + "truncate": request_func_input.prompt_len, # TGI does not accept ignore_eos flag. } payload = { @@ -79,7 +82,7 @@ async def async_request_tgi( # any data, we should skip it. if chunk_bytes.startswith(":"): continue - chunk = remove_prefix(chunk_bytes, "data:") + chunk = chunk_bytes.removeprefix("data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -144,8 +147,8 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -241,6 +244,8 @@ async def async_request_openai_completions( "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } @@ -256,13 +261,14 @@ async def async_request_openai_completions( async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: + first_chunk_received = False async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -274,7 +280,8 @@ async def async_request_openai_completions( if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token - if ttft == 0.0: + if not first_chunk_received: + first_chunk_received = True ttft = time.perf_counter() - st output.ttft = ttft @@ -285,9 +292,14 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] - + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!") output.generated_text = generated_text - output.success = True output.latency = latency else: output.error = response.reason or "" @@ -324,10 +336,12 @@ async def async_request_openai_chat_completions( }, ], "temperature": 0.0, - "max_tokens": request_func_input.output_len, + "max_completion_tokens": request_func_input.output_len, "stream": True, "ignore_eos": request_func_input.ignore_eos, } + if request_func_input.extra_body: + payload.update(request_func_input.extra_body) headers = { "Content-Type": "application/json", "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", @@ -349,8 +363,8 @@ async def async_request_openai_chat_completions( if not chunk_bytes: continue - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix( + "data: ") if chunk == "[DONE]": latency = time.perf_counter() - st else: @@ -389,14 +403,6 @@ async def async_request_openai_chat_completions( return output -# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix) -# introduced in Python 3.9 -def remove_prefix(text: str, prefix: str) -> str: - if text.startswith(prefix): - return text[len(prefix):] - return text - - def get_model(pretrained_model_name_or_path: str) -> str: if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': from modelscope import snapshot_download diff --git a/vllm/benchmarks/benchmark_guided.py b/vllm/benchmarks/benchmark_guided.py new file mode 100644 index 000000000..1a0e62598 --- /dev/null +++ b/vllm/benchmarks/benchmark_guided.py @@ -0,0 +1,494 @@ +"""Benchmark guided decoding throughput.""" +import argparse +import dataclasses +import json +import os +import random +import time +from typing import List + +import datasets +import pandas as pd +import uvloop +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) +from vllm.sampling_params import GuidedDecodingParams +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str = 'json' + completion: str = None + + +def run_vllm(requests: List[SampleRequest], + engine_args: EngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False) -> float: + from vllm import LLM, SamplingParams + llm = LLM(**vars(engine_args)) + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + # create a list containing random selected true or false + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if guided_decoding_rate > 0 else None, + )) + llm.generate(prompts, sampling_params, use_tqdm=False) + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + **{request.structure_type: request.schema}) + if i in guided_decoding_req_idx else None, + )) + + start = time.perf_counter() + outputs = llm.generate(prompts, sampling_params, use_tqdm=False) + ret = [] + for output, request in zip(outputs, requests): + generated_text = output.outputs[0].text + ret.append({ + "generated": generated_text, + "expected": request.completion + }) + end = time.perf_counter() + return end - start, ret + + +async def run_vllm_async( + requests: List[SampleRequest], + engine_args: AsyncEngineArgs, + n: int, + guided_decoding_rate: float = 1.0, + warmup: bool = False, + disable_frontend_multiprocessing: bool = False) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + guided_decoding_req_idx = random.sample( + range(len(requests)), int(len(requests) * guided_decoding_rate)) + + if warmup: + print(">>>>>> Running warmup prompt, for the first 5") + # We setup the first 5 requests to warmup FSM + # if using xgrammar dataset, we will skip warmup + warmup_requests = requests[:5] + for i, request in enumerate(warmup_requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams( + json=request.schema) + if guided_decoding_rate > 0 else None, + )) + generators = [] + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + + print(">>>>> Benchmark started...") + prompts = [] + sampling_params = [] + for i, request in enumerate(requests): + prompts.append(request.prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + guided_decoding=GuidedDecodingParams(json=request.schema) + if i in guided_decoding_req_idx else None, + )) + + generators = [] + start_time = [] + latencies = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + start_time.append(time.perf_counter()) + latencies.append([]) + all_gens = merge_async_iterators(*generators) + generated_texts = [''] * len(requests) + async for i, res in all_gens: + generated_texts[i] = res.outputs[0].text + lat = time.perf_counter() - start_time[i] + latencies[i].append(lat) + ret = [{ + 'generated': gt, + 'expected': req.completion + } for gt, req in zip(generated_texts, requests)] + end = time.perf_counter() + first_latency = pd.Series([lat[0] * 1000 for lat in latencies]) + next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000 + for lat in latencies]) + return end - start, ret, (first_latency, next_latency) + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + args.warmup = False + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + completion=completion)) + + return requests + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # async engine is working for 'regex', 'choice' and 'grammar' + if args.dataset == 'grammar': + args.structure_type = 'grammar' + args.async_engine = False + elif args.dataset == 'regex': + args.structure_type = 'regex' + args.async_engine = False + elif args.dataset == 'choice': + args.structure_type = 'choice' + args.async_engine = False + else: + args.structure_type = 'json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += f"_async{args.async_engine}" + result_file_name += f"_warmup{args.warmup}" + result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}" + result_file_name += ".txt" + else: + result_file_name = None + + # Synthesize a prompt with the given input length. + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code) + requests = sample_requests(tokenizer, args) + + if args.async_engine: + engine_args = AsyncEngineArgs.from_cli_args(args) + elapsed_time, ret, (first_latency, next_latency) = uvloop.run( + run_vllm_async(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup, + args.disable_frontend_multiprocessing)) + else: + engine_args = EngineArgs.from_cli_args(args) + elapsed_time, ret = run_vllm(requests, engine_args, args.n, + args.guided_decoding_ratio, args.warmup) + first_latency, next_latency = None, None + + score = evaluate(ret, args) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if first_latency is not None: + latency_breakdown = "\nFirst token latency(msecs):\n" + latency_breakdown += f"{first_latency.describe()}" + latency_breakdown += "\nNext token latency(msecs):\n" + latency_breakdown += f"{next_latency.describe()}" + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s", + f"Correct rate is {score} %", + f"{latency_breakdown if first_latency is not None else ''}") + + # Output JSON results if specified + if args.output_json or result_file_name: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "total_output_tokens": total_output_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}", + "output_tokens_per_second": + f"{total_output_tokens / elapsed_time:.2f}", + "correct_rate(%)": score + } + results = {"outputs": ret, **results} + if first_latency is not None: + results["first_token_latency(msecs)"] = first_latency.describe( + ).to_dict() + results["next_token_latency(msecs)"] = next_latency.describe( + ).to_dict() + if args.output_json: + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + elif result_file_name: + with open(result_file_name, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark guided decoding.") + parser = AsyncEngineArgs.add_cli_args(parser) + + parser.add_argument("--output-len", + type=int, + default=512, + help="Output length for each request. Overrides the " + "output length from the dataset.") + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument("--n", + type=int, + default=1, + help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", + type=int, + default=10, + help="Number of prompts to process.") + parser.add_argument( + '--output-json', + type=str, + default=None, + help='Path to save the throughput results in JSON format.') + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") + parser.add_argument("--warmup", + action="store_true", + default=False, + help="Run warmup prompts before benchmark.") + parser.add_argument("--save-results", + action="store_true", + default=False, + help="save output results.") + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + main(args) diff --git a/vllm/benchmarks/benchmark_prefix_caching.py b/vllm/benchmarks/benchmark_prefix_caching.py index 1aac02999..5e9381f71 100644 --- a/vllm/benchmarks/benchmark_prefix_caching.py +++ b/vllm/benchmarks/benchmark_prefix_caching.py @@ -54,13 +54,30 @@ def test_prefix(llm=None, sampling_params=None, prompts=None): print(f"cost time {end_time - start_time}") -def sample_requests( +@dataclasses.dataclass +class Request: + prompt: str + prompt_len: int + output_len: int + + +def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str: + vocab = tokenizer.get_vocab() + # Remove the special tokens. + vocab = { + k: v + for k, v in vocab.items() if k not in tokenizer.all_special_ids + } + return random.choices(list(vocab.values()), k=length) + + +def sample_requests_from_dataset( dataset_path: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, input_length_range: Tuple[int, int], fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +) -> List[Request]: if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -77,31 +94,55 @@ def sample_requests( random.shuffle(dataset) min_len, max_len = input_length_range + assert min_len >= 0 and max_len >= min_len, "input_length_range too small" # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] + filtered_requests: List[Request] = [] + for i in range(len(dataset)): - if len(filtered_dataset) == num_requests: + if len(filtered_requests) == num_requests: break # Tokenize the prompts and completions. - prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids + prompt_token_ids = tokenizer(dataset[i][0]).input_ids + prompt = tokenizer.decode(prompt_token_ids) completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) - output_len = len(completion_token_ids - ) if fixed_output_len is None else fixed_output_len - if prompt_len < 4 or output_len < 4: - # Prune too short sequences. - continue + output_len = (len(completion_token_ids) + if fixed_output_len is None else fixed_output_len) if min_len <= prompt_len <= max_len: - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_requests.append(Request(prompt, prompt_len, output_len)) + + return filtered_requests - return filtered_dataset + +def sample_requests_from_random( + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + input_length_range: Tuple[int, int], + fixed_output_len: Optional[int], + prefix_len: int, +) -> List[Request]: + + requests = [] + prefix_token_ids = sample_tokens(tokenizer, prefix_len) + min_len, max_len = input_length_range + + for i in range(num_requests): + unique_part_token_ids = sample_tokens( + tokenizer, + random.randint(min_len - prefix_len, max_len - prefix_len)) + prompt_token_ids = prefix_token_ids + unique_part_token_ids + prompt = tokenizer.decode(prompt_token_ids) + prompt_len = len(prompt_token_ids) + assert (min_len <= prompt_len <= max_len + ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}" + requests.append(Request(prompt, prompt_len, fixed_output_len)) + return requests -def repeat_and_sort_requests(requests: List[Tuple[str, int, int]], +def repeat_and_sort_requests(requests: List[Request], repeat_count: int, sort: bool = False) -> List[str]: repeated_requests = requests * repeat_count @@ -109,7 +150,7 @@ def repeat_and_sort_requests(requests: List[Tuple[str, int, int]], repeated_requests.sort(key=lambda x: x[1]) else: random.shuffle(repeated_requests) - return [req[0] for req in repeated_requests] + return [req.prompt for req in repeated_requests] def main(args): @@ -117,9 +158,12 @@ def main(args): input_length_range = tuple(map(int, args.input_length_range.split(':'))) random.seed(args.seed) if args.dataset_path is not None: - print(f"Start to sample {args.num_prompts} prompts" - "from {args.dataset_path}") - filtered_datasets = sample_requests( + if args.prefix_len > 0: + raise ValueError("prefix-len is not supported when " + "dataset-path is provided.") + print(f"Start to sample {args.num_prompts} prompts " + f"from {args.dataset_path}") + filtered_requests = sample_requests_from_dataset( dataset_path=args.dataset_path, num_requests=args.num_prompts, tokenizer=tokenizer, @@ -127,9 +171,22 @@ def main(args): fixed_output_len=args.output_len, ) else: - prompt_len = len(tokenizer(PROMPT).input_ids) - filtered_datasets = [(PROMPT, prompt_len, args.output_len) - ] * args.num_prompts + print(f"Start to sample {args.num_prompts} prompts from random") + filtered_requests = sample_requests_from_random( + num_requests=args.num_prompts, + tokenizer=tokenizer, + input_length_range=input_length_range, + fixed_output_len=args.output_len, + prefix_len=args.prefix_len, + ) + + # Print some helpful stats of the requests. + print(f"Sampled {len(filtered_requests)} requests.") + prompt_lens = [req.prompt_len for req in filtered_requests] + print(f"Average input length: {sum(prompt_lens) / len(prompt_lens)}") + print(f"P50 input length: {sorted(prompt_lens)[len(prompt_lens) // 2]}") + print(f"Min Prompt Length: {min(prompt_lens)}") + print(f"Max Prompt Length: {max(prompt_lens)}") engine_args = EngineArgs.from_cli_args(args) @@ -137,18 +194,11 @@ def main(args): sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) - print("Testing filtered datasets") - prompts = repeat_and_sort_requests(filtered_datasets, + print("Testing filtered requests") + prompts = repeat_and_sort_requests(filtered_requests, repeat_count=args.repeat_count, sort=args.sort) - print("------warm up------") - test_prefix( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - print("------start generating------") test_prefix( llm=llm, @@ -168,20 +218,29 @@ def main(args): parser.add_argument('--output-len', type=int, default=10) parser.add_argument('--num-prompts', type=int, - default=1, + required=True, help="Number of the prompts sampled from dataset") parser.add_argument('--repeat-count', type=int, - default=100, + default=1, help='Number of times to repeat each prompt') parser.add_argument('--sort', action='store_true', help='Sort prompts by input length') parser.add_argument('--input-length-range', type=str, - default='128:256', + required=True, help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Specifies the length of a common prefix to be " + "added to the input prompt. The input-length-range will " + "subtract this length when filtering prompts. Only used " + "when dataset-path is not provided.", + ) parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() diff --git a/vllm/benchmarks/benchmark_serving.py b/vllm/benchmarks/benchmark_serving.py index 0d205014b..325669214 100644 --- a/vllm/benchmarks/benchmark_serving.py +++ b/vllm/benchmarks/benchmark_serving.py @@ -199,6 +199,56 @@ def sample_sonnet_requests( return sampled_requests +def sample_mmmu_pro_vision_requests( + dataset, + num_requests: int, + tokenizer: PreTrainedTokenizerBase, + fixed_output_len: Optional[int] = None, +) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + sampled_requests: List[Tuple[str, int, int, Dict[str, + Collection[str]]]] = [] + for data in dataset: + if len(sampled_requests) == num_requests: + break + + # MMMU-Pro vision direct prompt + # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 + prompt = ( + "Answer with the option letter from the given choices directly. " + "The last line of your response should be of the following " + "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " + "options.") + + prompt_token_ids = tokenizer(prompt).input_ids + if fixed_output_len is None: + # Default max output len is set to 128 + print("--hf-output-len is not provided. Using default value 128.") + fixed_output_len = 128 + + prompt_len = len(prompt_token_ids) + output_len = fixed_output_len + + assert isinstance( + data["image"], + Image), ("Input image format must be `PIL.Image.Image`, " + f"given {type(data['image'])}.") + image: Image = data["image"] + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + mm_content = { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + + sampled_requests.append((prompt, prompt_len, output_len, mm_content)) + + return sampled_requests + + def sample_hf_requests( dataset_path: str, dataset_subset: str, @@ -208,6 +258,21 @@ def sample_hf_requests( random_seed: int, fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: + + # Special case for MMMU-Pro vision dataset + if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': + assert dataset_split == "test" + dataset = load_dataset(dataset_path, + name=dataset_subset, + split=dataset_split, + streaming=True) + assert "image" in dataset.features, ( + "MMMU/MMMU_Pro vision dataset must have 'image' column.") + filter_func = lambda x: isinstance(x["image"], Image) + dataset = dataset.shuffle(seed=random_seed).filter(filter_func) + return sample_mmmu_pro_vision_requests(dataset, num_requests, + tokenizer, fixed_output_len) + dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, @@ -251,6 +316,19 @@ def sample_hf_requests( "url": f"data:image/jpeg;base64,{image_base64}" }, } + elif "image" in data and isinstance(data["image"], str): + if (data["image"].startswith("http://") or \ + data["image"].startswith("file://")): + image_url = data["image"] + else: + image_url = f"file://{data['image']}" + + mm_content = { + "type": "image_url", + "image_url": { + "url": image_url + }, + } else: mm_content = None @@ -297,8 +375,33 @@ def sample_random_requests( async def get_request( input_requests: List[Tuple[str, int, int]], request_rate: float, + burstiness: float = 1.0, ) -> AsyncGenerator[Tuple[str, int, int], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + for request in input_requests: yield request @@ -306,8 +409,9 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) # The next request will be sent after the interval. await asyncio.sleep(interval) @@ -406,9 +510,9 @@ def calculate_metrics( median_itl_ms=np.median(itls or 0) * 1000, percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], - mean_e2el_ms=np.median(e2els or 0) * 1000, + mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, - median_e2el_ms=np.mean(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], ) @@ -426,6 +530,7 @@ async def benchmark( logprobs: Optional[int], best_of: int, request_rate: float, + burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: List[str], @@ -480,7 +585,13 @@ async def benchmark( if profile_output.success: print("Profiler started") + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) @@ -502,7 +613,7 @@ async def limited_request_func(request_func_input, pbar): benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): + async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, prompt=prompt, @@ -769,6 +880,7 @@ def main(args: argparse.Namespace): logprobs=args.logprobs, best_of=args.best_of, request_rate=args.request_rate, + burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), @@ -807,6 +919,7 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( args.request_rate if args.request_rate < float("inf") else "inf") + result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result @@ -922,8 +1035,20 @@ def main(args: argparse.Namespace): default=float("inf"), help="Number of requests per second. If this is inf, " "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.", + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( diff --git a/vllm/benchmarks/benchmark_serving_guided.py b/vllm/benchmarks/benchmark_serving_guided.py new file mode 100644 index 000000000..4435d87e1 --- /dev/null +++ b/vllm/benchmarks/benchmark_serving_guided.py @@ -0,0 +1,881 @@ +r"""Benchmark online serving throughput with guided decoding. + +On the server side, run one of the following commands: + (vLLM OpenAI API server) + vllm serve --disable-log-requests + + (TGI backend) + ./launch_tgi_server.sh + +On the client side, run: + python benchmarks/benchmark_serving.py \ + --backend \ + --model \ + --dataset json \ + --guided-decoding-ratio 1.0 \ + --guided-decoding-backend xgrammar \ + --request-rate 10 \ + --num-prompts 1000 + + when using tgi backend, add + --endpoint /generate_stream + to the end of the command above. +""" +import argparse +import asyncio +import dataclasses +import json +import os +import random +import time +import warnings +from dataclasses import dataclass +from typing import AsyncGenerator, List, Optional, Tuple + +import datasets +import numpy as np +import pandas as pd +from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, + RequestFuncOutput) +from tqdm.asyncio import tqdm +from transformers import PreTrainedTokenizerBase + +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_ttft_ms: float + median_ttft_ms: float + std_ttft_ms: float + percentiles_ttft_ms: List[Tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: List[Tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: List[Tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: List[Tuple[float, float]] + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + schema: dict + structure_type: str + completion: str = None + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + if args.dataset == 'json': + if args.json_schema_path is None: + dir_path = os.path.dirname(os.path.realpath(__file__)) + args.json_schema_path = os.path.join(dir_path, + "structured_schemas", + "structured_schema_1.json") + with open(args.json_schema_path) as f: + schema = json.load(f) + prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501 + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "grammar": + schema = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + prompt = "Generate an SQL query to show the 'username' \ + and 'email' from the 'users' table." + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "regex": + regex = r"\w+@\w+\.com\n" + args.regex = regex + prompt = "Generate an email address for Alan Turing, \ + who works in Enigma. End in .com and new line. \ + Example result: alan.turing@enigma.com\n" + + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=regex, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "choice": + choice = ["Positive", "Negative"] + args.choice = choice + prompt = "Classify this sentiment: vLLM is wonderful!" + input_len = len(tokenizer(prompt).input_ids) + print(f"Input length of the prompt: {input_len} tokens") + requests = [ + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=choice, + structure_type=args.structure_type) + for _ in range(args.num_prompts) + ] + + elif args.dataset == "xgrammar_bench": + requests: List[SampleRequest] = [] + dataset = datasets.load_dataset("NousResearch/json-mode-eval", + split="train") + print(f"dataset has {len(dataset)} entries") + len_dataset = len(dataset) + for data_point_idx in range(args.num_prompts): + idx = data_point_idx + while idx >= len_dataset: + idx -= len_dataset + schema = dataset["schema"][idx] + prompt = tokenizer.apply_chat_template(dataset["prompt"][idx], + tokenize=False) + input_len = len(tokenizer(prompt).input_ids) + completion = dataset["completion"][idx] + + requests.append( + SampleRequest(prompt=prompt, + prompt_len=input_len, + expected_output_len=args.output_len, + schema=schema, + structure_type=args.structure_type, + completion=completion)) + + return requests + + +async def get_request( + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[Tuple[int, SampleRequest], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + + for i, request in enumerate(input_requests): + yield i, request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. + await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: List[Tuple[str, int, int]], + outputs: List[RequestFuncOutput], + dur_s: float, + tokenizer: PreTrainedTokenizerBase, + selected_percentile_metrics: List[str], + selected_percentiles: List[float], +) -> Tuple[BenchmarkMetrics, List[int]]: + actual_output_lens: List[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: List[float] = [] + tpots: List[float] = [] + all_tpots: List[float] = [] + ttfts: List[float] = [] + e2els: List[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + # We use the tokenizer to count the number of output tokens for all + # serving backends instead of looking at len(outputs[i].itl) since + # multiple output tokens may be bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) + actual_output_lens.append(output_len) + total_input += input_requests[i].prompt_len + tpot = 0 + if output_len > 1: + tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - + 1) + tpots.append(tpot) + outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0 + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + ttfts.append(outputs[i].ttft) + e2els.append(outputs[i].latency) + completed += 1 + else: + actual_output_lens.append(0) + + if completed == 0: + warnings.warn( + "All requests failed. This is likely due to a misconfiguration " + "on the benchmark arguments.", + stacklevel=2) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_ttft_ms=np.mean(ttfts or 0) * + 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) + for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) + for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) + for p in selected_percentiles], + mean_e2el_ms=np.mean(e2els or 0) * 1000, + std_e2el_ms=np.std(e2els or 0) * 1000, + median_e2el_ms=np.median(e2els or 0) * 1000, + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) + for p in selected_percentiles], + ) + + return metrics, actual_output_lens + + +async def benchmark( + backend: str, + api_url: str, + base_url: str, + model_id: str, + tokenizer: PreTrainedTokenizerBase, + input_requests: List[SampleRequest], + request_rate: float, + burstiness: float, + disable_tqdm: bool, + profile: bool, + selected_percentile_metrics: List[str], + selected_percentiles: List[str], + ignore_eos: bool, + max_concurrency: Optional[int], + guided_decoding_ratio: float, + guided_decoding_backend: str, +): + if backend in ASYNC_REQUEST_FUNCS: + request_func = ASYNC_REQUEST_FUNCS[backend] + else: + raise ValueError(f"Unknown backend: {backend}") + + def prepare_extra_body(request) -> dict: + extra_body = {} + # Add the schema to the extra_body + extra_body[request.structure_type] = request.schema + # Add the specific guided_decoding_backend + extra_body["guided_decoding_backend"] = guided_decoding_backend + return extra_body + + print("Starting initial single prompt test run...") + guided_decoding_req_idx = random.sample( + range(len(input_requests)), + int(len(input_requests) * guided_decoding_ratio)) + + test_request = input_requests[0] + test_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=api_url, + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + test_output = await request_func(request_func_input=test_input) + if not test_output.success: + raise ValueError( + "Initial test run failed - Please make sure benchmark arguments " + f"are correctly specified. Error: {test_output.error}") + else: + print("Initial test run completed. Starting main benchmark run...") + + if profile: + print("Starting profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/start_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=prepare_extra_body(test_request), + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler started") + + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") + print(f"Maximum request concurrency: {max_concurrency}") + + pbar = None if disable_tqdm else tqdm(total=len(input_requests)) + + # This can be used once the minimum Python version is 3.10 or higher, + # and it will simplify the code in limited_request_func. + # semaphore = (asyncio.Semaphore(max_concurrency) + # if max_concurrency else contextlib.nullcontext()) + semaphore = (asyncio.Semaphore(max_concurrency) + if max_concurrency else None) + + async def limited_request_func(request_func_input, pbar): + if semaphore is None: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + async with semaphore: + return await request_func(request_func_input=request_func_input, + pbar=pbar) + + benchmark_start_time = time.perf_counter() + tasks: List[asyncio.Task] = [] + expected: List[str] = [] + async for i, request in get_request(input_requests, request_rate, + burstiness): + extra_body = prepare_extra_body( + request) if i in guided_decoding_req_idx else None + request_func_input = RequestFuncInput( + model=model_id, + prompt=request.prompt, + api_url=api_url, + prompt_len=request.prompt_len, + output_len=request.expected_output_len, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + expected.append(request.completion) + tasks.append( + asyncio.create_task( + limited_request_func(request_func_input=request_func_input, + pbar=pbar))) + outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) + + if profile: + print("Stopping profiler...") + profile_input = RequestFuncInput( + model=model_id, + prompt=test_request.prompt, + api_url=base_url + "/stop_profile", + prompt_len=test_request.prompt_len, + output_len=test_request.expected_output_len, + extra_body={test_request.structure_type: test_request.schema}, + ) + profile_output = await request_func(request_func_input=profile_input) + if profile_output.success: + print("Profiler stopped") + + if pbar is not None: + pbar.close() + + benchmark_duration = time.perf_counter() - benchmark_start_time + + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentile_metrics=selected_percentile_metrics, + selected_percentiles=selected_percentiles, + ) + + print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", + benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + print("{:<40} {:<10}".format("Total generated tokens:", + metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", + metrics.request_throughput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", + metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) + + result = { + "duration": + benchmark_duration, + "completed": + metrics.completed, + "total_input_tokens": + metrics.total_input, + "total_output_tokens": + metrics.total_output, + "request_throughput": + metrics.request_throughput, + "output_throughput": + metrics.output_throughput, + "total_token_throughput": + metrics.total_token_throughput, + "ttft_description": + pd.Series([output.ttft for output in outputs]).describe().to_dict(), + "tpot_description": + pd.Series([output.tpot for output in outputs]).describe().to_dict(), + "input_lens": [output.prompt_len for output in outputs], + "output_lens": + actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "errors": [output.error for output in outputs], + } + + ret = [{ + 'generated': output.generated_text, + 'expected': gt + } for output, gt in zip(outputs, expected)] + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. + if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"))) + print("{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"))) + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, + f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", + value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value + + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", + "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") + + print("=" * 50) + + return result, ret + + +def evaluate(ret, args): + + def _eval_correctness_json(expected, actual): + # extract json string from string using regex + import re + actual = actual.replace('\n', '').replace(' ', '').strip() + try: + actual = re.search(r'\{.*\}', actual).group() + actual = json.loads(actual) + except Exception: + return False + + return True + + def _eval_correctness_choice(expected, actual): + return actual in args.choice + + def _eval_correctness_regex(expected, actual): + import re + return re.match(args.regex, actual) is not None + + def _eval_correctness(expected, actual): + if args.structure_type == 'guided_json': + return _eval_correctness_json(expected, actual) + elif args.structure_type == 'guided_regex': + return _eval_correctness_regex(expected, actual) + elif args.structure_type == 'guided_choice': + return _eval_correctness_choice(expected, actual) + else: + return None + + scores = [] + for res in ret: + score = _eval_correctness(res['expected'], res['generated']) + res['correctness'] = score + scores.append(score) + + not_none_scores = [score for score in scores if score is not None] + + return (sum(not_none_scores) / len(not_none_scores) * + 100) if len(not_none_scores) > 0 else None + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + np.random.seed(args.seed) + + backend = args.backend + model_id = args.model + tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model + + if args.base_url is not None: + api_url = f"{args.base_url}{args.endpoint}" + base_url = f"{args.base_url}" + else: + api_url = f"http://{args.host}:{args.port}{args.endpoint}" + base_url = f"http://{args.host}:{args.port}" + + tokenizer = get_tokenizer(tokenizer_id, + trust_remote_code=args.trust_remote_code) + + if args.dataset == 'grammar': + args.structure_type = 'guided_grammar' + elif args.dataset == 'regex': + args.structure_type = 'guided_regex' + elif args.dataset == 'choice': + args.structure_type = 'guided_choice' + else: + args.structure_type = 'guided_json' + + if args.no_guided_decoding: + args.guided_decoding_ratio = 0 + if args.save_results: + result_file_name = f'{args.guided_decoding_ratio}guided' + result_file_name += f"_{backend}" + result_file_name += f"_{args.request_rate}qps" + result_file_name += f"_{args.model.split('/')[-1]}" + result_file_name += f"_{args.dataset}" + result_file_name += f"_{args.num_prompts}" + result_file_name += f"_out{args.output_len}" + result_file_name += ".txt" + else: + result_file_name = None + + input_requests = sample_requests(tokenizer, args) + + benchmark_result, ret = asyncio.run( + benchmark( + backend=backend, + api_url=api_url, + base_url=base_url, + model_id=model_id, + tokenizer=tokenizer, + input_requests=input_requests, + request_rate=args.request_rate, + burstiness=args.burstiness, + disable_tqdm=args.disable_tqdm, + profile=args.profile, + selected_percentile_metrics=args.percentile_metrics.split(","), + selected_percentiles=[ + float(p) for p in args.metric_percentiles.split(",") + ], + ignore_eos=args.ignore_eos, + max_concurrency=args.max_concurrency, + guided_decoding_ratio=args.guided_decoding_ratio, + guided_decoding_backend=args.guided_decoding_backend, + )) + + # Save config and results to json + score = evaluate(ret, args) + print("correct_rate(%)", score, '\n') + if args.save_results: + results = { + "backend": + backend, + "model_id": + model_id, + "tokenizer_id": + tokenizer_id, + "num_prompts": + args.num_prompts, + "request_rate": + args.request_rate if args.request_rate < float("inf") else "inf", + "burstiness": + args.burstiness, + "max_concurrency": + args.max_concurrency, + "correct_rate(%)": + score + } + results = {"outputs": ret, **results, **benchmark_result} + + # Save to file + if args.result_filename: + result_file_name = args.result_filename + if args.result_dir: + result_file_name = os.path.join(args.result_dir, result_file_name) + with open(result_file_name, "w", encoding='utf-8') as outfile: + json.dump(results, outfile, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the online serving throughput.") + parser.add_argument( + "--backend", + type=str, + default="vllm", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/completions", + help="API endpoint.", + ) + parser.add_argument( + "--dataset", + default='json', + choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench']) + parser.add_argument("--json_schema_path", + type=str, + default=None, + help="Path to json schema.") + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.") + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help= + "Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process.", + ) + parser.add_argument( + "--output-len", + type=int, + default=128, + help="Number of output tokens.", + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--save-results", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " + "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-seperated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " + "Default value is \"ttft,tpot,itl\".") + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-seperated list of percentiles for selected metrics. " + "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " + "Default value is \"99\". " + "Use \"--percentile-metrics\" to select metrics.", + ) + parser.add_argument("--no-guided-decoding", + action='store_true', + default=False, + help="Whether to disable JSON decoding or not.") + parser.add_argument("--guided-decoding-ratio", + type=float, + default=1.0, + help="Ratio of Guided Decoding requests") + parser.add_argument("--guided-decoding-backend", + type=str, + choices=["outlines", "lm-format-enforcer", "xgrammar"], + default="xgrammar", + help="Backend to use for guided decoding") + + args = parser.parse_args() + main(args) diff --git a/vllm/benchmarks/benchmark_throughput.py b/vllm/benchmarks/benchmark_throughput.py index ee41c8ea3..1e5967bd9 100644 --- a/vllm/benchmarks/benchmark_throughput.py +++ b/vllm/benchmarks/benchmark_throughput.py @@ -4,10 +4,11 @@ import json import random import time -from typing import List, Optional, Tuple +from typing import List, Optional import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -15,16 +16,56 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) +from vllm.inputs import TextPrompt +from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. + + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") @@ -33,23 +74,36 @@ def sample_requests( dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] - for i in range(len(dataset)): + filtered_dataset: List[SampleRequest] = [] + for data in dataset: if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + # Tokenize the prompts and completions. - prompt = dataset[i][0] prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] completion_token_ids = tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids @@ -60,13 +114,17 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -74,17 +132,19 @@ def run_vllm( llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) use_beam_search = False @@ -94,11 +154,11 @@ def run_vllm( llm.generate(prompts, sampling_params, use_tqdm=True) end = time.perf_counter() else: - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] # output_len should be the same for all requests. output_len = requests[0][2] - for prompt, input_len, _output_len in requests: - assert _output_len == output_len + for request in requests: + assert request.expected_output_len == output_len start = time.perf_counter() llm.beam_search( prompts, @@ -112,7 +172,7 @@ def run_vllm( async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -123,17 +183,19 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) generators = [] @@ -149,7 +211,7 @@ async def run_vllm_async( def run_hf( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tokenizer: PreTrainedTokenizerBase, n: int, @@ -207,14 +269,14 @@ def run_hf( def run_mii( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], model: str, tensor_parallel_size: int, output_len: int, ) -> float: from mii import client, serve llm = serve(model, tensor_parallel=tensor_parallel_size) - prompts = [prompt for prompt, _, _ in requests] + prompts = [request.prompt for request in requests] start = time.perf_counter() llm.generate(prompts, max_new_tokens=output_len) @@ -232,23 +294,41 @@ def main(args: argparse.Namespace): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: - # Synthesize a prompt with the given input length. - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. - for i in range(-10, 10): - prompt = "hi " * (args.input_len + i) - tokenized_prompt = tokenizer(prompt).input_ids - if len(tokenized_prompt) == args.input_len: - break - else: - raise ValueError( - f"Failed to synthesize a prompt with {args.input_len} tokens.") - requests = [(prompt, args.input_len, args.output_len) - for _ in range(args.num_prompts)] + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) + for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = tokenizer.decode(candidate_ids) + tokenized_len = len(tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len)) else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -270,9 +350,15 @@ def main(args: argparse.Namespace): args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - total_output_tokens = sum(output_len for _, _, output_len in requests) + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " f"{total_output_tokens / elapsed_time:.2f} output tokens/s") @@ -299,7 +385,9 @@ def main(args: argparse.Namespace): parser.add_argument("--dataset", type=str, default=None, - help="Path to the dataset.") + help="Path to the dataset. The dataset is expected to " + "be a json in form of List[Dict[..., conversations: " + "List[Dict[..., value: ]]]]") parser.add_argument("--input-len", type=int, default=None, diff --git a/vllm/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/vllm/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 000000000..2924ea4a4 --- /dev/null +++ b/vllm/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,144 @@ +#!/bin/bash + +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill -f pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate "inf" + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate "$qps" + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/vllm/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/vllm/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 000000000..d8d9e976d --- /dev/null +++ b/vllm/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,164 @@ +#!/bin/bash + +# Requirement: 8x H100 GPUs. + + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pgrep pt_main_thread | xargs -r kill -9 + pgrep python3 | xargs -r kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +launch_chunked_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.6 & + wait_for_server 8100 + wait_for_server 8200 + python3 round_robin_proxy.py & + sleep 1 +} + + +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + # disagg prefill + CUDA_VISIBLE_DEVICES=0 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.6 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & + + wait_for_server 8100 + wait_for_server 8200 + python3 disagg_prefill_proxy_server.py & + sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=100 + qps=$1 + prefix_len=50 + input_len=1024 + output_len=$2 + tag=$3 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len "$output_len" \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename "$tag"-qps-"$qps".json \ + --request-rate "$qps" + + sleep 2 + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx matplotlib aiohttp + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_output_len=6 + + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes + + python3 visualize_benchmark_results.py + +} + + +main "$@" diff --git a/vllm/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/vllm/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 000000000..4058b1c0a --- /dev/null +++ b/vllm/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,61 @@ +import os + +import aiohttp +from quart import Quart, make_response, request + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +async def forward_request(url, data): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + async with session.post(url=url, json=data, + headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked( + 1024): + yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route('/v1/completions', methods=['POST']) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request['max_tokens'] = 1 + + # finish prefill + async for _ in forward_request('http://localhost:8100/v1/completions', + prefill_request): + continue + + # return decode + generator = forward_request('http://localhost:8200/v1/completions', + original_request_data) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == '__main__': + app.run(port=8000) diff --git a/vllm/benchmarks/disagg_benchmarks/round_robin_proxy.py b/vllm/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 000000000..6eb5f6398 --- /dev/null +++ b/vllm/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,60 @@ +import asyncio +import itertools + +import aiohttp +from aiohttp import web + + +class RoundRobinProxy: + + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) + + async def handle_request(self, request): + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse(status=response.status, + headers=response.headers) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) + + +async def main(): + proxy = RoundRobinProxy([8100, 8200]) + app = web.Application() + app.router.add_route('*', '/{path:.*}', proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8000) + await site.start() + + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/vllm/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/vllm/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 000000000..e59d8bb0e --- /dev/null +++ b/vllm/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,46 @@ +import json + +import matplotlib.pyplot as plt +import pandas as pd + +if __name__ == "__main__": + + data = [] + for name in ['disagg_prefill', 'chunked_prefill']: + for qps in [2, 4, 6, 8]: + with open(f"results/{name}-qps-{qps}.json") as f: + x = json.load(f) + x['name'] = name + x['qps'] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df['name'] == 'disagg_prefill'] + chu_df = df[df['name'] == 'chunked_prefill'] + + plt.style.use('bmh') + plt.rcParams['font.size'] = 20 + + for key in [ + 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', + 'median_itl_ms', 'p99_itl_ms' + ]: + + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot(dis_df['qps'], + dis_df[key], + label='disagg_prefill', + marker='o', + linewidth=4) + plt.plot(chu_df['qps'], + chu_df[key], + label='chunked_prefill', + marker='o', + linewidth=4) + ax.legend() + + ax.set_xlabel('QPS') + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f'results/{key}.png') + plt.close(fig) diff --git a/vllm/benchmarks/kernels/benchmark_layernorm.py b/vllm/benchmarks/kernels/benchmark_layernorm.py index 92f6053cc..7acea6087 100644 --- a/vllm/benchmarks/kernels/benchmark_layernorm.py +++ b/vllm/benchmarks/kernels/benchmark_layernorm.py @@ -3,8 +3,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -16,7 +16,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") layer = RMSNorm(hidden_size).to(dtype=dtype) diff --git a/vllm/benchmarks/kernels/benchmark_machete.py b/vllm/benchmarks/kernels/benchmark_machete.py index b70c4b94c..46bab74ae 100644 --- a/vllm/benchmarks/kernels/benchmark_machete.py +++ b/vllm/benchmarks/kernels/benchmark_machete.py @@ -2,8 +2,10 @@ import copy import itertools import math +import os import pickle as pkl import time +from dataclasses import dataclass from itertools import product from typing import Callable, Iterable, List, Optional, Tuple @@ -15,11 +17,12 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales) + GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N, marlin_permute_scales, + marlin_zero_points) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( MarlinWorkspace) from vllm.model_executor.layers.quantization.utils.quant_utils import ( - gptq_pack, pack_rows, quantize_weights) + pack_rows, quantize_weights) from vllm.scalar_type import ScalarType, scalar_types from vllm.utils import FlexibleArgumentParser @@ -27,149 +30,350 @@ DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024] DEFAULT_TP_SIZES = [1] +NVTX_PROFILE = os.environ.get("NVTX_PROFILE", False) + +if NVTX_PROFILE: + import nvtx + + +def terse_type_name(dt): + return { + torch.bfloat16: "bf16", + torch.float16: "fp16", + torch.int8: "int8", + torch.float8_e4m3fn: "fp8", + torch.bfloat16: "bf16", + torch.float: "float", + torch.int: "int", + }[dt] + + +@dataclass +class BenchmarkTensors: + w_ref: torch.Tensor + a: torch.Tensor + + w_q: torch.Tensor + group_size: Optional[int] + wtype: ScalarType + w_g_s: torch.Tensor + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +def rand_data(shape, dtype=torch.float16, scale=1): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - 0.3).to(dtype) + else: + return torch.randint(-15, 15, shape, dtype=dtype, device="cuda") + + +def quantize_and_pack(atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True) -def machete_pack_weights(w_q: torch.tensor, wtype: ScalarType) -> torch.tensor: w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) - w_q = w_q.t().contiguous().t() # make col major - return ops.machete_prepack_B(w_q, wtype) + return w_ref, w_q, w_s, w_zp -def make_bench_tensors( - atype: torch.dtype, wtype: ScalarType, group_size: int, m: int, n: int, - k: int -) -> Tuple[torch.tensor, List[Tuple[torch.tensor, torch.tensor, torch.tensor, - torch.tensor]]]: - assert wtype.is_integer(), "TODO: support floating point weights" +def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig, + group_size: Optional[int]) -> List[BenchmarkTensors]: + m, n, k = shape # we want to make sure that weights don't fit into L2 cache between runs so # we construct enough weights to exceed L2 cache, which is 50mb on a H100 # so we target total weight size > 2*50mb - num_weights = math.ceil(2 * 50 * 1024**2 * 8 / (k * n * wtype.size_bits)) - - a = torch.randn((m, k), device="cuda", dtype=atype) * 5 - weights = [ - torch.randn((k, n), device="cuda", dtype=atype) - for _ in range(num_weights) - ] - quanitized_weights = [ - quantize_weights(w, wtype, group_size) for w in weights - ] - - return a, quanitized_weights + num_weights = math.ceil(2 * 50 * 1024**2 * 8 / + (k * n * types.weight_type.size_bits)) + + a = rand_data((m, k), types.act_type, scale=5) + + benchmark_tensors: List[BenchmarkTensors] = [] + for _ in range(num_weights): + w = rand_data((k, n), types.act_type, scale=5) + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = quantize_and_pack( + a.dtype, w, types.weight_type, types.group_scale_type, group_size, + types.group_zero_type is not None) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + w_ref = w_ref.to(torch.float32) + + w_ch_s = None if types.channel_scale_type is None else\ + rand_data((n,), types.channel_scale_type) + w_tok_s = None if types.token_scale_type is None else\ + rand_data((m,), types.token_scale_type) + + benchmark_tensors.append( + BenchmarkTensors(w_ref=w_ref, + a=a, + w_q=w_q_packed, + wtype=types.weight_type, + w_g_s=w_s, + w_g_zp=w_zp, + group_size=group_size, + w_ch_s=w_ch_s, + w_tok_s=w_tok_s)) + + return benchmark_tensors + + +def torch_matmul_f16_create_bench_fn(bt: BenchmarkTensors) -> Callable: + a = bt.a + w = bt.w_ref.to(bt.a.dtype) # use float reference tensor + if a.dtype not in [torch.float16, torch.bfloat16]: + a = a.to(torch.float16) + w = w.to(torch.float16) + return lambda: torch.matmul(a, w) + + +def cutlass_scaled_mm_create_bench_fn(bt: BenchmarkTensors) -> Callable: + if bt.w_ch_s is not None and bt.w_tok_s is not None: + scale_a = bt.w_tok_s.to(torch.float32) + scale_b = bt.w_ch_s.to(torch.float32) + else: + scale_a = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + scale_b = torch.tensor(1.0, dtype=torch.float32, device=bt.a.device) + w_col_major = bt.w_ref.to(bt.a.dtype).t().contiguous().t() + return lambda: ops.cutlass_scaled_mm( + bt.a, w_col_major, scale_a, scale_b, out_dtype=torch.float16) + + +def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable: + device = bt.a.device + + workspace = MarlinWorkspace(bt.w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + if bt.w_g_zp is None: + w_zp = torch.empty(0, dtype=torch.int, device=device) + else: + w_zp = marlin_zero_points(bt.w_g_zp, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.wtype.size_bits) + + if bt.group_size is None: + w_s = torch.tensor([], device="cuda", dtype=torch.half) + else: + w_s = marlin_permute_scales(bt.w_g_s, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.group_size) + + sort_indices = torch.empty(0, dtype=torch.int, device=device) + g_idx = torch.empty(0, dtype=torch.int, device=device) + w_q = ops.gptq_marlin_repack(bt.w_q, sort_indices, bt.w_ref.shape[0], + bt.w_ref.shape[1], bt.wtype.size_bits) + + if bt.a.dtype.is_floating_point: + assert bt.w_ch_s is None + assert bt.w_tok_s is None + assert bt.group_size is not None + + fn = lambda: ops.gptq_marlin_gemm(a=bt.a, + b_q_weight=w_q, + b_scales=w_s, + b_zeros=w_zp, + g_idx=g_idx, + perm=sort_indices, + workspace=workspace.scratch, + b_q_type=bt.wtype, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0], + is_k_full=True, + is_zp_float=False) + else: + assert bt.a.dtype == torch.int8 + assert bt.wtype == scalar_types.uint4b8 + + if bt.w_ch_s is not None: + s_ch = bt.w_ch_s.to(torch.float32) + else: + s_ch = torch.ones(bt.w_ref.shape[1], + dtype=torch.float32, + device=device) + + if bt.w_tok_s is not None: + s_tok = bt.w_tok_s.to(torch.float32) + else: + s_tok = torch.ones(bt.a.shape[0], + dtype=torch.float32, + device=device) + + fn = lambda: ops.marlin_qqq_gemm(a=bt.a, + b_q_weight=w_q, + s_group=w_s, + s_tok=s_tok, + s_ch=s_ch, + workspace=workspace.scratch, + size_m=bt.a.shape[0], + size_n=bt.w_ref.shape[1], + size_k=bt.w_ref.shape[0]) + + return fn + + +def machete_create_bench_fn(bt: BenchmarkTensors, + out_type=torch.dtype, + schedule=None) -> Callable: + w_q = bt.w_q.t().contiguous().t() # make col major + w_q = ops.machete_prepack_B(w_q, bt.a.dtype, bt.wtype, + None if bt.w_g_s is None else bt.w_g_s.dtype) + + w_g_zp = bt.w_g_zp + if w_g_zp is not None: + w_g_zp = -1 * bt.w_g_s * (w_g_zp.to(bt.w_g_s.dtype)) + + return lambda: ops.machete_mm( + a=bt.a, + b_q=bt.w_q, + b_type=bt.wtype, + b_group_scales=bt.w_g_s, + b_group_zeros=w_g_zp, + b_group_size=bt.group_size, + b_channel_scales=bt.w_ch_s, + a_token_scales=bt.w_tok_s, + out_type=out_type, + schedule=schedule, + ) # impl - # bench -def bench_fn(label: str, sub_label: str, description: str, - fn: Callable) -> TMeasurement: - min_run_time = 1 - return TBenchmark.Timer( - stmt="fn()", + +def bench_fns(label: str, sub_label: str, description: str, + fns: List[Callable]): + + min_run_time = 1 if not NVTX_PROFILE else 0.1 + res = TBenchmark.Timer( + stmt=""" + for fn in fns: + fn() + """, globals={ - "fn": fn + "fns": fns }, label=label, sub_label=sub_label, description=description, ).blocked_autorange(min_run_time=min_run_time) + if NVTX_PROFILE: + with nvtx.annotate("mm-bench"), nvtx.annotate( + f"{label}|{sub_label}|{description}"): + fns[0]() -def loop_over_weights( - a: torch.tensor, weights: List[Tuple[torch.tensor, torch.tensor, - torch.tensor, torch.tensor]], - fn: Callable[[torch.tensor, torch.tensor, torch.tensor, torch.tensor], - None]): - for w_ref, w_q, w_s, _ in weights: - fn(a, w_ref, w_q, w_s) + return res _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None -def bench(atype: torch.dtype, - wtype: ScalarType, +def bench(types: TypeConfig, group_size: int, m: int, k: int, n: int, label: str, sub_label: str, - benchmark_marlinv1: bool = True, - sweep_schedules: bool = True) -> Iterable[TMeasurement]: - global _SWEEP_SCHEDULES_RESULTS - - a, weights = make_bench_tensors(atype, wtype, group_size, m, n, k) - sub_label += f", L={len(weights)}" - - weights_machete = [(w_ref, machete_pack_weights(w_q, wtype), w_s, w_zp) - for w_ref, w_q, w_s, w_zp in weights] + sweep_schedules: bool = True) -> List[TMeasurement]: + benchmark_tensors = create_bench_tensors((m, n, k), types, group_size) + sub_label += f", L={len(benchmark_tensors)}" + + name_type_string = f"W{types.weight_type}"+\ + f"-A{terse_type_name(types.act_type)}" + if types.group_scale_type is not None: + name_type_string += f"-GS{terse_type_name(types.group_scale_type)}" + if types.group_zero_type is not None: + name_type_string += f"-GZ{terse_type_name(types.group_zero_type)}" + if group_size is not None: + name_type_string += f"-G{group_size}" + if types.channel_scale_type is not None: + name_type_string += f"-CS{terse_type_name(types.channel_scale_type)}" + if types.token_scale_type is not None: + name_type_string += f"-TS{terse_type_name(types.token_scale_type)}" timers = [] # pytorch impl timers.append( - bench_fn( - label, sub_label, "torch.matmul", lambda: loop_over_weights( - a, - weights, - lambda a, w_ref, w_q, w_s: torch.matmul(a, w_ref), - ))) + bench_fns( + label, sub_label, "torch.matmul (fp16)", + [torch_matmul_f16_create_bench_fn(bt) + for bt in benchmark_tensors])) - if benchmark_marlinv1: - w_ref = weights[0][0] - - w_zp_empty = torch.empty(0, dtype=torch.int, device=w_ref.device) - sort_indices = torch.empty(0, dtype=torch.int, device=w_ref.device) - g_idx = torch.empty(0, dtype=torch.int, device=w_ref.device) - - def marlinv1_pack_weights(w_q: torch.tensor) -> torch.tensor: - w_q_gptq = gptq_pack(w_q, wtype.size_bits, *w_ref.shape) - return ops.gptq_marlin_repack(w_q_gptq, sort_indices, *w_ref.shape, - wtype.size_bits) - - def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor: - return marlin_permute_scales(w_s, *w_ref.shape, group_size) - - weights_marlinv1 = [(w_ref, marlinv1_pack_weights(w_q), - marlinv1_permute_scales(w_s), w_zp) - for w_ref, w_q, w_s, w_zp in weights] - - workspace = MarlinWorkspace(w_ref.shape[1], GPTQ_MARLIN_MIN_THREAD_N, - GPTQ_MARLIN_MAX_PARALLEL) - - # marlinv1 + if types.act_type == torch.int8 or types.act_type == torch.float8_e4m3fn: + timers.append( + bench_fns( + label, sub_label, + f"cutlass_scaled_mm ({terse_type_name(types.act_type)})", [ + cutlass_scaled_mm_create_bench_fn(bt) + for bt in benchmark_tensors + ])) + + if types.act_type != torch.float8_e4m3fn: timers.append( - bench_fn( - label, sub_label, "marlin_orig", lambda: loop_over_weights( - a, weights_marlinv1, lambda a, w_ref, w_q, w_s: ops. - gptq_marlin_gemm(a, - w_q, - w_s, - w_zp_empty, - g_idx, - sort_indices, - workspace.scratch, - wtype, - size_m=a.shape[0], - size_n=w_ref.shape[1], - size_k=w_ref.shape[0], - is_k_full=True)))) + bench_fns(label, sub_label, f"marlin ({name_type_string})", + [marlin_create_bench_fn(bt) + for bt in benchmark_tensors])) # machete timers.append( - bench_fn( - label, sub_label, "machete_heuristic", lambda: loop_over_weights( - a, weights_machete, lambda a, _, w_q, w_s: ops.machete_gemm( - a, w_q, wtype, b_scales=w_s, b_group_size=group_size)))) + bench_fns(label, sub_label, f"machete ({name_type_string})", [ + machete_create_bench_fn(bt, out_type=types.output_type) + for bt in benchmark_tensors + ])) if sweep_schedules: + global _SWEEP_SCHEDULES_RESULTS + print("Finding best schedule for machete") best = None best_schedule = None - schedules = ops.machete_supported_schedules(wtype) + schedules = ops.machete_supported_schedules( + a_type=types.act_type, + b_type=types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_zero_type, + token_scales_type=types.token_scale_type, + channel_scales_type=types.channel_scale_type, + out_type=types.output_type) + + if schedules is None or len(schedules) == 0: + raise ValueError("No schedules found to sweep") + for schedule in reversed(schedules): schedule_M = int(schedule.split("_")[0].split("x")[1]) @@ -177,16 +381,11 @@ def marlinv1_permute_scales(w_s: torch.tensor) -> torch.tensor: if schedule_M >= 2 * max(m, 16) or schedule_M < m // 4: continue - def run(a, _, w_q, w_s, schedule=schedule): - ops.machete_gemm(a, - w_q, - wtype, - w_s, - b_group_size=group_size, - schedule=schedule) - - res = bench_fn(label, sub_label, "machete_best", - lambda: loop_over_weights(a, weights_machete, run)) + res = bench_fns(label, sub_label, "machete_best", [ + machete_create_bench_fn( + bt, out_type=types.output_type, schedule=schedule) + for bt in benchmark_tensors + ]) results_row = { "M": m, @@ -213,25 +412,33 @@ def run(a, _, w_q, w_s, schedule=schedule): # runner -def print_timers(timers: Iterable[TMeasurement]): +def print_timers(timers: List[TMeasurement]): compare = TBenchmark.Compare(timers) compare.print() -def run(dtype: torch.dtype, sweep_schedules: bool, - MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: +def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]: + types = TypeConfig( + act_type=args.act_type, + weight_type=scalar_types.uint4b8 if args.group_zero_type is None \ + else scalar_types.uint4, + output_type=args.out_type, + group_scale_type=args.group_scale_type, + group_zero_type=args.group_zero_type, + channel_scale_type=args.channel_scale_type, + token_scale_type=args.token_scale_type, + ) - results = [] + results: List[TMeasurement] = [] for m, k, n in MKNs: - timers = bench(dtype, - scalar_types.uint4b8, - 128, + timers = bench(types, + args.group_size, m, k, n, - f"{dtype}-gemm", + f"{args.act_type}-gemm", f"MKN=({m}x{k}x{n})", - sweep_schedules=sweep_schedules) + sweep_schedules=args.sweep_schedules) print_timers(timers) results.extend(timers) @@ -240,7 +447,7 @@ def run(dtype: torch.dtype, sweep_schedules: bool, # output makers def make_output( - data: Iterable[TMeasurement], + data: List[TMeasurement], MKNs: Iterable[Tuple[int, int, int]], base_description: str, timestamp=None, @@ -262,17 +469,16 @@ def run_square_bench(args): dim_sizes = list( range(args.dim_start, args.dim_end + 1, args.dim_increment)) MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes)) - data = run(args.dtype, args.sweep_schedules, MKNs) make_output(data, MKNs, f"square_bench-{args.dtype}") def run_range_bench(args): - m_start, k_start, n_start = [int(x) for x in args.dim_start.split(",")] - m_end, k_end, n_end = [int(x) for x in args.dim_end.split(",")] + m_start, k_start, n_start = (int(x) for x in args.dim_start.split(",")) + m_end, k_end, n_end = (int(x) for x in args.dim_end.split(",")) m_increment, k_increment, n_increment = \ - [int(x) for x in args.dim_increment.split(",")] + (int(x) for x in args.dim_increment.split(",")) Ms = list(range(m_start, m_end + 1, m_increment)) Ks = list(range(k_start, k_end + 1, k_increment)) Ns = list(range(n_start, n_end + 1, n_increment)) @@ -306,33 +512,49 @@ def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]: for k, n in KNs: MKNs.append((m, k, n)) - data = run(args.dtype, args.sweep_schedules, MKNs) + data = run(args, MKNs) model_bench_data.append(data) + type_string = f"{args.act_type}" + # Print all results for data, model_tp in zip(model_bench_data, models_tps): model, tp_size = model_tp - print(f"== Results {args.dtype} {model}-TP{tp_size} ====") + print(f"== Results {type_string} {model}-TP{tp_size} ====") print_timers(data) - timestamp = int(time.time()) + timestr = time.strftime("%Y%m%d-%H%M%S") - all_data = [] + all_results = [] for d in model_bench_data: - all_data.extend(d) + all_results.extend(d) + # pickle all data - with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f: - pkl.dump(all_data, f) + with open(f"model_bench-{type_string}-{timestr}.pkl", "wb") as f: + args_dict = vars(args) + args_dict.pop("func") + pkl.dump({ + "args": args_dict, + "results": all_results, + }, f) if __name__ == "__main__": def to_torch_dtype(dt): - if dt == "bfloat16": - return torch.bfloat16 - if dt == "float16": - return torch.float16 - raise ValueError("unsupported dtype") + return { + "bfloat16": torch.bfloat16, + "float16": torch.float16, + "int8": torch.int8, + "float8_e4m3fn": torch.float8_e4m3fn, + "int": torch.int, + "float": torch.float, + }[dt] + + class ToTorchDtype(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + setattr(namespace, self.dest, to_torch_dtype(values)) parser = FlexibleArgumentParser( description=""" @@ -352,12 +574,42 @@ def to_torch_dtype(dt): """, # noqa: E501 formatter_class=argparse.RawTextHelpFormatter, ) - parser.add_argument( - "--dtype", - type=to_torch_dtype, + "--act-type", + action=ToTorchDtype, required=True, - help="Available options are ['bfloat16', 'float16']", + choices=['bfloat16', 'float16', 'int8', 'float8_e4m3fn'], + ) + parser.add_argument( + "--group-scale-type", + action=ToTorchDtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--group-zero-type", + type=to_torch_dtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--channel-scale-type", + action=ToTorchDtype, + choices=['float'], + ) + parser.add_argument( + "--token-scale-type", + action=ToTorchDtype, + choices=['float'], + ) + parser.add_argument( + "--out-type", + action=ToTorchDtype, + choices=['bfloat16', 'float16'], + ) + parser.add_argument( + "--group-size", + type=int, + help="Available options are ['None', '-1', '128'], default=128", + default=128, ) parser.add_argument( "--sweep-schedules", diff --git a/vllm/benchmarks/kernels/benchmark_marlin.py b/vllm/benchmarks/kernels/benchmark_marlin.py index 536c133bb..8fb44e3a3 100644 --- a/vllm/benchmarks/kernels/benchmark_marlin.py +++ b/vllm/benchmarks/kernels/benchmark_marlin.py @@ -131,7 +131,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, @@ -141,7 +141,7 @@ def bench_run(results: List[benchmark.Measurement], model: str, results.append( benchmark.Timer( stmt= - "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True)", # noqa: E501 + "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501 globals=globals, label=label, sub_label=sub_label, diff --git a/vllm/benchmarks/kernels/benchmark_moe.py b/vllm/benchmarks/kernels/benchmark_moe.py index 4f88e8e6e..8f538c21f 100644 --- a/vllm/benchmarks/kernels/benchmark_moe.py +++ b/vllm/benchmarks/kernels/benchmark_moe.py @@ -10,7 +10,8 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser class BenchmarkConfig(TypedDict): @@ -167,7 +168,7 @@ class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - seed_everything(seed) + current_platform.seed_everything(seed) self.seed = seed def benchmark( @@ -181,7 +182,7 @@ def benchmark( use_fp8_w8a8: bool, use_int8_w8a16: bool, ) -> Tuple[Dict[str, int], float]: - seed_everything(self.seed) + current_platform.seed_everything(self.seed) dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) diff --git a/vllm/benchmarks/kernels/benchmark_paged_attention.py b/vllm/benchmarks/kernels/benchmark_paged_attention.py index 87864d038..14eef00b8 100644 --- a/vllm/benchmarks/kernels/benchmark_paged_attention.py +++ b/vllm/benchmarks/kernels/benchmark_paged_attention.py @@ -5,8 +5,9 @@ import torch from vllm import _custom_ops as ops +from vllm.platforms import current_platform from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - create_kv_caches_with_random, seed_everything) + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -28,7 +29,7 @@ def main( device: str = "cuda", kv_cache_dtype: Optional[str] = None, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty(num_seqs, diff --git a/vllm/benchmarks/kernels/benchmark_quant.py b/vllm/benchmarks/kernels/benchmark_quant.py index 743a5744e..1d6248344 100644 --- a/vllm/benchmarks/kernels/benchmark_quant.py +++ b/vllm/benchmarks/kernels/benchmark_quant.py @@ -3,8 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, - seed_everything) +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser @torch.inference_mode() @@ -17,7 +17,7 @@ def main(num_tokens: int, do_profile: bool = False, num_warmup_iters: int = 5, num_iters: int = 100) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device("cuda") x = torch.randn(num_tokens, hidden_size, dtype=dtype) diff --git a/vllm/benchmarks/kernels/benchmark_rope.py b/vllm/benchmarks/kernels/benchmark_rope.py index 784b1cf98..250d50516 100644 --- a/vllm/benchmarks/kernels/benchmark_rope.py +++ b/vllm/benchmarks/kernels/benchmark_rope.py @@ -6,7 +6,8 @@ from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, get_rope) -from vllm.utils import FlexibleArgumentParser, seed_everything +from vllm.platforms import current_platform +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/vllm/benchmarks/kernels/graph_machete_bench.py b/vllm/benchmarks/kernels/graph_machete_bench.py index de608fd05..7d0bd8415 100644 --- a/vllm/benchmarks/kernels/graph_machete_bench.py +++ b/vllm/benchmarks/kernels/graph_machete_bench.py @@ -20,10 +20,11 @@ args = parser.parse_args() with open(args.filename, 'rb') as f: - data: List[TMeasurement] = pickle.load(f) + data = pickle.load(f) + raw_results: List[TMeasurement] = data["results"] results = defaultdict(lambda: list()) - for v in data: + for v in raw_results: result = re.search(r"MKN=\(\d+x(\d+x\d+)\)", v.task_spec.sub_label) if result is not None: KN = result.group(1) diff --git a/vllm/benchmarks/kernels/weight_shapes.py b/vllm/benchmarks/kernels/weight_shapes.py index 25ec9d602..51f24f3ba 100644 --- a/vllm/benchmarks/kernels/weight_shapes.py +++ b/vllm/benchmarks/kernels/weight_shapes.py @@ -40,4 +40,10 @@ ([8192, 57344], 1), ([28672, 8192], 0), ], + "meta-llama/Llama-3.1-405b-hf": [ + ([16384, 18432], 1), + ([16384, 16384], 0), + ([16384, 106496], 1), + ([53248, 16384], 0), + ], } diff --git a/vllm/benchmarks/launch_tgi_server.sh b/vllm/benchmarks/launch_tgi_server.sh index 8c5cd454f..ba7383d88 100755 --- a/vllm/benchmarks/launch_tgi_server.sh +++ b/vllm/benchmarks/launch_tgi_server.sh @@ -4,13 +4,13 @@ PORT=8000 MODEL=$1 TOKENS=$2 -docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ - -v $PWD/data:/data \ +docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ + -v "$PWD/data:/data" \ ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id $MODEL \ + --model-id "$MODEL" \ --sharded false \ --max-input-length 1024 \ --max-total-tokens 2048 \ --max-best-of 5 \ --max-concurrent-requests 5000 \ - --max-batch-total-tokens $TOKENS + --max-batch-total-tokens "$TOKENS" diff --git a/vllm/benchmarks/structured_schemas/structured_schema_1.json b/vllm/benchmarks/structured_schemas/structured_schema_1.json new file mode 100644 index 000000000..600369846 --- /dev/null +++ b/vllm/benchmarks/structured_schemas/structured_schema_1.json @@ -0,0 +1,113 @@ +{ + "$schema": + "https://json-schema.org/draft/2020-12/schema", + "title": + "User Profile", + "type": + "object", + "properties": { + "userId": { + "type": "string", + "description": "Unique identifier for the user." + }, + "personalInfo": { + "type": "object", + "properties": { + "firstName": { + "type": "string", + "description": "The user's first name." + }, + "lastName": { + "type": "string", + "description": "The user's last name." + }, + "age": { + "type": "integer", + "minimum": 0, + "description": "The user's age." + }, + "phoneNumbers": { + "type": + "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["home", "work", "mobile"], + "description": "Type of phone number." + }, + "number": { + "type": "string", + "pattern": "^\\+?[1-9]\\d{1,14}$", + "description": "Phone number in E.164 format." + } + }, + "required": ["type", "number"] + }, + "description": + "List of phone numbers associated with the user." + } + }, + "required": ["firstName", "lastName"] + }, + "address": { + "type": "object", + "properties": { + "street": { + "type": "string", + "description": "Street address." + }, + "city": { + "type": "string", + "description": "City name." + }, + "state": { + "type": "string", + "description": "State or province." + }, + "postalCode": { + "type": "string", + "pattern": "^\\d{5}(-\\d{4})?$", + "description": "Postal code." + }, + "country": { + "type": "string", + "description": "Country name." + } + }, + "required": ["street", "city", "state", "postalCode", "country"] + }, + "preferences": { + "type": "object", + "properties": { + "newsletterSubscribed": { + "type": + "boolean", + "description": + "Indicates if the user is subscribed to the newsletter." + }, + "favoriteCategories": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of user's favorite categories." + } + }, + "required": ["newsletterSubscribed"] + }, + "accountStatus": { + "type": "string", + "enum": ["active", "inactive", "suspended"], + "description": "Current status of the user's account." + }, + "registrationDate": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 formatted date-time of user registration." + } + }, + "required": + ["userId", "personalInfo", "address", "accountStatus", "registrationDate"] +} \ No newline at end of file diff --git a/vllm/cmake/cpu_extension.cmake b/vllm/cmake/cpu_extension.cmake index 7237d246d..68f7ca1af 100644 --- a/vllm/cmake/cpu_extension.cmake +++ b/vllm/cmake/cpu_extension.cmake @@ -16,6 +16,12 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # # Check the compile flags # + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + list(APPEND CXX_COMPILE_FLAGS + "-mf16c" + ) +endif() list(APPEND CXX_COMPILE_FLAGS "-fopenmp" "-DVLLM_CPU_EXTENSION") @@ -52,6 +58,8 @@ find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) find_isa(${CPUINFO} "POWER10" POWER10_FOUND) find_isa(${CPUINFO} "POWER9" POWER9_FOUND) +find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support +find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS @@ -71,9 +79,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() + elseif (AVX2_FOUND) list(APPEND CXX_COMPILE_FLAGS "-mavx2") message(WARNING "vLLM CPU backend using AVX2 ISA") + elseif (POWER9_FOUND OR POWER10_FOUND) message(STATUS "PowerPC detected") # Check for PowerPC VSX support @@ -81,8 +91,20 @@ elseif (POWER9_FOUND OR POWER10_FOUND) "-mvsx" "-mcpu=native" "-mtune=native") + +elseif (ASIMD_FOUND) + message(STATUS "ARMv8 or later architecture detected") + if(ARM_BF16_FOUND) + message(STATUS "BF16 extension detected") + set(MARCH_FLAGS "-march=armv8.2-a+bf16+dotprod+fp16") + add_compile_definitions(ARM_BF16_SUPPORT) + else() + message(WARNING "BF16 functionality is not available") + set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16") + endif() + list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS}) else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.") endif() # @@ -92,7 +114,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.5.3 + GIT_TAG v3.6 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) @@ -152,4 +174,4 @@ define_gpu_extension_target( WITH_SOABI ) -message(STATUS "Enabling C extension.") +message(STATUS "Enabling C extension.") \ No newline at end of file diff --git a/vllm/collect_env.py b/vllm/collect_env.py index 80403d576..254c19b19 100644 --- a/vllm/collect_env.py +++ b/vllm/collect_env.py @@ -1,17 +1,19 @@ # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py -# Unlike the rest of the PyTorch this file must be python2 compliant. -# This script outputs relevant system environment info -# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` import datetime import locale import os import re import subprocess import sys +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` from collections import namedtuple +from vllm.envs import environment_variables + try: import torch TORCH_AVAILABLE = True @@ -52,6 +54,7 @@ 'vllm_version', # vllm specific field 'vllm_build_flags', # vllm specific field 'gpu_topo', # vllm specific field + 'env_vars', ]) DEFAULT_CONDA_PATTERNS = { @@ -512,6 +515,22 @@ def is_xnnpack_available(): else: return "N/A" +def get_env_vars(): + env_vars = '' + secret_terms=('secret', 'token', 'api', 'access', 'password') + report_prefix = ("TORCH", "NCCL", "PYTORCH", + "CUDA", "CUBLAS", "CUDNN", + "OMP_", "MKL_", + "NVIDIA") + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars def get_env_info(): run_lambda = run @@ -583,6 +602,7 @@ def get_version_or_na(cfg, prefix): vllm_version=vllm_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, + env_vars=get_env_vars(), ) @@ -631,6 +651,8 @@ def get_version_or_na(cfg, prefix): {vllm_build_flags} GPU Topology: {gpu_topo} + +{env_vars} """.strip() diff --git a/vllm/csrc/attention/attention_kernels.cu b/vllm/csrc/attention/attention_kernels.cuh similarity index 64% rename from vllm/csrc/attention/attention_kernels.cu rename to vllm/csrc/attention/attention_kernels.cuh index bcd170411..563e1438f 100644 --- a/vllm/csrc/attention/attention_kernels.cu +++ b/vllm/csrc/attention/attention_kernels.cuh @@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel( } // namespace vllm -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ - scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -// TODO(woosuk): Tune NUM_THREADS. -template -void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = - DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_seq_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V1(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V1(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v1_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); - -#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ - value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ - max_num_partitions); - -template -void paged_attention_v2_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V2(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V2(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v2_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& - tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V2_LAUNCHER_BLOCK_SIZE) -} - #undef WARP_SIZE #undef MAX #undef MIN diff --git a/vllm/csrc/attention/paged_attention_v1.cu b/vllm/csrc/attention/paged_attention_v1.cu new file mode 100644 index 000000000..741cd0c82 --- /dev/null +++ b/vllm/csrc/attention/paged_attention_v1.cu @@ -0,0 +1,196 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ + VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ + ((void*)vllm::paged_attention_v1_kernel), \ + shared_mem_size); \ + vllm::paged_attention_v1_kernel \ + <<>>( \ + out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ + scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ + alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +// TODO(woosuk): Tune NUM_THREADS. +template +void paged_attention_v1_launcher( + torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int padded_max_seq_len = + DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; + int logits_size = padded_max_seq_len * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len + // Keep that in sync with the logic here! + int shared_mem_size = std::max(logits_size, outputs_size); + + dim3 grid(num_heads, num_seqs, 1); + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; + case 64: + LAUNCH_PAGED_ATTENTION_V1(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V1(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V1(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V1(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V1(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V1(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V1(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V1(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v1_launcher( \ + out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ + seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); + +#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v1( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/vllm/csrc/attention/paged_attention_v2.cu b/vllm/csrc/attention/paged_attention_v2.cu new file mode 100644 index 000000000..6de8d0bdd --- /dev/null +++ b/vllm/csrc/attention/paged_attention_v2.cu @@ -0,0 +1,206 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template +void paged_attention_v2_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); + int logits_size = PARTITION_SIZE * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + + // For paged attention v2 kernel. + dim3 grid(num_heads, num_seqs, max_num_partitions); + int shared_mem_size = std::max(logits_size, outputs_size); + // For paged attention v2 reduce kernel. + dim3 reduce_grid(num_heads, num_seqs); + int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; + case 64: + LAUNCH_PAGED_ATTENTION_V2(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V2(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V2(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V2(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V2(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V2(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v2( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/vllm/csrc/cpu/attention.cpp b/vllm/csrc/cpu/attention.cpp index abb4e3bea..e21832ba7 100644 --- a/vllm/csrc/cpu/attention.cpp +++ b/vllm/csrc/cpu/attention.cpp @@ -22,6 +22,24 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { +#ifdef __powerpc64__ + // Power architecture-specific vector types + using q_load_vec_type = vec_op::FP32Vec8; + using k_load_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures, including x86 + using q_load_vec_type = vec_op::FP16Vec8; + using k_load_vec_type = vec_op::FP16Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +#endif + using q_vec_type = vec_op::FP32Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512BF16__ template <> struct KernelVecType { @@ -33,6 +51,21 @@ struct KernelVecType { using v_load_vec_type = vec_op::BF16Vec16; }; #else + #ifdef __aarch64__ + #ifndef ARM_BF16_SUPPORT + // pass + #else +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::BF16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::BF16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::BF16Vec16; +}; + #endif + #else template <> struct KernelVecType { using q_load_vec_type = vec_op::BF16Vec8; @@ -42,6 +75,7 @@ struct KernelVecType { using qk_acc_vec_type = vec_op::FP32Vec16; using v_load_vec_type = vec_op::BF16Vec16; }; + #endif #endif template @@ -375,6 +409,9 @@ void paged_attention_v1_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; @@ -692,6 +729,9 @@ void paged_attention_v2_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; @@ -755,4 +795,4 @@ void paged_attention_v2( CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t); CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl) }); -} +} \ No newline at end of file diff --git a/vllm/csrc/cpu/cpu_types.hpp b/vllm/csrc/cpu/cpu_types.hpp index 0213be091..28db04797 100644 --- a/vllm/csrc/cpu/cpu_types.hpp +++ b/vllm/csrc/cpu/cpu_types.hpp @@ -1,4 +1,3 @@ - #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP @@ -8,8 +7,11 @@ #elif defined(__POWER9_VECTOR__) //ppc implementation #include "cpu_types_vsx.hpp" +#elif defined(__aarch64__) + //arm implementation + #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" #endif -#endif +#endif \ No newline at end of file diff --git a/vllm/csrc/cpu/cpu_types_arm.hpp b/vllm/csrc/cpu/cpu_types_arm.hpp new file mode 100644 index 000000000..73e0f8cb2 --- /dev/null +++ b/vllm/csrc/cpu/cpu_types_arm.hpp @@ -0,0 +1,515 @@ +#include +#include +#include + +namespace vec_op { + +#ifdef ARM_BF16_SUPPORT + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) +#else + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) +#endif + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { + template + constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); + }; +}; + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; +}; + +struct FP32Vec8; +struct FP32Vec16; + +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + float16x8_t reg; + + explicit FP16Vec8(const void *ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; + + explicit FP16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { + vst1q_f16(static_cast<__fp16 *>(ptr), reg); + } +}; + +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void *ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void *ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void *ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + } + + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + for (int i = 0; i < remainder; ++i) { + reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i); + } + } + } +}; + + +#ifdef ARM_BF16_SUPPORT +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + bfloat16x8_t reg; + + explicit BF16Vec8(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; + + explicit BF16Vec8(const FP32Vec8 &); + + explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + bfloat16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; + + explicit BF16Vec16(const FP32Vec16 &); + + explicit BF16Vec16(float32x4x4_t v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) + }){}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + bfloat16x8x4_t reg; + + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {}; + + explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {}; + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + + union AliasReg { + float32x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4_t reg; + + explicit FP32Vec4(float v) : reg(vdupq_n_f32(v)) {}; + + explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; + + explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + + explicit FP32Vec4(float32x4_t data) : reg(data) {}; + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + float32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x2_t reg; + + explicit FP32Vec8(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v)}) {}; + + explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; + + explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + + explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + + explicit FP32Vec8(const FP16Vec8 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; + + explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + + #ifdef ARM_BF16_SUPPORT + + explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + + explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + + #endif + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t exp_vec0 = {expf(ar.values[0]), expf(ar.values[1])}; + float32x2_t exp_vec1 = {expf(ar.values[2]), expf(ar.values[3])}; + float32x2_t exp_vec2 = {expf(ar.values[4]), expf(ar.values[5])}; + float32x2_t exp_vec3 = {expf(ar.values[6]), expf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(exp_vec0, exp_vec1); + float32x4_t result1 = vcombine_f32(exp_vec2, exp_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t tanh_vec0 = {tanhf(ar.values[0]), tanhf(ar.values[1])}; + float32x2_t tanh_vec1 = {tanhf(ar.values[2]), tanhf(ar.values[3])}; + float32x2_t tanh_vec2 = {tanhf(ar.values[4]), tanhf(ar.values[5])}; + float32x2_t tanh_vec3 = {tanhf(ar.values[6]), tanhf(ar.values[7])}; + + float32x4_t result0 = vcombine_f32(tanh_vec0, tanh_vec1); + float32x4_t result1 = vcombine_f32(tanh_vec2, tanh_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + + float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); + float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); + + float32x4x2_t result; + result.val[0] = result0; + result.val[1] = result1; + + return FP32Vec8(result); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + } + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + float32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + float32x4x4_t reg; + + explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + + explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + + explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + + explicit FP32Vec16(float32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) : reg({ + vcvtq_low_f32_bf16(v.val[0]), + vcvtq_high_f32_bf16(v.val[0]), + vcvtq_low_f32_bf16(v.val[1]), + vcvtq_high_f32_bf16(v.val[1]) + }) {}; + #endif + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + }; + + #ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16 &v) : reg({ + vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1]) + }) {}; + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; + #endif + + explicit FP32Vec16(const FP16Vec16 &v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); + }; + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); + }; + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(float32x4x4_t({ + vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3]) + })); + }; + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float answer = 0; + unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + + return answer; + }; + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float answer = 0; + const int start = idx * group_size; + unroll_loop( + [&answer, &start, ar](int i) { answer += ar.values[start + i]; }); + + return answer; + }; + + void save(float *ptr) const { + vst1q_f32(ptr, reg.val[0]); + vst1q_f32(ptr + 4, reg.val[1]); + vst1q_f32(ptr + 8, reg.val[2]); + vst1q_f32(ptr + 12, reg.val[3]); + }; +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = FP16Vec8; }; + +#ifdef ARM_BF16_SUPPORT +template <> struct VecType { using vec_type = BF16Vec8; }; +#endif + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<__fp16 *>(ptr) = v; +} + +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); + + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); +}; + +inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); + + reg = vcombine_f16(lower_half, upper_half); +}; + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a.reg.val[3], b.reg.val[3]); +}; + +#ifdef ARM_BF16_SUPPORT +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + + float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); + float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); + float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); + float32x4_t a1_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[1])); + + float32x4_t b0_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[0])); + float32x4_t b0_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[0])); + float32x4_t b1_low = vcvt_f32_bf16(vget_low_bf16(b.reg.val[1])); + float32x4_t b1_high = vcvt_f32_bf16(vget_high_bf16(b.reg.val[1])); + + acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a0_low, b0_low); + acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a0_high, b0_high); + acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a1_low, b1_low); + acc.reg.val[3] = vfmaq_f32(acc.reg.val[3], a1_high, b1_high); +}; +#endif + +#ifdef ARM_BF16_SUPPORT +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) + }){}; +#endif + +inline void prefetch(const void *addr) { + __builtin_prefetch(addr, 0, 1); +}; + +#ifdef ARM_BF16_SUPPORT +template <> +inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +}; +#endif +}; \ No newline at end of file diff --git a/vllm/csrc/cpu/cpu_types_x86.hpp b/vllm/csrc/cpu/cpu_types_x86.hpp index a325153b4..4bb4eb0f4 100644 --- a/vllm/csrc/cpu/cpu_types_x86.hpp +++ b/vllm/csrc/cpu/cpu_types_x86.hpp @@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -50,37 +50,37 @@ template struct Vec { struct FP32Vec8; struct FP32Vec16; -#ifdef __AVX512FP16__ struct FP16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; - __m128h reg; + __m128i reg; - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + explicit FP16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + explicit FP16Vec8(const FP32Vec8 &); - explicit FP16Vec8(__m128h data) : reg(data) {} + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } + __m256i reg; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } + explicit FP16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } + explicit FP16Vec16(const FP32Vec16 &); - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } }; -#endif struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif + explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} explicit FP32Vec8(const BF16Vec8 &v) : reg(_mm256_castsi256_ps( @@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec { : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const INT32Vec16 &v) @@ -430,6 +432,16 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(const FP32Vec8 &data) : reg_low(data.reg), reg_high(data.reg) {} + explicit FP32Vec16(const FP16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + reg_low = _mm256_cvtph_ps(low); + reg_high = _mm256_cvtph_ps(high); + } + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec16 &v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -534,24 +546,34 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif +template <> struct VecType { using vec_type = FP16Vec8; }; template <> struct VecType { using vec_type = BF16Vec8; }; template void storeFP32(float v, T *ptr) { *ptr = v; } -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc = acc + a * b; } +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +#endif + #ifdef __AVX512BF16__ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); diff --git a/vllm/csrc/cpu/dnnl_helper.hpp b/vllm/csrc/cpu/dnnl_helper.hpp index 024ad4ae4..8b5011dc0 100644 --- a/vllm/csrc/cpu/dnnl_helper.hpp +++ b/vllm/csrc/cpu/dnnl_helper.hpp @@ -2,6 +2,7 @@ #define DNNL_HELPER_HPP #include +#include #include "oneapi/dnnl/dnnl.hpp" @@ -32,6 +33,11 @@ struct DNNLType { static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; }; +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + template constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; diff --git a/vllm/csrc/cpu/quant.cpp b/vllm/csrc/cpu/quant.cpp index b493fd793..d9aed657a 100644 --- a/vllm/csrc/cpu/quant.cpp +++ b/vllm/csrc/cpu/quant.cpp @@ -23,6 +23,19 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { +#ifdef __powerpc64__ + // Power architecture-specific vector type + using load_vec_type = vec_op::FP32Vec16; +#else + // Fallback for other architectures + using load_vec_type = vec_op::FP16Vec16; +#endif + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512F__ template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, diff --git a/vllm/csrc/custom_all_reduce.cu b/vllm/csrc/custom_all_reduce.cu index 9b82bec44..123278bfe 100644 --- a/vllm/csrc/custom_all_reduce.cu +++ b/vllm/csrc/custom_all_reduce.cu @@ -5,32 +5,29 @@ #include "custom_all_reduce.cuh" -// fake pointer type, must match fptr_t type in ops.h +// Fake pointer type, must match fptr_t type in ops.h. +// We use this type alias to indicate when pointers are passed in as int64_t. using fptr_t = int64_t; static_assert(sizeof(void*) == sizeof(fptr_t)); -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink) { - int world_size = offsets.size(); + int world_size = fake_ipc_ptrs.size(); if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported"); if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now"); - if (world_size != handles.size()) - throw std::invalid_argument( - "handles length should equal to offsets length"); if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in"); - cudaIpcMemHandle_t ipc_handles[8]; + vllm::Signal* ipc_ptrs[8]; for (int i = 0; i < world_size; i++) { - std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); } - return (fptr_t) new vllm::CustomAllreduce( - reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), - rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); + return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), + rank_data.numel(), rank, world_size, + full_nvlink); } /** @@ -55,26 +52,48 @@ bool _is_weak_contiguous(torch::Tensor& t) { t.numel() * t.element_size()); } -void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, - cudaStream_t stream) { +/** + * Performs an out-of-place allreduce and stores result in out. + * + * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. + * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first + * copied into _reg_buffer. + */ +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); TORCH_CHECK(_is_weak_contiguous(out)); + TORCH_CHECK(_is_weak_contiguous(inp)); + auto input_size = inp.numel() * inp.element_size(); + auto reg_buffer = reinterpret_cast(_reg_buffer); + if (reg_buffer) { + TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, + cudaMemcpyDeviceToDevice, stream)); + } else { + reg_buffer = inp.data_ptr(); + } switch (out.scalar_type()) { case at::ScalarType::Float: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } case at::ScalarType::Half: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) case at::ScalarType::BFloat16: { fa->allreduce( - stream, reinterpret_cast(inp.data_ptr()), + stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } @@ -85,57 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, } } -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - _all_reduce(_fa, inp, out, stream); -} - -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - - auto input_size = inp.numel() * inp.element_size(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), - "registered buffer is too small to contain the input"); - AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), - input_size, cudaMemcpyDeviceToDevice, stream)); - _all_reduce(_fa, reg_buffer, out, stream); -} - void dispose(fptr_t _fa) { - auto fa = reinterpret_cast(_fa); - delete fa; + delete reinterpret_cast(_fa); } int64_t meta_size() { return sizeof(vllm::Signal); } -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets) { +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { auto fa = reinterpret_cast(_fa); - fa->register_buffer(handles, offsets, t.data_ptr()); + TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_); + void* ipc_ptrs[8]; + for (int i = 0; i < fake_ipc_ptrs.size(); i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + fa->register_buffer(ipc_ptrs); } -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa) { +// Use vector to represent byte data for python binding compatibility. +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa) { auto fa = reinterpret_cast(_fa); - auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta(); - auto options = - torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); - auto handles = - torch::empty({static_cast(handle_bytes.size())}, options); - std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size()); - return {handles, std::move(offsets)}; + auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); + std::vector bytes(handle.begin(), handle.end()); + return std::make_tuple(bytes, offsets); } -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +// Use vector to represent byte data for python binding compatibility. +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets) { auto fa = reinterpret_cast(_fa); - fa->register_graph_buffers(handles, offsets); + std::vector bytes; + bytes.reserve(handles.size()); + for (int i = 0; i < handles.size(); i++) { + bytes.emplace_back(handles[i].begin(), handles[i].end()); + } + bytes.reserve(handles.size()); + fa->register_graph_buffers(bytes, offsets); } diff --git a/vllm/csrc/custom_all_reduce.cuh b/vllm/csrc/custom_all_reduce.cuh index a2f7e4330..6be4d4f2b 100644 --- a/vllm/csrc/custom_all_reduce.cuh +++ b/vllm/csrc/custom_all_reduce.cuh @@ -285,46 +285,52 @@ class CustomAllreduce { int world_size_; bool full_nvlink_; - // below are device pointers RankSignals sg_; + // Stores an map from a pointer to its peer pointters from all ranks. std::unordered_map buffers_; Signal* self_sg_; - // stores the registered device pointers from all ranks + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph capture + // time. Therefore, during capture, we increment the rank data pointer and use + // that as the argument to the kernel. The kernel arguments are stored in + // graph_unreg_buffers_. The actual peer pointers will be filled in at the + // memory pointed to by the pointers in graph_unreg_buffers_ when + // the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each addresses used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. RankData *d_rank_data_base_, *d_rank_data_end_; std::vector graph_unreg_buffers_; // a map from IPC handles to opened IPC pointers std::map ipc_handles_; /** - * meta is a pointer to device metadata and temporary buffer for allreduce. + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffer, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allreduce synchronization, and the second section + * is for storing the intermediate results required by some allreduce algos. * - * There's a total of sizeof(Signal) of prefix before the actual data, - * so meta + 1 points to actual temporary buffer. - * - * note: this class does not own any device memory. Any required buffers - * are passed in from the constructor + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor. */ - CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz, - const cudaIpcMemHandle_t* handles, - const std::vector& offsets, int rank, - bool full_nvlink = true) + CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool full_nvlink = true) : rank_(rank), - world_size_(offsets.size()), + world_size_(world_size), full_nvlink_(full_nvlink), - self_sg_(meta), + self_sg_(signals[rank]), d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { for (int i = 0; i < world_size_; i++) { - Signal* rank_sg; - if (i != rank_) { - char* handle = open_ipc_handle(&handles[i]); - handle += offsets[i]; - rank_sg = (Signal*)handle; - } else { - rank_sg = self_sg_; - } - sg_.signals[i] = rank_sg; + sg_.signals[i] = signals[i]; } } @@ -341,11 +347,10 @@ class CustomAllreduce { return it->second; } - std::pair, std::vector> - get_graph_buffer_ipc_meta() { + std::pair> get_graph_buffer_ipc_meta() { auto num_buffers = graph_unreg_buffers_.size(); auto handle_sz = sizeof(cudaIpcMemHandle_t); - std::vector handles(handle_sz * num_buffers, 0); + std::string handles(handle_sz * num_buffers, static_cast(0)); std::vector offsets(num_buffers); for (int i = 0; i < num_buffers; i++) { auto ptr = graph_unreg_buffers_[i]; @@ -370,26 +375,22 @@ class CustomAllreduce { std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); } - void register_buffer(const std::vector& handles, - const std::vector& offsets, void* self) { + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { check_rank_data_capacity(); RankData data; for (int i = 0; i < world_size_; i++) { - if (i != rank_) { - char* handle = open_ipc_handle(handles[i].data()); - handle += offsets[i]; - data.ptrs[i] = handle; - } else { - data.ptrs[i] = self; - } + data.ptrs[i] = ptrs[i]; } auto d_data = d_rank_data_base_++; CUDACHECK( cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); - buffers_[self] = d_data; + buffers_[ptrs[rank_]] = d_data; } - // note: when registering graph buffers, we intentionally choose to not + // Note: when registering graph buffers, we intentionally choose to not // deduplicate the addresses. That means if the allocator reuses some // addresses, they will be registered again. This is to account for the remote // possibility of different allocation patterns between ranks. For example, @@ -424,11 +425,13 @@ class CustomAllreduce { } /** - * This is the result after careful grid search. Using 36 blocks give the best - * or close to the best runtime on the devices I tried: A100, A10, A30, T4, - * V100. You'll notice that NCCL kernels also only take a small amount of SMs. - * Not quite sure the underlying reason, but my guess is that too many SMs - * will cause contention on NVLink bus. + * Performs allreduce, assuming input has already been registered. + * + * Block and grid default configs are results after careful grid search. Using + * 36 blocks give the best or close to the best runtime on the devices I + * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only + * take a small amount of SMs. Not quite sure the underlying reason, but my + * guess is that too many SMs will cause contention on NVLink bus. */ template void allreduce(cudaStream_t stream, T* input, T* output, int size, diff --git a/vllm/csrc/custom_all_reduce_test.cu b/vllm/csrc/custom_all_reduce_test.cu index 376687e91..b59ea40d9 100644 --- a/vllm/csrc/custom_all_reduce_test.cu +++ b/vllm/csrc/custom_all_reduce_test.cu @@ -135,24 +135,26 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, void* rank_data; size_t rank_data_sz = 16 * 1024 * 1024; CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); - std::vector offsets(nRanks, 0); - vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, - offsets, myRank); + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < nRanks; i++) { + if (i == myRank) + ipc_ptrs[i] = buffer; + else + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i], + cudaIpcMemLazyEnablePeerAccess)); + } + vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks); auto* self_data = reinterpret_cast(reinterpret_cast(buffer) + sizeof(vllm::Signal) + data_size * sizeof(T)); // hack buffer registration { - std::vector handles; - handles.reserve(nRanks); + void* data[8]; for (int i = 0; i < nRanks; i++) { - char* begin = (char*)&data_handles[i]; - char* end = (char*)&data_handles[i + 1]; - handles.emplace_back(begin, end); + data[i] = + ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T); } - std::vector offsets(nRanks, - sizeof(vllm::Signal) + data_size * sizeof(T)); - fa.register_buffer(handles, offsets, self_data); + fa.register_buffer(data); } double* ground_truth; diff --git a/vllm/csrc/cutlass_extensions/cute_utils.cuh b/vllm/csrc/cutlass_extensions/cute_utils.cuh index 1842fab8b..f61fe3ceb 100644 --- a/vllm/csrc/cutlass_extensions/cute_utils.cuh +++ b/vllm/csrc/cutlass_extensions/cute_utils.cuh @@ -20,9 +20,9 @@ CUTE_HOST_DEVICE static constexpr auto permute_layout(Layout l) { // is the layout f(x) = x template CUTE_HOST_DEVICE static constexpr bool is_identity_layout() { - if constexpr (std::is_same_v) + if constexpr (std::is_same_v) { return true; - else { + } else { constexpr auto coalesced_layout = coalesce(Layout{}); if constexpr (rank(coalesced_layout) == 1 && stride<0>(coalesced_layout) == 1) { diff --git a/vllm/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp b/vllm/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp similarity index 99% rename from vllm/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp rename to vllm/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp index d407d66ab..7aa87feb4 100644 --- a/vllm/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp +++ b/vllm/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp @@ -52,6 +52,7 @@ // clang-format off #include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp" +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cute/tensor.hpp" namespace cutlass::epilogue::threadblock { diff --git a/vllm/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/vllm/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp similarity index 100% rename from vllm/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp rename to vllm/csrc/cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp diff --git a/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp new file mode 100644 index 000000000..c69e87999 --- /dev/null +++ b/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -0,0 +1,317 @@ +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation zero-points onto a GEMM operation using the + CUTLASS 2.x API, for sm80 (Ampere) NVIDIA GPUs. + + Epilogues must contain a public type named EVTCompute of type Sm80EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c2x { + +using namespace cute; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + + template + using ColOrScalarLoad = + cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = + cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< + OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; + + template + using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + template + using RowOrZeroLoad = + cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< + OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + // it would technically work but no use case as data_ptr is never nullptr + static_assert(!std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + static_assert(std::is_same_v>); + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : protected ScaledEpilogueBase { + protected: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : protected ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowOrZeroLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::threadblock::Sm80EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::threadblock::Sm80EVT; + + using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +}; // namespace vllm::c2x \ No newline at end of file diff --git a/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp new file mode 100644 index 000000000..95764ecdd --- /dev/null +++ b/vllm/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -0,0 +1,315 @@ +#include "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp" + +/* + This file defines custom epilogues for fusing channel scales, token scales, + bias, and activation zero-points onto a GEMM operation using the + CUTLASS 3.x API, for NVIDIA GPUs with sm90a (Hopper) or later. + + Epilogues must contain a public type named EVTCompute of type Sm90EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace vllm::c3x { + +using namespace cute; + +/* + * This class provides the common load descriptors for the + * ScaledEpilogue[...] classes + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::fusion::Sm90AccFetch; + + template + using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>>; + + template + using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>>; + + // Don't want to support nullptr by default + template + using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // Don't want to support nullptr by default + template + using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< + 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, + Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; + + // This utility function constructs the arguments for the load descriptors + // from a tensor. It can handle both row and column, as well as row/column or + // scalar cases. + template + static auto args_from_tensor(torch::Tensor const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = static_cast(tensor.data_ptr()); + if constexpr (std::is_same_v> || + std::is_same_v>) { + return Arguments{data_ptr, tensor.numel() != 1}; + } else { + static_assert(!std::is_same_v> && + !std::is_same_v>); + return Arguments{data_ptr}; + } + } + + // This overload handles the case where there might not be a tensor, in which + // case a nullptr is passed and a constant (0) is used. + template + static auto args_from_tensor(c10::optional const& tensor) { + using Arguments = typename Descriptor::Arguments; + auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; + static_assert(std::is_same_v> || + std::is_same_v>); + return Arguments{data_ptr}; + } +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch.scaled_mm_. + + A and B may be both either int8 or fp8_e4m3. A can be + quantized per-tensor or per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. +*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args}; + } +}; + +/* + * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. + * This bias can also be used in the per-tensor azp case, where the activation + * zero point (azp) is used to compute an azp correction term, + * which is folded into the bias. + * + * The bias tensor must be per-output channel. + * ScaleA and ScaleB can be per-tensor or per-token/per-channel. + */ +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + using Compute0 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::fusion::Sm90EVT; + + using Compute1 = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + + using ArgumentType = typename EVTCompute::Arguments; + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + + typename EVTCompute0::Arguments evt0_args{b_args}; + return ArgumentType{a_args, evt0_args, bias_args}; + } +}; + +/* + * This epilogue directly supports per-tensor azp in int32 form. + * As opposed to the per-token epilogue below, this epilogue only has an azp_adj + * term, which should already be multiplied with the scalar azp. + * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzp + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // This is the full AZP term, azp * J @ B, shape (1,n) + using AzpWithAdj = typename SUPER::template RowLoad; + + // Compute float(accum - azp_adj), both operands are int32_t + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +/* + * This epilogue supports per-token azp by computing and applying + * the correction term using a rank-1 update. If the term were materialized, + * it would require O(m*n) space, and this way it only requires O(m+n) space. + * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero + * point for each row of A. + * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. + * + * This epilogue also supports bias, which remains per-channel. + */ +template +struct ScaledEpilogueBiasAzpToken + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::template ColOrScalarLoad; + using ScaleB = typename SUPER::template RowOrScalarLoad; + using Bias = typename SUPER::template RowLoad; + + // Per-token azp term, shape (m,1) + using Azp = typename SUPER::template ColLoad; + + // This is the AZP adjustment term, J @ B, shape (1,n) + using AzpAdj = typename SUPER::template RowLoad; + + // Compute azp * azp_adj + using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, int32_t, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAzp = + cutlass::epilogue::fusion::Sm90EVT; + + // Compute float(accum - azp*azp_adj), all operands are int32_t + using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< + cutlass::minus, float, int32_t, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeAcc = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTComputeScaleB = + cutlass::epilogue::fusion::Sm90EVT; + + using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::fusion::Sm90EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& azp_adj, + torch::Tensor const& azp, + c10::optional const& bias) { + auto a_args = SUPER::template args_from_tensor(a_scales); + auto b_args = SUPER::template args_from_tensor(b_scales); + auto bias_args = SUPER::template args_from_tensor(bias); + auto azp_args = SUPER::template args_from_tensor(azp); + auto azp_adj_args = + SUPER::template args_from_tensor(azp_adj); + + typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; + typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; + typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; + return ArgumentType{a_args, evt_scale_b_args, bias_args}; + } +}; + +}; // namespace vllm::c3x \ No newline at end of file diff --git a/vllm/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/vllm/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index 4fcfcd311..a5beea1a3 100644 --- a/vllm/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/vllm/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -35,6 +35,35 @@ class MixedInputKernelScheduleType(enum.Enum): } } +VLLMDataTypeSize: Dict[Union[VLLMDataType, DataType], int] = { + **DataTypeSize, # type: ignore + **{ + VLLMDataType.u4b8: 4, + VLLMDataType.u8b128: 8, + } +} + +VLLMDataTypeVLLMScalarTypeTag: Dict[Union[VLLMDataType, DataType], str] = { + VLLMDataType.u4b8: "vllm::kU4B8", + VLLMDataType.u8b128: "vllm::kU8B128", + DataType.u4: "vllm::kU4", + DataType.u8: "vllm::kU8", + DataType.s4: "vllm::kS4", + DataType.s8: "vllm::kS8", + DataType.f16: "vllm::kFloat16", + DataType.bf16: "vllm::kBfloat16", +} + +VLLMDataTypeTorchDataTypeTag: Dict[Union[VLLMDataType, DataType], str] = { + DataType.u8: "at::ScalarType::Byte", + DataType.s8: "at::ScalarType::Char", + DataType.e4m3: "at::ScalarType::Float8_e4m3fn", + DataType.s32: "at::ScalarType::Int", + DataType.f16: "at::ScalarType::Half", + DataType.bf16: "at::ScalarType::BFloat16", + DataType.f32: "at::ScalarType::Float", +} + VLLMKernelScheduleTag: Dict[Union[ MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore diff --git a/vllm/csrc/cutlass_extensions/vllm_numeric_conversion.cuh b/vllm/csrc/cutlass_extensions/vllm_numeric_conversion.cuh index 2ad914f8e..90f226cf6 100644 --- a/vllm/csrc/cutlass_extensions/vllm_numeric_conversion.cuh +++ b/vllm/csrc/cutlass_extensions/vllm_numeric_conversion.cuh @@ -3,6 +3,7 @@ #include "cutlass/numeric_conversion.h" #include "cutlass_extensions/vllm_custom_types.cuh" #include "cutlass_extensions/cute_utils.cuh" +#include "cutlass_extensions/vllm_type_utils.cuh" // this file extends: // https://github.com/NVIDIA/cutlass/blob/cutlass-3.5.0/include/cutlass/numeric_conversion.h @@ -28,8 +29,19 @@ struct InterleavedNumericArrayConverter { CUTLASS_DEVICE static result_type convert(source_type const& source) { - CUTE_INVALID_CONTROL_PATH( - "InterleavedNumericArrayConverter not implemented\n"); + if (cute::elect_one_sync()) { + if constexpr (std::is_same_v) { + printf( + "Convert %s <= %s (N = %d, IlvBlkLayout = void), not implemented\n", + nameof_v, nameof_v, N); + } else { + printf( + "Convert %s <= %s (N = %d, size(IlvBlkLayout{}) = %d), not " + "implemented\n", + nameof_v, nameof_v, N, size(IlvBlkLayout{})); + } + __brkpt(); + } return {}; } @@ -56,11 +68,6 @@ struct InterleavedNumericArrayConverter< result_type operator()(source_type const& s) const { return convert(s); } }; -// TODO (LucasWilkinson): Implement -// for Array <= Array - -// .... - template struct ArrayConverterPacked32Bit { using result_type = Array; @@ -86,14 +93,16 @@ struct ArrayConverterPacked32Bit { using ScalarConverter = NumericConverter; template - CUTLASS_DEVICE static uint32_t to_reg(PackedSrc const& source) { + CUTLASS_DEVICE static auto to_regs(PackedSrc const& src) { if constexpr (sizeof(PackedSrc) == 1) { - return static_cast(reinterpret_cast(source)); + return Array{reinterpret_cast(src)}; } else if constexpr (sizeof(PackedSrc) == 2) { - return static_cast(reinterpret_cast(source)); + return Array{reinterpret_cast(src)}; + } else if constexpr (sizeof(PackedSrc) == 4) { + return Array{reinterpret_cast(src)}; } else { - static_assert(sizeof(PackedSrc) == 4); - return reinterpret_cast(source); + static_assert(sizeof(PackedSrc) == 8); + return reinterpret_cast const&>(src); } } @@ -110,7 +119,7 @@ struct ArrayConverterPacked32Bit { static_assert(std::is_same_v); static_assert(std::is_same_v); - return RegConvert32bit::template convert(to_reg(source)); + return RegConvert32bit::template convert(to_regs(source)); } friend class detail::VectorizedConverter; @@ -140,6 +149,131 @@ struct ArrayConverterPacked32Bit { } }; +// Convert 8 4bit values packed into a 32bit register to 8 8bit values packed +// into 2 32bit register. +template +CUTLASS_DEVICE cutlass::AlignedArray lut_4bit_to_8bit_convert( + uint32_t src) { + cutlass::AlignedArray r; + // Determines if the value is in the top half of the LUT if set or + // (i.e. LUT[8:15]) in the bottom half (i.e. LUT[0:7]) if not set. Then move + // into bit position 0x4 of each nibble so when or'd with final_prmt_base it + // selects the correct candidate. When elements in final_prmt_base + // are >= 0x4, the high candidate is selected (i.e. LUT[8:15]), when elements + // are < 0x4, the low candidate is selected (i.e. LUT[0:7]) + uint32_t high_bit = (src & 0x88888888) >> 1; + + // `high_bit` is OR'd with 0x31203120 to find the correct value in the LUT + // (selects correct high or low candidate) + const uint32_t final_prmt_base = 0x32103210; + + // Ignore the high bit when indexing into LUT, for each 4bit value + // we index into both the high and low candidates then use + // high_bit | final_prmt_base to select the correct candidate + uint32_t lut_idx = (src & 0x77777777); + + auto pack = [](uint8_t a, uint8_t b, uint8_t c, uint8_t d) { + return uint32_t(a) | (uint32_t(b) << 8) | (uint32_t(c) << 16) | + (uint32_t(d) << 24); + }; + + static constexpr uint32_t LOW_0 = pack(LUT0, LUT1, LUT2, LUT3); + static constexpr uint32_t LOW_1 = pack(LUT4, LUT5, LUT6, LUT7); + static constexpr uint32_t HIGH_0 = pack(LUT8, LUT9, LUT10, LUT11); + static constexpr uint32_t HIGH_1 = pack(LUT12, LUT13, LUT14, LUT15); + + CUTLASS_PRAGMA_UNROLL + for (int ii = 0; ii < 2; ++ii, lut_idx >>= 16, high_bit >>= 16) { + uint32_t final_prmt_idx = final_prmt_base | high_bit; + + // This uses a look up table to convert packed int4s to packed int8s, + // using the int4 value as the index to prmt. It first select both the + // high and low candidates, then uses the high bit (i.e. `high_bit`) to + // select the correct candidate. + asm volatile( + "{\n" + " .reg .b32 low, high;\n" + " prmt.b32 low, %1, %2, %5;\n" + " prmt.b32 high, %3, %4, %5;\n" + " prmt.b32 %0, low, high, %6;\n" + "}\n" + : "=r"(r[ii]) + : "n"(LOW_0), "n"(LOW_1), "n"(HIGH_0), "n"(HIGH_1), "r"(lut_idx), + "r"(final_prmt_idx)); + } + + return r; +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as int8s + auto r = lut_4bit_to_8bit_convert<0xF8, 0xF9, 0xFA, 0xFB, // + 0xFC, 0xFD, 0xFE, 0xFF, // + 0x00, 0x01, 0x02, 0x03, // + 0x04, 0x05, 0x06, 0x07>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + +// for Array <= Array +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + static FloatRoundStyle const round_style = Round; + + private: + struct RegConvert { + template + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + // [-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7] as fp8s + auto r = lut_4bit_to_8bit_convert<0xD0, 0xCE, 0xCC, 0xCA, // + 0xC8, 0xC4, 0xC0, 0xB8, // + 0x00, 0x38, 0x40, 0x44, // + 0x48, 0x4A, 0x4C, 0x4E>(src_[0]); + return reinterpret_cast(r); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + // for Array <= Array template struct NumericArrayConverter { @@ -148,7 +282,8 @@ struct NumericArrayConverter { struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -249,7 +384,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -338,7 +474,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -417,7 +554,8 @@ struct NumericArrayConverter { struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; // Hold output FP16s in reg. We need 1 reg for every 2 elements using RegArray = cutlass::AlignedArray { private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; PackedResultType r; // __byte_perm simulates the add.u32 0x4B000000 to every u8 element of @@ -513,7 +652,8 @@ struct NumericArrayConverter { private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src_reg) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src_reg = src_[0]; // Hold output BF16s in reg. We need 1 reg for every 2 elements using RegArray = cutlass::AlignedArray, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -671,7 +812,8 @@ struct InterleavedNumericArrayConverter, Stride<_4, _1>>, private: struct RegConvert { template - CUTLASS_DEVICE static PackedResultType convert(uint32_t src) { + CUTLASS_DEVICE static PackedResultType convert(Array src_) { + uint32_t src = src_[0]; using RegArray = cutlass::AlignedArray; @@ -788,6 +930,61 @@ struct NumericArrayConverter { #endif +// for Array <= Array +// FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 +template +struct NumericArrayConverter { + using result_type = Array; + using source_type = Array; + + struct RegConvert { + // FastFP16toINT8 from https://arxiv.org/pdf/2406.09904 + template + CUTLASS_DEVICE static PackedResultType convert( + Array src) { + // Hold output int8s in reg. We need 1 reg for every 4 elements + using RegArray = cutlass::AlignedArray< + uint32_t, std::max(PackedResultType::kElements / 4, size_t(1))>; + RegArray r; + + static constexpr uint32_t MAGIC_BIAS_ = 0x64806480; + auto MAGIC_BIAS = *reinterpret_cast(&MAGIC_BIAS_); + + *reinterpret_cast(&src[0]) = + __hadd2(*reinterpret_cast(&src[0]), MAGIC_BIAS); + + if constexpr (src_regs > 1) { + *reinterpret_cast(&src[1]) = + __hadd2(*reinterpret_cast(&src[1]), MAGIC_BIAS); + } + + static_assert(PackedResultType::kElements <= 4); + uint32_t uint8s; + static constexpr uint32_t MASK_0246 = 0x6420; + static constexpr uint32_t UINT8s_TO_INT8s_MASK = 0x80808080; + asm volatile("prmt.b32 %0,%1,%2,%3;\n" + : "=r"(uint8s) + : "r"(src[0]), "r"((src_regs > 1) ? src[1] : src[0]), + "n"(MASK_0246)); + + uint32_t int8s = (uint8s ^ UINT8s_TO_INT8s_MASK); + + return reinterpret_cast(int8s); + }; + }; + + public: + CUTLASS_DEVICE + static result_type convert(source_type const& source) { + return ArrayConverterPacked32Bit::convert(source); + } + + CUTLASS_DEVICE + result_type operator()(source_type const& s) const { return convert(s); } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/vllm/csrc/cutlass_extensions/vllm_type_utils.cuh b/vllm/csrc/cutlass_extensions/vllm_type_utils.cuh new file mode 100644 index 000000000..500ed508c --- /dev/null +++ b/vllm/csrc/cutlass_extensions/vllm_type_utils.cuh @@ -0,0 +1,42 @@ +#include "cutlass/bfloat16.h" +#include "cutlass/half.h" +#include "cuda_bf16.h" + +#include "cutlass_extensions/vllm_custom_types.cuh" + +namespace cutlass { + +template +struct nameof { + static constexpr char const* value = "unknown"; +}; + +template +inline constexpr auto nameof_v = nameof::value; + +#define NAMEOF_TYPE(T) \ + template <> \ + struct nameof { \ + static constexpr char const* value = #T; \ + }; + +NAMEOF_TYPE(float_e4m3_t) +NAMEOF_TYPE(float_e5m2_t) +NAMEOF_TYPE(half_t) +NAMEOF_TYPE(nv_bfloat16) +NAMEOF_TYPE(bfloat16_t) +NAMEOF_TYPE(float) + +NAMEOF_TYPE(int4b_t) +NAMEOF_TYPE(int8_t) +NAMEOF_TYPE(int32_t) +NAMEOF_TYPE(int64_t) + +NAMEOF_TYPE(vllm_uint4b8_t) +NAMEOF_TYPE(uint4b_t) +NAMEOF_TYPE(uint8_t) +NAMEOF_TYPE(vllm_uint8b128_t) +NAMEOF_TYPE(uint32_t) +NAMEOF_TYPE(uint64_t) + +}; // namespace cutlass \ No newline at end of file diff --git a/vllm/csrc/layernorm_kernels.cu b/vllm/csrc/layernorm_kernels.cu index 7a7a25d21..fb6882f3e 100644 --- a/vllm/csrc/layernorm_kernels.cu +++ b/vllm/csrc/layernorm_kernels.cu @@ -1,21 +1,13 @@ -#include -#include +#include "type_convert.cuh" +#include "dispatch_utils.h" + +#include #include -#include "dispatch_utils.h" #ifndef USE_ROCM - #include - #include - #include #include #else - #include - #include - #include #include - -using __nv_bfloat16 = __hip_bfloat16; -using __nv_bfloat162 = __hip_bfloat162; #endif namespace vllm { @@ -51,155 +43,6 @@ __global__ void rms_norm_kernel( } } -/* Converter structs for the conversion from torch types to HIP/CUDA types, - and the associated type conversions within HIP/CUDA. These helpers need - to be implemented for now because the relevant type conversion - operators/constructors are not consistently implemented by HIP/CUDA, so - a generic conversion via type casts cannot be implemented. - - Each struct should have the member static constexpr bool `exists`: - If false, the optimized kernel is not used for the corresponding torch type. - If true, the struct should be fully defined as shown in the examples below. - */ -template -struct _typeConvert { - static constexpr bool exists = false; -}; - -#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) -// CUDA < 12.0 runs into issues with packed type conversion -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __half; - using packed_hip_type = __half2; - - __device__ static inline float convert(hip_type x) { return __half2float(x); } - __device__ static inline float2 convert(packed_hip_type x) { - return __half22float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2half_rn(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22half2_rn(x); - } -}; - - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -// CUDA_ARCH < 800 does not have BF16 support -// TODO: Add in ROCm support once public headers handle bf16 maturely -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __nv_bfloat16; - using packed_hip_type = __nv_bfloat162; - - __device__ static inline float convert(hip_type x) { - return __bfloat162float(x); - } - __device__ static inline float2 convert(packed_hip_type x) { - return __bfloat1622float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2bfloat16(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22bfloat162_rn(x); - } -}; - #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= - // 12000)) - -/* Vector POD struct to generate vectorized and packed FP16/BF16 ops - for appropriate specializations of fused_add_rms_norm_kernel. - Only functions that are necessary in that kernel are implemented. - Alignment to 16 bytes is required to use 128-bit global memory ops. - */ -template -struct alignas(16) _f16Vec { - /* Not theoretically necessary that width is a power of 2 but should - almost always be the case for optimization purposes */ - static_assert(width > 0 && (width & (width - 1)) == 0, - "Width is not a positive power of 2!"); - using Converter = _typeConvert; - using T1 = typename Converter::hip_type; - using T2 = typename Converter::packed_hip_type; - T1 data[width]; - - __device__ _f16Vec& operator+=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp += T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] += other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp *= T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] *= other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const float scale) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); - temp_f.x *= scale; - temp_f.y *= scale; - T2 temp = Converter::convert(temp_f); - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float temp = Converter::convert(data[i]) * scale; - data[i] = Converter::convert(temp); - } - } - return *this; - } - - __device__ float sum_squares() const { - float result = 0.0f; - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 z = Converter::convert(T2{data[i], data[i + 1]}); - result += z.x * z.x + z.y * z.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float x = Converter::convert(data[i]); - result += x * x; - } - } - return result; - } -}; - /* Function specialization in the case of FP16/BF16 tensors. Additional optimizations we can make in this case are packed and vectorized operations, which help with the diff --git a/vllm/csrc/layernorm_quant_kernels.cu b/vllm/csrc/layernorm_quant_kernels.cu new file mode 100644 index 000000000..c18e2a4e4 --- /dev/null +++ b/vllm/csrc/layernorm_quant_kernels.cu @@ -0,0 +1,234 @@ +/* + * This file contains the CUDA kernels for the fused quantized layernorm. + * The kernels correspond to the kernels in layernorm_kernels.cu, except they + * also produce quantized output directly. + * Currently, only static fp8 quantization is supported. + */ + +#include "type_convert.cuh" +#include "quantization/fp8/common.cuh" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// TODO(woosuk): Further optimize this kernel. +template +__global__ void rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + const float x = (float)input[blockIdx.x * hidden_size + idx]; + variance += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = input_v[id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; +#pragma unroll + for (int i = 0; i < width; ++i) { + out[id * width + i] = + scaled_fp8_conversion(float(temp.data[i]), scale_inv); + } + } +} + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. + */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * hidden_size + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)residual[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +} // namespace vllm + +void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_static_fp8_quant_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + weight.data_ptr(), scale.data_ptr(), epsilon, + num_tokens, hidden_size); + }); +} + +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_static_fp8_quant_kernel \ + <<>>( \ + out.data_ptr(), input.data_ptr(), \ + residual.data_ptr(), weight.data_ptr(), \ + scale.data_ptr(), epsilon, num_tokens, hidden_size); \ + }); + +void fused_add_rms_norm_static_fp8_quant( + torch::Tensor& out, // [..., hidden_size], + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + bool ptrs_are_aligned = + inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); + } +} diff --git a/vllm/csrc/mamba/causal_conv1d/causal_conv1d.cu b/vllm/csrc/mamba/causal_conv1d/causal_conv1d.cu index 3a464c5f3..498d069c0 100644 --- a/vllm/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/vllm/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -418,6 +418,31 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { typename Ktraits::BlockStoreT(smem_store).Store(out, out_vals_store, seqlen - chunk * kChunkSize); } out += kChunkSize; + + int final_state_position = ((seqlen - (kWidth - 1)) - (n_chunks - 1) * kChunkSize); + // in case the final state is separated between the last "smem_exchange" and + // and the one before it (chunk = n_chunks - 1 and chunk = n_chunks - 2), + // (which occurs when `final_state_position` is a non-positivie index) + // we load the correct data from smem_exchange from both chunks, the last chunk iteration and the one before it + if (final_state_position < 0 && seqlen > kWidth){ + input_t vals_load[kNElts] = {0}; + if ((chunk == n_chunks - 2) && (tidx == kNThreads - 1)){ + // chunk = n_chunks - 2, a segment of the final state sits in the last index + reinterpret_cast(vals_load)[0] = smem_exchange[kNThreads - 1]; + #pragma unroll + for (int w = 0; w < -final_state_position; ++w){ + conv_states[w] = vals_load[kNElts + final_state_position + w]; + } + } + if ((chunk == n_chunks - 1) && tidx == 0){ + // chunk = n_chunks - 1, the second segment of the final state first positions + reinterpret_cast(vals_load)[0] = smem_exchange[0]; + for (int w = -final_state_position; w < kWidth - 1; ++w){ + conv_states[w] = vals_load[w + final_state_position]; + } + return; + } + } } // Final state is stored in the smem_exchange last token slot, // in case seqlen < kWidth, we would need to take the final state from the @@ -446,9 +471,14 @@ void causal_conv1d_fwd_kernel(ConvParamsBase params) { } else { // in case the final state is in between the threads data - reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; - reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; const int offset = ((seqlen - (kWidth - 1)) % (kNElts)); + if ((offset + kWidth - 2) >= kNElts && (last_thread + 1 < kNThreads)){ + // In case last_thread == kNThreads - 1, accessing last_thread + 1 will result in a + // illegal access error on H100. + // Therefore, we access last_thread + 1, only if the final state data sits there + reinterpret_cast(x_vals_load)[1] = smem_exchange[last_thread + 1]; + } + reinterpret_cast(x_vals_load)[0] = smem_exchange[last_thread]; #pragma unroll for (int w = 0; w < kWidth - 1; ++w){ conv_states[w] = x_vals_load[offset + w ]; diff --git a/vllm/csrc/ops.h b/vllm/csrc/ops.h index c50eb39a3..ea001190b 100644 --- a/vllm/csrc/ops.h +++ b/vllm/csrc/ops.h @@ -56,6 +56,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& weight, torch::Tensor& scale, + double epsilon); + +void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& residual, + torch::Tensor& weight, + torch::Tensor& scale, double epsilon); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); @@ -118,6 +128,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel, int64_t thx, int64_t thy); torch::Tensor permute_cols(torch::Tensor const& A, torch::Tensor const& perm); +#endif torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, int64_t n); @@ -128,6 +139,7 @@ torch::Tensor ggml_mul_mat_vec_a8(torch::Tensor W, torch::Tensor X, torch::Tensor ggml_mul_mat_a8(torch::Tensor W, torch::Tensor X, int64_t type, int64_t row); +#ifndef USE_ROCM bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, @@ -199,20 +211,16 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, #ifndef USE_ROCM using fptr_t = int64_t; -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, - bool full_nvlink); -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out); -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out); +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink); +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); void dispose(fptr_t _fa); int64_t meta_size(); -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets); -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa); -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs); +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa); +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets); #endif diff --git a/vllm/csrc/prepare_inputs/advance_step.cu b/vllm/csrc/prepare_inputs/advance_step.cu index 46fef79f4..bd184ee22 100644 --- a/vllm/csrc/prepare_inputs/advance_step.cu +++ b/vllm/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr, - int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), diff --git a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index ee801e165..dbb72e8bb 100644 --- a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -8,6 +8,10 @@ #include "scaled_mm_c2x_sm89_fp8_dispatch.cuh" #include "scaled_mm_c2x_sm89_int8_dispatch.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp" + +using namespace vllm; + /* This file defines quantized GEMM operations using the CUTLASS 2.x API, for NVIDIA GPUs with SM versions prior to sm90 (Hopper). @@ -22,12 +26,11 @@ void cutlass_scaled_mm_sm75_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm75_dispatch( + return cutlass_gemm_sm75_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm75_dispatch( + return cutlass_gemm_sm75_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -42,10 +45,10 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales); } } @@ -61,10 +64,10 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm75_epilogue( + return cutlass_scaled_mm_sm75_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } @@ -78,12 +81,11 @@ void cutlass_scaled_mm_sm80_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm80_dispatch( + return cutlass_gemm_sm80_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm80_dispatch( + return cutlass_gemm_sm80_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -98,10 +100,10 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales); } } @@ -117,10 +119,10 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm80_epilogue( + return cutlass_scaled_mm_sm80_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } @@ -134,13 +136,12 @@ void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kInt8); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm89_int8_dispatch( + return cutlass_gemm_sm89_int8_dispatch( out, a, b, std::forward(epilogue_args)...); } else { assert(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm89_int8_dispatch( + return cutlass_gemm_sm89_int8_dispatch( out, a, b, std::forward(epilogue_args)...); } } else { @@ -148,13 +149,13 @@ void cutlass_scaled_mm_sm89_epilogue(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn); if (out.dtype() == torch::kBFloat16) { - return vllm::cutlass_gemm_sm89_fp8_dispatch< - cutlass::float_e4m3_t, cutlass::bfloat16_t, Epilogue>( + return cutlass_gemm_sm89_fp8_dispatch( out, a, b, std::forward(epilogue_args)...); } else { TORCH_CHECK(out.dtype() == torch::kFloat16); - return vllm::cutlass_gemm_sm89_fp8_dispatch( + return cutlass_gemm_sm89_fp8_dispatch( out, a, b, std::forward(epilogue_args)...); } } @@ -170,10 +171,10 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == out.dtype(), "currently bias dtype must match output dtype ", out.dtype()); - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales); } } @@ -189,10 +190,10 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm89_epilogue( + return cutlass_scaled_mm_sm89_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } diff --git a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh index 6329ff636..d03242f44 100644 --- a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh +++ b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh @@ -21,7 +21,6 @@ #include "cutlass/epilogue/threadblock/fusion/visitors.hpp" #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" -#include "broadcast_load_epilogue_c2x.hpp" #include "common.hpp" // clang-format on @@ -71,307 +70,6 @@ struct enable_sm89_to_sm90 : Kernel { #endif } }; - -/* - * This class provides the common load descriptors for the - * ScaledEpilogue[...] classes - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; - - template - using ColOrScalarLoad = - cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< - OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; - - template - using RowOrScalarLoad = - cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - template - using ColLoad = cutlass::epilogue::threadblock::VisitorColBroadcast< - OutputTileThreadMap, T, Stride, Int<0>, Int<0>>>; - - template - using RowLoad = cutlass::epilogue::threadblock::VisitorRowBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - template - using RowOrZeroLoad = - cutlass::epilogue::threadblock::VisitorRowOrZeroBroadcast< - OutputTileThreadMap, T, Stride, Int<1>, Int<0>>>; - - // This utility function constructs the arguments for the load descriptors - // from a tensor. It can handle both row and column, as well as row/column or - // scalar cases. - template - static auto args_from_tensor(torch::Tensor const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = static_cast(tensor.data_ptr()); - if constexpr (std::is_same_v> || - std::is_same_v>) { - return Arguments{data_ptr, tensor.numel() != 1}; - } else { - // it would technically work but no use case as data_ptr is never nullptr - static_assert(!std::is_same_v>); - return Arguments{data_ptr}; - } - } - - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template - static auto args_from_tensor(c10::optional const& tensor) { - static_assert(std::is_same_v>); - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - return Arguments{data_ptr}; - } -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch._scaled_mm. - - A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or - per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; - } -}; - -/* - * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. - * This bias can also be used in the per-tensor azp case, where the activation - * zero point (azp) is used to compute an azp correction term, - * which is folded into the bias. - * - * The bias tensor must be per-output channel. - * ScaleA and ScaleB can be per-tensor or per-token/per-channel. - */ -template -struct ScaledEpilogueBias - : protected ScaledEpilogueBase { - protected: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::threadblock::Sm80EVT; - - using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; - using ArgumentType = typename EVTCompute::Arguments; - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; - } -}; - -/* - * This epilogue directly supports per-tensor azp in int32 form. - * As opposed to the per-token epilogue below, this epilogue only has an azp_adj - * term, which should already be multiplied with the scalar azp. - * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzp - : protected ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowOrZeroLoad; - - // This is the full AZP term, azp * J @ B, shape (1,n) - using AzpWithAdj = typename SUPER::template RowLoad; - - // Compute float(accum - azp_adj), both operands are int32_t - using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -/* - * This epilogue supports per-token azp by computing and applying - * the correction term using a rank-1 update. If the term were materialized, - * it would require O(m*n) space, and this way it only requires O(m+n) space. - * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero - * point for each row of A. - * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzpToken - : protected ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowOrZeroLoad; - - // Per-token azp term, shape (m,1) - using Azp = typename SUPER::template ColLoad; - - // This is the AZP adjustment term, J @ B, shape (1,n) - using AzpAdj = typename SUPER::template RowLoad; - - // Compute azp * azp_adj - using ComputeAzp = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, int32_t, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::threadblock::Sm80EVT; - - // Compute float(accum - azp*azp_adj), all operands are int32_t - using ComputeAcc = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAcc = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleB = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::threadblock::Sm80EVT; - - using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::threadblock::Sm80EVT; - - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_args = SUPER::template args_from_tensor(azp); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - template typename ArchGuard, typename ElementAB_, typename ElementD_, template typename Epilogue_, typename TileShape, diff --git a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 292c9e4b3..33581a63d 100644 --- a/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/vllm/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -23,11 +23,12 @@ #include "cutlass/epilogue/collective/collective_builder.hpp" #include "cutlass/gemm/collective/collective_builder.hpp" -#include "broadcast_load_epilogue_c3x.hpp" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" #include "common.hpp" // clang-format on using namespace cute; +using namespace vllm; /* This file defines quantized GEMM operations using the CUTLASS 3.x API, for @@ -56,305 +57,6 @@ struct enable_sm90_or_later : Kernel { #endif } }; - -/* - * This class provides the common load descriptors for the - * ScaledEpilogue[...] classes - */ -template -struct ScaledEpilogueBase { - protected: - using Accum = cutlass::epilogue::fusion::Sm90AccFetch; - - template - using ColOrScalarLoad = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>>; - - template - using RowOrScalarLoad = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>>; - - // Don't want to support nullptr by default - template - using ColLoad = cutlass::epilogue::fusion::Sm90ColBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<0>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // Don't want to support nullptr by default - template - using RowLoad = cutlass::epilogue::fusion::Sm90RowBroadcast< - 0 /*Stages*/, typename EpilogueDescriptor::TileShape, T, - Stride, Int<1>, Int<0>>, 128 / sizeof_bits_v, EnableNullPtr>; - - // This utility function constructs the arguments for the load descriptors - // from a tensor. It can handle both row and column, as well as row/column or - // scalar cases. - template - static auto args_from_tensor(torch::Tensor const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = static_cast(tensor.data_ptr()); - if constexpr (std::is_same_v> || - std::is_same_v>) { - return Arguments{data_ptr, tensor.numel() != 1}; - } else { - static_assert(!std::is_same_v> && - !std::is_same_v>); - return Arguments{data_ptr}; - } - } - - // This overload handles the case where there might not be a tensor, in which - // case a nullptr is passed and a constant (0) is used. - template - static auto args_from_tensor(c10::optional const& tensor) { - using Arguments = typename Descriptor::Arguments; - auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; - static_assert(std::is_same_v> || - std::is_same_v>); - return Arguments{data_ptr}; - } -}; - -/* - This epilogue function defines a quantized GEMM operation similar to - torch.scaled_mm_. - - A and B may be both either int8 or fp8_e4m3. A can be - quantized per-tensor or per-row. B can be quantized per-tensor or per-column. - Any combination of per-tensor and per-row or column is supported. - A and B must have symmetric quantization (zero point == 0). - - So the GEMM operation is D = (a_scales * A) (b_scales * B), where the - scales are applied elementwise with numpy-style broadcasting. - - ScaleA and ScaleB define the epilogue functions that apply the scales for - the A and B operands respectively. These scales may be either per-tensor or - per row or column. -*/ -template -struct ScaledEpilogue - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args}; - } -}; - -/* - * This epilogue performs the same operation as ScaledEpilogue, but adds a bias. - * This bias can also be used in the per-tensor azp case, where the activation - * zero point (azp) is used to compute an azp correction term, - * which is folded into the bias. - * - * The bias tensor must be per-output channel. - * ScaleA and ScaleB can be per-tensor or per-token/per-channel. - */ -template -struct ScaledEpilogueBias - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - using Compute0 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTCompute0 = - cutlass::epilogue::fusion::Sm90EVT; - - using Compute1 = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - - using ArgumentType = typename EVTCompute::Arguments; - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - - typename EVTCompute0::Arguments evt0_args{b_args}; - return ArgumentType{a_args, evt0_args, bias_args}; - } -}; - -/* - * This epilogue directly supports per-tensor azp in int32 form. - * As opposed to the per-token epilogue below, this epilogue only has an azp_adj - * term, which should already be multiplied with the scalar azp. - * The azp_adj term is a 1D tensor of shape (1,n), computed as azp * J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzp - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // This is the full AZP term, azp * J @ B, shape (1,n) - using AzpWithAdj = typename SUPER::template RowLoad; - - // Compute float(accum - azp_adj), both operands are int32_t - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{{}, azp_adj_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_azp_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - -/* - * This epilogue supports per-token azp by computing and applying - * the correction term using a rank-1 update. If the term were materialized, - * it would require O(m*n) space, and this way it only requires O(m+n) space. - * The azp term is a 1D tensor of shape (m,1), and represents the unscaled zero - * point for each row of A. - * The azp_adj term is a 1D tensor of shape (1,n), computed as J @ B. - * - * This epilogue also supports bias, which remains per-channel. - */ -template -struct ScaledEpilogueBiasAzpToken - : private ScaledEpilogueBase { - private: - using SUPER = ScaledEpilogueBase; - using Accum = typename SUPER::Accum; - using ScaleA = typename SUPER::template ColOrScalarLoad; - using ScaleB = typename SUPER::template RowOrScalarLoad; - using Bias = typename SUPER::template RowLoad; - - // Per-token azp term, shape (m,1) - using Azp = typename SUPER::template ColLoad; - - // This is the AZP adjustment term, J @ B, shape (1,n) - using AzpAdj = typename SUPER::template RowLoad; - - // Compute azp * azp_adj - using ComputeAzp = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, int32_t, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAzp = - cutlass::epilogue::fusion::Sm90EVT; - - // Compute float(accum - azp*azp_adj), all operands are int32_t - using ComputeAcc = cutlass::epilogue::fusion::Sm90Compute< - cutlass::minus, float, int32_t, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeAcc = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleB = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiplies, float, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - using EVTComputeScaleB = - cutlass::epilogue::fusion::Sm90EVT; - - using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute< - cutlass::multiply_add, ElementD, float, - cutlass::FloatRoundStyle::round_to_nearest>; - - public: - using EVTCompute = - cutlass::epilogue::fusion::Sm90EVT; - using ArgumentType = typename EVTCompute::Arguments; - - static ArgumentType prepare_args(torch::Tensor const& a_scales, - torch::Tensor const& b_scales, - torch::Tensor const& azp_adj, - torch::Tensor const& azp, - c10::optional const& bias) { - auto a_args = SUPER::template args_from_tensor(a_scales); - auto b_args = SUPER::template args_from_tensor(b_scales); - auto bias_args = SUPER::template args_from_tensor(bias); - auto azp_args = SUPER::template args_from_tensor(azp); - auto azp_adj_args = - SUPER::template args_from_tensor(azp_adj); - - typename EVTComputeAzp::Arguments evt_azp_args{azp_args, azp_adj_args}; - typename EVTComputeAcc::Arguments evt_acc_args{{}, evt_azp_args}; - typename EVTComputeScaleB::Arguments evt_scale_b_args{b_args, evt_acc_args}; - return ArgumentType{a_args, evt_scale_b_args, bias_args}; - } -}; - template typename Epilogue_, typename TileShape, typename ClusterShape, typename KernelSchedule, @@ -721,11 +423,11 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, if (bias) { TORCH_CHECK(bias->dtype() == c.dtype(), "currently bias dtype must match output dtype ", c.dtype()); - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( c, a, b, a_scales, b_scales, *bias); } else { - return cutlass_scaled_mm_sm90_epilogue(c, a, b, a_scales, - b_scales); + return cutlass_scaled_mm_sm90_epilogue( + c, a, b, a_scales, b_scales); } } @@ -740,10 +442,10 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (azp) { - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( out, a, b, a_scales, b_scales, azp_adj, *azp, bias); } else { - return cutlass_scaled_mm_sm90_epilogue( + return cutlass_scaled_mm_sm90_epilogue( out, a, b, a_scales, b_scales, azp_adj, bias); } } diff --git a/vllm/csrc/quantization/fp8/common.cu b/vllm/csrc/quantization/fp8/common.cu index f2c609c1b..e4f6615ed 100644 --- a/vllm/csrc/quantization/fp8/common.cu +++ b/vllm/csrc/quantization/fp8/common.cu @@ -1,185 +1,16 @@ -#include -#include -#include - -#include - -#include "cuda_compat.h" +#include "common.cuh" #include "dispatch_utils.h" +#include + #ifndef USE_ROCM - #include #include #else - #include #include #endif -#ifndef USE_ROCM -using FP8_TYPE = c10::Float8_e4m3fn; -C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = - std::numeric_limits::max(); -#else - #include "amd/hip_float8.h" -using FP8_TYPE = c10::Float8_e4m3fnuz; -// Using the default max value from pytorch (240.0) will cause accuracy -// issue when running dynamic quantization. Here use 224.0f for rocm. -constexpr auto FP8_E4M3_MAX = 224.0f; -#endif - namespace vllm { -__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { - float old; - old = (value >= 0) - ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) - : __uint_as_float( - atomicMin((unsigned int*)addr, __float_as_uint(value))); - - return old; -} - -template -__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, - float const scale) { - float x = 0.0f; - if constexpr (is_scale_inverted) { - x = val * scale; - } else { - x = val / scale; - } - - float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); -#ifndef USE_ROCM - return static_cast(r); -#else - // Use hardware cvt instruction for fp8 on rocm - return c10::Float8_e4m3fnuz(hip_fp8(r).data, - c10::Float8_e4m3fnuz::from_bits()); -#endif -} - -// Compute the absolute maximum m of the input tensor and store -// m / float8_e4m3::max() in *scale. Each thread block performs a -// reduction tree and the memory in scale is atomically updated. -// So to get the right answer, *scale needs to be initialized to -// a value <= 0.0 and we need to wait for all thread blocks to -// finish before consuming *scale. -template -__global__ void segmented_max_reduction(float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { - __shared__ float cache[1024]; - int64_t i = blockDim.x * blockIdx.x + threadIdx.x; - - // First store maximum for all values processes by - // the current thread in cache[threadIdx.x] - scalar_t tmp = 0.0; - while (i < num_elems) { - float x = static_cast(input[i]); - tmp = max(tmp, fabs(x)); - i += blockDim.x * gridDim.x; - } - cache[threadIdx.x] = tmp; - - __syncthreads(); - - // Now perform parallel reduction within the thread block - int ib = blockDim.x / 2; - while (ib != 0) { - if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; - } - __syncthreads(); - ib /= 2; - } - // Finally, since cache[0] contains the maximum for this thread block, - // atomically write the max to the target location - if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); - } -} - -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; -}; - -typedef struct __align__(4) { - FP8_TYPE x; - FP8_TYPE y; - FP8_TYPE z; - FP8_TYPE w; -} -float8x4_t; - -template -__device__ float thread_max_vec(scalar_t const* __restrict__ input, - int64_t const num_elems, int const tid, - int const step) { - // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - - int64_t const num_vec_elems = num_elems >> 2; - float absmax_val = 0.0f; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - absmax_val = max(absmax_val, fabs(in_vec.x)); - absmax_val = max(absmax_val, fabs(in_vec.y)); - absmax_val = max(absmax_val, fabs(in_vec.z)); - absmax_val = max(absmax_val, fabs(in_vec.w)); - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - absmax_val = max(absmax_val, fabs(input[i])); - } - - return absmax_val; -} - -template -__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, - scalar_t const* __restrict__ input, - float const scale, - int64_t const num_elems, - int const tid, int const step) { - // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - float8x4_t* vectorized_out = reinterpret_cast(out); - - int64_t const num_vec_elems = num_elems >> 2; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - float8x4_t out_vec; - - out_vec.x = scaled_fp8_conversion( - static_cast(in_vec.x), scale); - out_vec.y = scaled_fp8_conversion( - static_cast(in_vec.y), scale); - out_vec.z = scaled_fp8_conversion( - static_cast(in_vec.z), scale); - out_vec.w = scaled_fp8_conversion( - static_cast(in_vec.w), scale); - vectorized_out[i] = out_vec; - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - out[i] = scaled_fp8_conversion( - static_cast(input[i]), scale); - } -} - template __global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out, const scalar_t* __restrict__ input, diff --git a/vllm/csrc/quantization/fp8/common.cuh b/vllm/csrc/quantization/fp8/common.cuh new file mode 100644 index 000000000..d7c0297d5 --- /dev/null +++ b/vllm/csrc/quantization/fp8/common.cuh @@ -0,0 +1,172 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include +using FP8_TYPE = c10::Float8_e4m3fn; +C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = + std::numeric_limits::max(); +#else + #include + #include "amd/hip_float8.h" +using FP8_TYPE = c10::Float8_e4m3fnuz; +// Using the default max value from pytorch (240.0) will cause accuracy +// issue when running dynamic quantization. Here use 224.0f for rocm. +constexpr auto FP8_E4M3_MAX = 224.0f; +#endif + +namespace vllm { + +__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { + float old; + old = (value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int*)addr, __float_as_uint(value))); + + return old; +} + +template +__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, + float const scale) { + float x = 0.0f; + if constexpr (is_scale_inverted) { + x = val * scale; + } else { + x = val / scale; + } + + float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); +#ifndef USE_ROCM + return static_cast(r); +#else + // Use hardware cvt instruction for fp8 on rocm + return c10::Float8_e4m3fnuz(hip_fp8(r).data, + c10::Float8_e4m3fnuz::from_bits()); +#endif +} + +// Compute the absolute maximum m of the input tensor and store +// m / float8_e4m3::max() in *scale. Each thread block performs a +// reduction tree and the memory in scale is atomically updated. +// So to get the right answer, *scale needs to be initialized to +// a value <= 0.0 and we need to wait for all thread blocks to +// finish before consuming *scale. +template +__global__ void segmented_max_reduction(float* __restrict__ scale, + const scalar_t* __restrict__ input, + int64_t num_elems) { + __shared__ float cache[1024]; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; + + // First store maximum for all values processes by + // the current thread in cache[threadIdx.x] + scalar_t tmp = 0.0; + while (i < num_elems) { + float x = static_cast(input[i]); + tmp = max(tmp, fabs(x)); + i += blockDim.x * gridDim.x; + } + cache[threadIdx.x] = tmp; + + __syncthreads(); + + // Now perform parallel reduction within the thread block + int ib = blockDim.x / 2; + while (ib != 0) { + if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { + cache[threadIdx.x] = cache[threadIdx.x + ib]; + } + __syncthreads(); + ib /= 2; + } + // Finally, since cache[0] contains the maximum for this thread block, + // atomically write the max to the target location + if (threadIdx.x == 0) { + atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); + } +} + +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +typedef struct __align__(4) { + FP8_TYPE x; + FP8_TYPE y; + FP8_TYPE z; + FP8_TYPE w; +} +float8x4_t; + +template +__device__ float thread_max_vec(scalar_t const* __restrict__ input, + int64_t const num_elems, int const tid, + int const step) { + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + + int64_t const num_vec_elems = num_elems >> 2; + float absmax_val = 0.0f; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + absmax_val = max(absmax_val, fabs(in_vec.x)); + absmax_val = max(absmax_val, fabs(in_vec.y)); + absmax_val = max(absmax_val, fabs(in_vec.z)); + absmax_val = max(absmax_val, fabs(in_vec.w)); + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + absmax_val = max(absmax_val, fabs(input[i])); + } + + return absmax_val; +} + +template +__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, + scalar_t const* __restrict__ input, + float const scale, + int64_t const num_elems, + int const tid, int const step) { + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + float8x4_t* vectorized_out = reinterpret_cast(out); + + int64_t const num_vec_elems = num_elems >> 2; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + float8x4_t out_vec; + + out_vec.x = scaled_fp8_conversion( + static_cast(in_vec.x), scale); + out_vec.y = scaled_fp8_conversion( + static_cast(in_vec.y), scale); + out_vec.z = scaled_fp8_conversion( + static_cast(in_vec.z), scale); + out_vec.w = scaled_fp8_conversion( + static_cast(in_vec.w), scale); + vectorized_out[i] = out_vec; + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + out[i] = scaled_fp8_conversion( + static_cast(input[i]), scale); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/vllm/csrc/quantization/gguf/ggml-common.h b/vllm/csrc/quantization/gguf/ggml-common.h index fba94fd1d..d42205a65 100644 --- a/vllm/csrc/quantization/gguf/ggml-common.h +++ b/vllm/csrc/quantization/gguf/ggml-common.h @@ -1,7 +1,7 @@ // copied from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-common.h #define QK_K 256 #define K_QUANTS_PER_ITERATION 2 -#define WARP_SIZE 32 +#define WARP_SIZE_GGUF 32 #define K_SCALE_SIZE 12 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 #define CUDA_QUANTIZE_BLOCK_SIZE 256 @@ -1112,4 +1112,19 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #endif return c; } + +static __device__ __forceinline__ uint32_t __vcmpeq4(const uint32_t a, const uint32_t b) { + uint32_t neq = a^b; + return !(neq & 0xff000000) * 0xff000000 | + !(neq & 0x00ff0000) * 0x00ff0000 | + !(neq & 0x0000ff00) * 0x0000ff00 | + !(neq & 0x000000ff) * 0x000000ff; +} + +static __device__ __forceinline__ uint32_t __vsub4(const uint32_t a, const uint32_t b) { + return (static_cast(((a & 0xff000000) >> 24) - ((b & 0xff000000) >> 24)) << 24) + + (static_cast(((a & 0x00ff0000) >> 16) - ((b & 0x00ff0000) >> 16)) << 16) + + (static_cast(((a & 0x0000ff00) >> 8) - ((b & 0x0000ff00) >> 8)) << 8) + + (static_cast(((a & 0x000000ff) >> 0) - ((b & 0x000000ff) >> 0)) << 0); +} #endif // defined(USE_ROCM) diff --git a/vllm/csrc/quantization/gguf/gguf_kernel.cu b/vllm/csrc/quantization/gguf/gguf_kernel.cu index 37e4de4e1..5f0eaf5a9 100644 --- a/vllm/csrc/quantization/gguf/gguf_kernel.cu +++ b/vllm/csrc/quantization/gguf/gguf_kernel.cu @@ -4,6 +4,8 @@ #include #include +#include "cuda_compat.h" + #include "ggml-common.h" #include "vecdotq.cuh" #include "dequantize.cuh" @@ -32,8 +34,8 @@ static __global__ void quantize_q8_1(const half* __restrict__ x, #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { - amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); - sum += __shfl_xor_sync(0xffffffff, sum, mask, 32); + amax = fmaxf(amax, VLLM_SHFL_XOR_SYNC_WIDTH(amax, mask, 32)); + sum += VLLM_SHFL_XOR_SYNC_WIDTH(sum, mask, 32); } const float d = amax / 127; diff --git a/vllm/csrc/quantization/gguf/mmq.cuh b/vllm/csrc/quantization/gguf/mmq.cuh index d13efd596..c935faa07 100644 --- a/vllm/csrc/quantization/gguf/mmq.cuh +++ b/vllm/csrc/quantization/gguf/mmq.cuh @@ -10,7 +10,7 @@ static __device__ __forceinline__ void mul_mat_q( const int blocks_per_row_x = ncols_x / qk; const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE / qi; + const int blocks_per_warp = WARP_SIZE_GGUF / qi; const int & ncols_dst = ncols_y; @@ -27,10 +27,10 @@ static __device__ __forceinline__ void mul_mat_q( allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + __shared__ int tile_y_qs[mmq_x * WARP_SIZE_GGUF]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE_GGUF/QI8_1]; - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + float sum[mmq_y/WARP_SIZE_GGUF][mmq_x/nwarps] = {{0.0f}}; for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { @@ -39,26 +39,26 @@ static __device__ __forceinline__ void mul_mat_q( #pragma unroll for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x; const int kbxd = kqs / QI8_1; #pragma unroll for (int i = 0; i < mmq_x; i += nwarps) { const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + const int index_y = (threadIdx.y + i) * WARP_SIZE_GGUF + kqs % WARP_SIZE_GGUF; tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); } #pragma unroll for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); const int col_y_eff = min(col_y_0 + ids, ncols_y-1); // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE_GGUF/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE_GGUF/QI8_1) + kby]; if (need_sum) { *dsi_dst = *dsi_src; } else { @@ -70,12 +70,12 @@ static __device__ __forceinline__ void mul_mat_q( __syncthreads(); // #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { + for (int k = ir*WARP_SIZE_GGUF/qr; k < (ir+1)*WARP_SIZE_GGUF/qr; k += vdr) { #pragma unroll for (int j = 0; j < mmq_x; j += nwarps) { #pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/nwarps] += vec_dot( + for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { + sum[i/WARP_SIZE_GGUF][j/nwarps] += vec_dot( tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, threadIdx.x + i, threadIdx.y + j, k); } @@ -93,12 +93,12 @@ static __device__ __forceinline__ void mul_mat_q( } #pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { + for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { const int row_dst = row_dst_0 + threadIdx.x + i; if (row_dst >= nrows_dst) { continue; } - dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE][j/nwarps]); + dst[col_dst*nrows_dst + row_dst] = __float2half(sum[i/WARP_SIZE_GGUF][j/nwarps]); } } } @@ -115,7 +115,7 @@ static __device__ __forceinline__ void mul_mat_q( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_0, 2) #endif mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -140,7 +140,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -165,7 +165,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_1, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_1, 2) #endif mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -190,7 +190,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -215,7 +215,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_0, 2) #endif mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -240,7 +240,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -265,7 +265,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_1, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_1, 2) #endif mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -289,7 +289,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -314,7 +314,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q8_0, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q8_0, 2) #endif mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -338,7 +338,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -363,7 +363,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q2_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q2_K, 2) #endif mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -387,7 +387,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -412,7 +412,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q3_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q3_K, 2) #endif mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -438,7 +438,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -463,7 +463,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q4_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q4_K, 2) #endif mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -487,7 +487,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -512,7 +512,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q5_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q5_K, 2) #endif mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -537,7 +537,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; @@ -562,7 +562,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( template static __global__ void #if defined(USE_ROCM) -__launch_bounds__(WARP_SIZE*NWARPS_Q6_K, 2) +__launch_bounds__(WARP_SIZE_GGUF*NWARPS_Q6_K, 2) #endif mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, half * __restrict__ dst, @@ -586,7 +586,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); + const dim3 block_dims(WARP_SIZE_GGUF, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; diff --git a/vllm/csrc/quantization/gguf/mmvq.cuh b/vllm/csrc/quantization/gguf/mmvq.cuh index b221ae789..b01e93980 100644 --- a/vllm/csrc/quantization/gguf/mmvq.cuh +++ b/vllm/csrc/quantization/gguf/mmvq.cuh @@ -28,8 +28,8 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * // sum up partial sums and write back result #pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32); + for (int mask = WARP_SIZE/2; mask > 0; mask >>= 1) { + tmp += VLLM_SHFL_XOR_SYNC(tmp, mask); } if (threadIdx.x == 0) { diff --git a/vllm/csrc/quantization/gguf/vecdotq.cuh b/vllm/csrc/quantization/gguf/vecdotq.cuh index d5af345a6..e00422637 100644 --- a/vllm/csrc/quantization/gguf/vecdotq.cuh +++ b/vllm/csrc/quantization/gguf/vecdotq.cuh @@ -43,7 +43,7 @@ static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( const int * v, const int * u, const float & d4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -68,7 +68,7 @@ template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( const int * v, const int * u, const half2 & dm4, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -95,7 +95,7 @@ template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -128,7 +128,7 @@ template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -162,7 +162,7 @@ template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( const int * v, const int * u, const float & d8_0, const float & d8_1) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -176,7 +176,7 @@ template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( const int * v, const int * u, const half2 & dm8, const half2 & ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; @@ -202,7 +202,7 @@ template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, const half2 & dm2, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -230,7 +230,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, const half2 & dm2, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi_d = 0; int sumi_m = 0; @@ -267,7 +267,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, const int & scale_offset, const float & d3, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf = 0.0f; @@ -301,7 +301,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d3, const float & d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM int sumi = 0; #pragma unroll @@ -326,7 +326,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -351,7 +351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -382,7 +382,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -413,7 +413,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -445,7 +445,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, const float & d, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf = 0.0f; #pragma unroll @@ -465,7 +465,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, const float & d6, const float * __restrict__ d8) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM float sumf_d = 0.0f; #pragma unroll @@ -507,8 +507,8 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI4_0) + mmq_y/QI4_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; } @@ -529,11 +529,11 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbx] = bxi->d; } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_0; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -543,7 +543,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i / QI4_0 + kbxd] = __half2float(bxi->d); } } @@ -559,13 +559,13 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_0) % WARP_SIZE_GGUF]; } return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dmf[i * (WARP_SIZE_GGUF/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q4_1_q8_1( @@ -587,8 +587,8 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_1) + mmq_y/QI4_1]; *x_ql = tile_x_qs; *x_dm = tile_x_dm; } @@ -608,10 +608,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_1; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -621,7 +621,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; } } @@ -634,13 +634,13 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI4_1) % WARP_SIZE_GGUF]; } return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], u, x_dm[i * (WARP_SIZE_GGUF/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( @@ -664,8 +664,8 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI5_0) + mmq_y/QI5_0]; *x_ql = tile_x_ql; *x_dm = (half2 *) tile_x_d; @@ -697,7 +697,7 @@ template static __device__ __forceinlin qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; int qs1 = (ql >> 4) & 0x0F0F0F0F; qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 @@ -706,10 +706,10 @@ template static __device__ __forceinlin qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_0; const int kbxd = k % blocks_per_tile_x_row; float * x_dmf = (float *) x_dm; @@ -722,7 +722,7 @@ template static __device__ __forceinlin } const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI5_0) + i / QI5_0 + kbxd] = __half2float(bxi->d); } } @@ -730,7 +730,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const int index_bx = i * (WARP_SIZE_GGUF/QI5_0) + i/QI5_0 + k/QI5_0; const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -738,12 +738,12 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_0) % WARP_SIZE_GGUF]; } return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( @@ -767,8 +767,8 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_1) + mmq_y/QI5_1]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -801,7 +801,7 @@ template static __device__ __forceinlin qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+0] = qs0; int qs1 = (ql >> 4) & 0x0F0F0F0F; qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 @@ -809,10 +809,10 @@ template static __device__ __forceinlin qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2*k+1] = qs1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_1; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -825,7 +825,7 @@ template static __device__ __forceinlin const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; } } @@ -833,18 +833,18 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + const int index_bx = i * (WARP_SIZE_GGUF/QI5_1) + + i/QI5_1 + k/QI5_1; int u[2*VDR_Q5_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l) % WARP_SIZE_GGUF]; + u[2*l+1] = y_qs[j * WARP_SIZE_GGUF + (kyqs + l + QI5_1) % WARP_SIZE_GGUF]; } return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i * (2*WARP_SIZE_GGUF + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE_GGUF/QI8_1) + (2*k/QI8_1) % (WARP_SIZE_GGUF/QI8_1)]); } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( @@ -865,8 +865,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE_GGUF/QI8_0) + mmq_y/QI8_0]; *x_ql = tile_x_qs; *x_dm = (half2 *) tile_x_d; @@ -889,10 +889,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_int8(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI8_0; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -903,7 +903,7 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i / QI8_0 + kbxd] = __half2float(bxi->d); } } @@ -914,8 +914,8 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const float * y_df = (const float *) y_ds; return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); + (&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[j * WARP_SIZE_GGUF + k], x_dmf[i * (WARP_SIZE_GGUF/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE_GGUF/QI8_1) + k/QI8_1]); } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( @@ -942,9 +942,9 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -967,10 +967,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI2_K; const int kbxd = k % blocks_per_tile_x_row; #pragma unroll @@ -981,18 +981,18 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i / QI2_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); if (need_check) { i = min(i, i_max); } - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI2_K/4); + x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); } } @@ -1005,7 +1005,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); #pragma unroll @@ -1013,10 +1013,10 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; } - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (QR2_K*k) % WARP_SIZE_GGUF; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE_GGUF/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q3_K_q8_1( @@ -1047,10 +1047,10 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; - __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE_GGUF/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/4) + mmq_y/4]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1073,10 +1073,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI3_K; const int kbxd = k % blocks_per_tile_x_row; float * x_dmf = (float *) x_dm; @@ -1087,27 +1087,27 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i / QI3_K + kbxd] = __half2float(bxi->d); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + int i = i0 + i_offset * 2 + k / (WARP_SIZE_GGUF/2); if (need_check) { i = min(i, i_max); } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/2)) / (QI3_K/2); // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + x_qh[i * (WARP_SIZE_GGUF/2) + i / 2 + k % (WARP_SIZE_GGUF/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + int i = i0 + i_offset * 4 + k / (WARP_SIZE_GGUF/4); if (need_check) { i = min(i, i_max); } - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/4)) / (QI3_K/4); const int ksc = k % (QI3_K/4); @@ -1121,7 +1121,7 @@ template static __device__ __forceinlin const int sc = __vsubss4(sc_low | sc_high, 0x20202020); - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + x_sc[i * (WARP_SIZE_GGUF/4) + i / 4 + k % (WARP_SIZE_GGUF/4)] = sc; } } @@ -1134,24 +1134,24 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE_GGUF/4) + i/4 + kbx*4)) + ky/4; int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int kqsx = i * (WARP_SIZE_GGUF + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); const int shift = 2 * ((ky % 32) / 8); const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vh = x_qh[i * (WARP_SIZE_GGUF/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); const int vlh = (vh << 2) & 0x04040404; v[l] = __vsubss4(vll, vlh); } - const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (k*QR3_K) % WARP_SIZE_GGUF; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE_GGUF/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q4_K_q8_1( @@ -1200,9 +1200,9 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1225,10 +1225,10 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + x_ql[i * (WARP_SIZE_GGUF + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); } - const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI4_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll @@ -1238,27 +1238,27 @@ template static __device__ __forceinlin i = min(i, i_max); } const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i / QI4_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI4_K/8); const int * scales = (const int *) bxi->scales; - const int ksc = k % (WARP_SIZE/8); + const int ksc = k % (WARP_SIZE_GGUF/8); // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; } } @@ -1267,11 +1267,11 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { (void)x_qh; - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2*((k % 16) / 8); - const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + const int index_y = j * WARP_SIZE_GGUF + (QR4_K*k) % WARP_SIZE_GGUF; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE_GGUF + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE_GGUF/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( @@ -1321,9 +1321,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1360,11 +1360,11 @@ template static __device__ __forceinlin const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = ql1 | qh1; } - const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI5_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 #pragma unroll @@ -1376,40 +1376,40 @@ template static __device__ __forceinlin } const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; + x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i / QI5_K + kbxd] = bxi->dm; } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / (QI5_K/8); const int * scales = (const int *) bxi->scales; - const int ksc = k % (WARP_SIZE/8); + const int ksc = k % (WARP_SIZE_GGUF/8); // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + ksc] = scales8; } } static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + const int index_x = i * (QR5_K*WARP_SIZE_GGUF + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE_GGUF + (QR5_K*k) % WARP_SIZE_GGUF; return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + x_dm[i * (WARP_SIZE_GGUF/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( @@ -1439,9 +1439,9 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE_GGUF) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE_GGUF/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE_GGUF/8) + mmq_y/8]; *x_ql = tile_x_ql; *x_dm = tile_x_dm; @@ -1478,11 +1478,11 @@ template static __device__ __forceinlin const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE_GGUF + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); } - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int blocks_per_tile_x_row = WARP_SIZE_GGUF / QI6_K; // == 1 if QK_K == 256 const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 float * x_dmf = (float *) x_dm; @@ -1496,20 +1496,20 @@ template static __device__ __forceinlin const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d); + x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i / QI6_K + kbxd] = __half2float(bxi->d); } #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + int i = (i0 + i_offset * 8 + k / (WARP_SIZE_GGUF/8)) % mmq_y; if (need_check) { i = min(i, i_max); } - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE_GGUF/8)) / 4; - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + x_sc[i * (WARP_SIZE_GGUF/8) + i / 8 + k % (WARP_SIZE_GGUF/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); } } @@ -1519,11 +1519,11 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE_GGUF/8) + i/8 + k/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); + const int index_x = i * (QR6_K*WARP_SIZE_GGUF + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE_GGUF + (QR6_K*k) % WARP_SIZE_GGUF; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE_GGUF/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); } static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( @@ -1582,7 +1582,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq2_s * bq2 = (const block_iq2_s *) vbq; const int ib32 = iqs; @@ -1619,7 +1619,7 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; const int ib32 = iqs; @@ -1646,7 +1646,7 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq3_s * bq2 = (const block_iq3_s *) vbq; const int ib32 = iqs; @@ -1671,7 +1671,7 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int qs_packed = get_int_b2(bq1->qs, iqs); @@ -1703,7 +1703,7 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq1_m * bq1 = (const block_iq1_m *) vbq; @@ -1763,7 +1763,7 @@ static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4 static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq4_nl * bq = (const block_iq4_nl *) vbq; @@ -1788,7 +1788,7 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610 || defined USE_ROCM const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; diff --git a/vllm/csrc/quantization/gptq_marlin/gptq_marlin.cu b/vllm/csrc/quantization/gptq_marlin/gptq_marlin.cu index 6dbf9594e..0c698ced7 100644 --- a/vllm/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/vllm/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -54,9 +54,10 @@ template shared // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale + const bool has_act_order, // whether act_order is enabled + const int group_blocks = -1, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? > __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk @@ -82,7 +83,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, torch::Tensor& workspace, vllm::ScalarTypeId const b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, - bool is_k_full, bool has_zp) { + bool is_k_full, bool has_zp, bool is_zp_float) { TORCH_CHECK_NOT_IMPLEMENTED(false, "marlin_gemm(..) requires CUDA_ARCH >= 8.0"); return torch::empty({1, 1}); @@ -516,10 +517,11 @@ template shared // fetch pipeline - const bool has_act_order, // whether act_order is enabled - const bool has_zp, // whether zero-points are enabled - const int group_blocks = -1 // number of consecutive 16x16 blocks - // with a separate quantization scale + const bool has_act_order, // whether act_order is enabled + const bool has_zp, // whether zero-points are enabled + const int group_blocks = -1, // number of consecutive 16x16 blocks + // with a separate quantization scale + const bool is_zp_float // is zero point of float16 type? > __global__ void Marlin( const int4* __restrict__ A, // fp16 input matrix of shape mxk @@ -692,8 +694,10 @@ __global__ void Marlin( int act_s_col_tb_stride = act_s_col_warp_stride * tb_n_warps; // Zero-points sizes/strides - int zp_gl_stride = (prob_n / pack_factor) / 4; - constexpr int zp_sh_stride = ((16 * thread_n_blocks) / pack_factor) / 4; + int zp_gl_stride = is_zp_float ? prob_n / 8 : (prob_n / pack_factor) / 4; + constexpr int zp_sh_stride = is_zp_float + ? 16 * thread_n_blocks / 8 + : ((16 * thread_n_blocks) / pack_factor) / 4; constexpr int zp_tb_groups = s_tb_groups; constexpr int zp_sh_stage = has_zp ? zp_tb_groups * zp_sh_stride : 0; int zp_gl_rd_delta = zp_gl_stride; @@ -768,9 +772,16 @@ __global__ void Marlin( constexpr int num_ints_per_thread = 8 / pack_factor; int zp_sh_rd; if constexpr (has_zp) { - zp_sh_rd = num_ints_per_thread * num_col_threads * - ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + if constexpr (is_zp_float) { + if constexpr (group_blocks != -1) { + zp_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; + } + } else { + zp_sh_rd = num_ints_per_thread * num_col_threads * + ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + num_ints_per_thread * ((threadIdx.x % 32) / num_row_threads); + } } // Precompute which thread should not read memory in which iterations; this is @@ -832,6 +843,7 @@ __global__ void Marlin( FragS act_frag_s[2][4][4]; // For act-order int frag_qzp[2][num_ints_per_thread]; // Zero-points FragZP frag_zp; // Zero-points in fp16 + FragZP frag_zpf[2]; // Zero-points in fp16 in HQQ // Zero accumulators. auto zero_accums = [&]() { @@ -1126,7 +1138,7 @@ __global__ void Marlin( // has_zp implies AWQ, which doesn't have act_order, static_assert(!has_zp || group_blocks != 0); - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { int pipe = full_pipe % stages; if constexpr (group_blocks == -1) { @@ -1170,11 +1182,44 @@ __global__ void Marlin( } } } + + else if constexpr (has_zp && is_zp_float) { + int pipe = full_pipe % stages; + + if constexpr (group_blocks != -1) { + if constexpr (group_blocks >= thread_k_blocks) { + int4* sh_zp_stage = + sh_zp + zp_sh_stage * ((group_blocks / thread_k_blocks) * + (pipe / (group_blocks / thread_k_blocks))); + reinterpret_cast(&frag_zpf[k % 2])[0] = sh_zp_stage[zp_sh_rd]; + } else { + int warp_id = threadIdx.x / 32; + int n_warps = thread_n_blocks / 4; + + int warp_row = warp_id / n_warps; + + int cur_k = warp_row * 16; + cur_k += k_iter_size * (k % b_sh_wr_iters); + + int k_blocks = cur_k / 16; + // Suppress bogus and persistent divide-by-zero warning + #pragma nv_diagnostic push + #pragma nv_diag_suppress divide_by_zero + int cur_group_id = k_blocks / group_blocks; + #pragma nv_diagnostic pop + + int4* sh_zp_stage = sh_zp + zp_sh_stage * pipe; + + reinterpret_cast(&frag_zpf[k % 2])[0] = + sh_zp_stage[zp_sh_rd + cur_group_id * zp_sh_stride]; + } + } + } }; // Execute the actual tensor core matmul of a sub-tile. auto matmul = [&](int k) { - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { FragB frag_zp_0; FragB frag_zp_1; int zp_quant_0, zp_quant_1; @@ -1219,10 +1264,14 @@ __global__ void Marlin( frag_b1 = dequant(b_quant_1); // Apply zero-point to frag_b0 - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { sub_zp(frag_b0, frag_zp[j], 0); } + else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + sub_zp(frag_b0, frag_zpf[k % 2][j], 0); + } + // Apply scale to frag_b0 if constexpr (has_act_order) { scale4(frag_b0, act_frag_s[k % 2][0][j], @@ -1235,10 +1284,14 @@ __global__ void Marlin( } // Apply zero-point to frag_b1 - if constexpr (has_zp) { + if constexpr (has_zp && !is_zp_float) { sub_zp(frag_b1, frag_zp[j], 1); } + else if constexpr (has_zp && is_zp_float && group_blocks != -1) { + sub_zp(frag_b1, frag_zpf[k % 2][j], 1); + } + // Apply scale to frag_b1 if constexpr (has_act_order) { scale4(frag_b1, act_frag_s[k % 2][0][j], @@ -1510,7 +1563,7 @@ __global__ void Marlin( fetch_scales_to_shared(true, g_idx[slice_k_start], g_idx[last_g_idx]); } - if constexpr (has_zp && group_blocks == -1) { + if constexpr (has_zp && !is_zp_float && group_blocks == -1) { if (i == 0) { fetch_zp_to_shared(); } @@ -1697,23 +1750,27 @@ __global__ void Marlin( } #define __CALL_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \ - HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS) \ + HAS_ACT_ORDER, HAS_ZP, GROUP_BLOCKS, NUM_THREADS, \ + IS_ZP_FLOAT) \ else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \ thread_n_blocks == THREAD_N_BLOCKS && \ thread_k_blocks == THREAD_K_BLOCKS && \ has_act_order == HAS_ACT_ORDER && has_zp == HAS_ZP && \ - group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS) { \ - cudaFuncSetAttribute( \ - Marlin, \ - cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ - Marlin \ - <<>>( \ - A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ - num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ + group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \ + is_zp_float == IS_ZP_FLOAT) { \ + if constexpr (!IS_ZP_FLOAT || std::is_same::value) { \ + cudaFuncSetAttribute( \ + Marlin, \ + cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \ + Marlin \ + <<>>( \ + A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, zp_ptr, g_idx_ptr, \ + num_groups, prob_m, prob_n, prob_k, locks, use_fp32_reduce); \ + } \ } typedef struct { @@ -1905,51 +1962,96 @@ exec_config_t determine_thread_config(int prob_m, int prob_n, int prob_k, } #define GPTQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, true, false, 0, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS) + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, false, 8, NUM_THREADS, \ + false) #define AWQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, \ + false) \ \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS) \ - __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS) + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, -1, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 2, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + false) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 8, NUM_THREADS, false) + + // We currently have 4-bit models only with group_blocks == 4 + #define HQQ_CALL_IF(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \ + __CALL_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, \ + true) \ + __CALL_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, true, 4, NUM_THREADS, true) template void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, @@ -1958,7 +2060,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, vllm::ScalarType const& q_type, bool has_act_order, bool is_k_full, bool has_zp, int num_groups, int group_size, int dev, cudaStream_t stream, int thread_k, int thread_n, - int sms, int max_par, bool use_fp32_reduce) { + int sms, int max_par, bool use_fp32_reduce, bool is_zp_float) { if (has_zp) { TORCH_CHECK( q_type == vllm::kU4 || q_type == vllm::kU8, @@ -2111,6 +2213,11 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s, AWQ_CALL_IF(vllm::kU8, 8, 8, 256) AWQ_CALL_IF(vllm::kU8, 8, 4, 128) AWQ_CALL_IF(vllm::kU8, 4, 8, 128) + + HQQ_CALL_IF(vllm::kU4, 16, 4, 256) + HQQ_CALL_IF(vllm::kU4, 8, 8, 256) + HQQ_CALL_IF(vllm::kU4, 8, 4, 128) + HQQ_CALL_IF(vllm::kU4, 4, 8, 128) else { TORCH_CHECK(false, "Unsupported shapes: MNK = [", prob_m, ", ", prob_n, ", ", prob_k, "]", ", has_act_order = ", has_act_order, @@ -2135,7 +2242,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, vllm::ScalarTypeId const& b_q_type_id, int64_t size_m, int64_t size_n, int64_t size_k, bool is_k_full, bool has_zp, - bool use_fp32_reduce) { + bool use_fp32_reduce, bool is_zp_float) { vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id); if (has_zp) { TORCH_CHECK( @@ -2148,6 +2255,12 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, b_q_type.str()); } + if (has_zp && is_zp_float) { + TORCH_CHECK(a.scalar_type() == at::ScalarType::Half, + "Computation type must be float16 (half) when using float zero " + "points."); + } + int pack_factor = 32 / b_q_type.size_bits(); // Verify A @@ -2257,12 +2370,22 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, if (has_zp) { int rank = b_zeros.sizes().size(); TORCH_CHECK(rank == 2, "b_zeros rank = ", rank, " is not 2"); - TORCH_CHECK(b_zeros.size(0) == num_groups, - "b_zeros dim 0 = ", b_zeros.size(0), - " is not num_groups = ", num_groups); - TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor, - "b_zeros dim 1 = ", b_zeros.size(1), - " is not size_n / pack_factor = ", size_n / pack_factor); + if (is_zp_float) { + TORCH_CHECK(b_zeros.size(1) == size_n, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not size_n = ", size_n); + TORCH_CHECK(num_groups == b_zeros.size(0), + "b_zeros dim 0 = ", b_zeros.size(0), + " is not num_groups = ", num_groups); + TORCH_CHECK(num_groups != -1, "num_groups must be != -1"); + } else { + TORCH_CHECK(b_zeros.size(0) == num_groups, + "b_zeros dim 0 = ", b_zeros.size(0), + " is not num_groups = ", num_groups); + TORCH_CHECK(b_zeros.size(1) == size_n / pack_factor, + "b_zeros dim 1 = ", b_zeros.size(1), + " is not size_n / pack_factor = ", size_n / pack_factor); + } } // Verify workspace size @@ -2282,7 +2405,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float); } else if (a.scalar_type() == at::ScalarType::BFloat16) { marlin::marlin_mm( a.data_ptr(), b_q_weight.data_ptr(), @@ -2291,7 +2414,7 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, perm.data_ptr(), a_tmp.data_ptr(), size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp, num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev), - thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce); + thread_k, thread_n, sms, marlin::max_par, use_fp32_reduce, is_zp_float); } else { TORCH_CHECK(false, "gpt_marlin_gemm only supports bfloat16 and float16"); } diff --git a/vllm/csrc/quantization/machete/generate.py b/vllm/csrc/quantization/machete/generate.py index ebbe76cfb..ac63afe79 100644 --- a/vllm/csrc/quantization/machete/generate.py +++ b/vllm/csrc/quantization/machete/generate.py @@ -3,8 +3,10 @@ import os import shutil from collections.abc import Iterable -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from copy import deepcopy +from dataclasses import dataclass, fields +from functools import reduce +from typing import Dict, List, Optional, Tuple, Union import jinja2 # yapf conflicts with isort for this block @@ -14,7 +16,10 @@ MixedInputKernelScheduleType, TileSchedulerTag, TileSchedulerType, VLLMDataType, - VLLMDataTypeNames, VLLMDataTypeTag, + VLLMDataTypeNames, + VLLMDataTypeSize, VLLMDataTypeTag, + VLLMDataTypeTorchDataTypeTag, + VLLMDataTypeVLLMScalarTypeTag, VLLMKernelScheduleTag) # yapf: enable @@ -27,49 +32,125 @@ #include "../machete_mm_launcher.cuh" namespace machete { -using GemmDispatcher_ = GemmDispatcher< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}>; // Zeropoints - -{% for s in schedules %}extern torch::Tensor -impl_{{type_name}}_sch_{{ gen_sch_name(s) }}(PyTorchArguments args); -{% endfor %} -template <> -torch::Tensor GemmDispatcher_::dispatch(PyTorchArguments args) { + +{% for impl_config in impl_configs %} +{% set type_sig = gen_type_sig(impl_config.types) -%} +{% for s in impl_config.schedules %} +extern torch::Tensor impl_{{type_sig}}_sch_{{gen_sch_sig(s)}}(MMArgs); +{%- endfor %} + +torch::Tensor mm_dispatch_{{type_sig}}(MMArgs args) { [[maybe_unused]] auto M = args.A.size(0); [[maybe_unused]] auto N = args.B.size(1); [[maybe_unused]] auto K = args.A.size(1); - if (!args.schedule) { - {%- for cond, s in heuristic %} + if (!args.maybe_schedule) { + {%- for cond, s in impl_config.heuristic %} {%if cond is not none%}if ({{cond}}) {%- else %}else {%- endif %} - return impl_{{ type_name }}_sch_{{ gen_sch_name(s) }}(args);{% endfor %} + return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args);{% endfor %} } - {% for s in schedules %} - if (*args.schedule == "{{ gen_sch_name(s) }}") { - return impl_{{ type_name }}_sch_{{ gen_sch_name(s) }}(args); - } - {% endfor %} + {%- for s in impl_config.schedules %} + if (*args.maybe_schedule == "{{ gen_sch_sig(s) }}") + return impl_{{type_sig}}_sch_{{ gen_sch_sig(s) }}(args); + {%- endfor %} TORCH_CHECK_NOT_IMPLEMENTED(false, "machete_gemm(..) is not implemented for " - "schedule = ", *args.schedule); + "schedule = ", *args.maybe_schedule); } +{%- endfor %} + -template <> -std::vector GemmDispatcher_::supported_schedules() { - return { - {% for s in schedules -%} - "{{ gen_sch_name(s) }}"{{ ", - " if not loop.last }}{%- endfor %} - }; +static inline std::optional maybe_scalartype( + c10::optional const& t) { + if (!t) { + return std::nullopt; + } else { + return t->scalar_type(); + }; +} + +torch::Tensor mm_dispatch(MMArgs args) { + auto out_type = args.maybe_out_type.value_or(args.A.scalar_type()); + auto a_type = args.A.scalar_type(); + auto maybe_g_scales_type = maybe_scalartype(args.maybe_group_scales); + auto maybe_g_zeros_type = maybe_scalartype(args.maybe_group_zeros); + auto maybe_ch_scales_type = maybe_scalartype(args.maybe_channel_scales); + auto maybe_tok_scales_type = maybe_scalartype(args.maybe_token_scales); + + {% for impl_config in impl_configs %} + {% set t = impl_config.types -%} + {% set type_sig = gen_type_sig(t) -%} + if (args.b_type == {{VLLMScalarTypeTag[t.b]}} + && a_type == {{TorchTypeTag[t.a]}} + && out_type == {{TorchTypeTag[t.out]}} + && {%if t.b_group_scale != void -%} + maybe_g_scales_type == {{TorchTypeTag[t.b_group_scale]}} + {%- else %}!maybe_g_scales_type{%endif%} + && {%if t.b_group_zeropoint != void -%} + maybe_g_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}} + {%- else %}!maybe_g_zeros_type{%endif%} + && {%if t.b_channel_scale != void -%} + maybe_ch_scales_type == {{TorchTypeTag[t.b_channel_scale]}} + {%- else %}!maybe_ch_scales_type{%endif%} + && {%if t.a_token_scale != void -%} + maybe_tok_scales_type == {{TorchTypeTag[t.a_token_scale]}} + {%- else %}!maybe_tok_scales_type{%endif%} + ) { + return mm_dispatch_{{type_sig}}(args); + } + {%- endfor %} + + TORCH_CHECK_NOT_IMPLEMENTED( + false, "machete_mm(..) is not implemented for " + "a_type=", args.A.scalar_type(), + ", b_type=", args.b_type.str(), + ", out_type=", out_type, + ", with_group_scale_type=", maybe_g_scales_type + ? toString(*maybe_g_scales_type) : "None", + ", with_group_zeropoint_type=", maybe_g_zeros_type + ? toString(*maybe_g_zeros_type) : "None", + ", with_channel_scale_type=", maybe_ch_scales_type + ? toString(*maybe_ch_scales_type) : "None", + ", with_token_scale_type=", maybe_tok_scales_type + ? toString(*maybe_tok_scales_type) : "None", + "; implemented types are: \\n", + {%- for impl_config in impl_configs %} + {% set t = impl_config.types -%} + "\\t{{gen_type_option_name(t)}}\\n", + {%- endfor %} + ""); } +std::vector supported_schedules_dispatch( + SupportedSchedulesArgs args) { + auto out_type = args.maybe_out_type.value_or(args.a_type); + + {% for impl_config in impl_configs %} + {% set t = impl_config.types -%} + {% set schs = impl_config.schedules -%} + if (args.b_type == {{VLLMScalarTypeTag[t.b]}} + && args.a_type == {{TorchTypeTag[t.a]}} + && out_type == {{TorchTypeTag[t.out]}} + && {%if t.b_group_scale != void -%} + args.maybe_group_scales_type == {{TorchTypeTag[t.b_group_scale]}} + {%- else %}!args.maybe_group_scales_type{%endif%} + && {%if t.b_group_zeropoint != void-%} + args.maybe_group_zeros_type == {{TorchTypeTag[t.b_group_zeropoint]}} + {%- else %}!args.maybe_group_zeros_type{%endif%} + ) { + return { + {%- for s in impl_config.schedules %} + "{{gen_sch_sig(s)}}"{% if not loop.last %},{% endif %} + {%- endfor %} + }; + } + {%- endfor %} + + return {}; +}; + }; // namespace machete """ @@ -77,20 +158,10 @@ #include "../machete_mm_launcher.cuh" namespace machete { -template -using Kernel = MacheteKernelTemplate< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}, // Zeropoints - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, - Config, with_C, with_scales, with_zeropoints>; - -{% for sch in schedules %} -{% set schedule_name = gen_sch_name(sch) -%} -struct sch_{{schedule_name}} { + +{% for sch in unique_schedules(impl_configs) %} +{% set sch_sig = gen_sch_sig(sch) -%} +struct sch_{{sch_sig}} { using TileShapeNM = Shape<{{ to_cute_constant(sch.tile_shape_mn)|join(', ')}}>; using ClusterShape = Shape<{{ @@ -101,27 +172,34 @@ using TileScheduler = {{TileSchedulerTag[sch.tile_scheduler]}}; using EpilogueTileType = cutlass::epilogue::collective::EpilogueTileAuto; }; - +{% endfor %} + +{% for impl_config in impl_configs %} +{% set t = impl_config.types -%} +{% set schs = impl_config.schedules -%} +{% set type_sig = gen_type_sig(t) -%} + +template +using Kernel_{{type_sig}} = MacheteKernelTemplate< + {{DataTypeTag[t.a]}}, // ElementA + {{DataTypeTag[t.b]}}, // ElementB + {{DataTypeTag[t.out]}}, // ElementD + {{DataTypeTag[t.accumulator]}}, // Accumulator + {{DataTypeTag[t.b_group_scale]}}, // GroupScaleT + {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT + {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT + {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT + cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, + Sch>; + +{% for sch in schs %} +{% set sch_sig = gen_sch_sig(sch) -%} torch::Tensor -impl_{{type_name}}_sch_{{schedule_name}}(PyTorchArguments args) { - bool with_C = args.C.has_value(), with_scales = args.scales.has_value(), - with_zeropoints = args.zeros.has_value(); - - {% for s in specializations %} - if (with_C == {{s.with_C|lower}} - && with_zeropoints == {{s.with_zeropoints|lower}} - && with_scales == {{s.with_scales|lower}}) { - return run_impl>(args); - }{% endfor %} - - TORCH_CHECK_NOT_IMPLEMENTED( - false, "for the sake of compile times and binary size machete_mm(..) is " - " not implemented for with_C=", with_C, ", with_scales=", with_scales, - ", with_zeropoints=", with_zeropoints, - " (for {{type_name}}_sch_{{schedule_name}})"); +impl_{{type_sig}}_sch_{{sch_sig}}(MMArgs args) { + return run_impl>(args); } -{% endfor %} +{%- endfor %} +{%- endfor %} }; // namespace machete """ @@ -130,26 +208,34 @@ #include "../machete_prepack_launcher.cuh" namespace machete { -using PrepackBDispatcher_ = PrepackBDispatcher< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - {{DataTypeTag[type_config.element_b_scale]}}, // Scales - {{DataTypeTag[type_config.element_b_zeropoint]}}>; // Zeropoints - -using PrepackedLayoutB = PrepackedLayoutBTemplate< - {{DataTypeTag[type_config.element_a]}}, // ElementA - {{DataTypeTag[type_config.element_b]}}, // ElementB - {{DataTypeTag[type_config.element_d]}}, // ElementD - {{DataTypeTag[type_config.accumulator]}}, // Accumulator - cutlass::layout::ColumnMajor, - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput>; - -template <> -torch::Tensor PrepackBDispatcher_::dispatch(torch::Tensor B) { - return prepack_impl(B); + +torch::Tensor prepack_B_dispatch(PrepackBArgs args) { + auto convert_type = args.maybe_group_scales_type.value_or(args.a_type); + {%- for t in types %} + {% set b_type = unsigned_type_with_bitwidth(t.b_num_bits) %} + if (args.a_type == {{TorchTypeTag[t.a]}} + && args.b_type.size_bits() == {{t.b_num_bits}} + && convert_type == {{TorchTypeTag[t.convert]}}) { + return prepack_impl< + PrepackedLayoutBTemplate< + {{DataTypeTag[t.a]}}, // ElementA + {{DataTypeTag[b_type]}}, // ElementB + {{DataTypeTag[t.convert]}}, // ElementConvert + {{DataTypeTag[t.accumulator]}}, // Accumulator + cutlass::layout::ColumnMajor, + cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput> + >(args.B); + } + {%- endfor %} + + TORCH_CHECK_NOT_IMPLEMENTED(false, + "prepack_B_dispatch(..) is not implemented for " + "atype = ", args.a_type, + ", b_type = ", args.b_type.str(), + ", with_group_scales_type= ", args.maybe_group_scales_type ? + toString(*args.maybe_group_scales_type) : "None"); } + }; // namespace machete """ @@ -166,32 +252,34 @@ class ScheduleConfig: tile_scheduler: TileSchedulerType -@dataclass +@dataclass(frozen=True) class TypeConfig: - element_a: DataType - element_b: Union[DataType, VLLMDataType] - element_b_scale: DataType - element_b_zeropoint: DataType - element_d: DataType + a: DataType + b: Union[DataType, VLLMDataType] + b_group_scale: DataType + b_group_zeropoint: DataType + b_channel_scale: DataType + a_token_scale: DataType + out: DataType accumulator: DataType -@dataclass -class Specialization: - with_C: bool - with_zeropoints: bool - with_scales: bool +@dataclass(frozen=True) +class PrepackTypeConfig: + a: DataType + b_num_bits: int + convert: DataType + accumulator: DataType @dataclass class ImplConfig: - type_config: TypeConfig - schedule_configs: List[ScheduleConfig] - specializations: List[Specialization] + types: TypeConfig + schedules: List[ScheduleConfig] heuristic: List[Tuple[Optional[str], ScheduleConfig]] -def generate_schedule_name(schedule_config: ScheduleConfig) -> str: +def generate_sch_sig(schedule_config: ScheduleConfig) -> str: tile_shape = ( f"{schedule_config.tile_shape_mn[0]}x{schedule_config.tile_shape_mn[1]}" ) @@ -209,40 +297,34 @@ def generate_schedule_name(schedule_config: ScheduleConfig) -> str: f"_{epilogue_schedule}_{tile_scheduler}") -# mostly unique shorter schedule_name -def generate_terse_schedule_name(schedule_config: ScheduleConfig) -> str: +# mostly unique shorter sch_sig +def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str: kernel_terse_names_replace = { "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_", "TmaWarpSpecializedCooperative_": "TmaCoop_", "StreamKScheduler": "streamK", } - schedule_name = generate_schedule_name(schedule_config) + sch_sig = generate_sch_sig(schedule_config) for orig, terse in kernel_terse_names_replace.items(): - schedule_name = schedule_name.replace(orig, terse) - return schedule_name + sch_sig = sch_sig.replace(orig, terse) + return sch_sig # unique type_name -def generate_type_signature(kernel_type_config: TypeConfig): - element_a = VLLMDataTypeNames[kernel_type_config.element_a] - element_b = VLLMDataTypeNames[kernel_type_config.element_b] - element_d = VLLMDataTypeNames[kernel_type_config.element_d] - accumulator = VLLMDataTypeNames[kernel_type_config.accumulator] - element_scale = VLLMDataTypeNames[kernel_type_config.element_b_scale] - element_zeropoint = VLLMDataTypeNames[ - kernel_type_config.element_b_zeropoint] - - return (f"{element_a}{element_b}{element_d}" - f"{accumulator}{element_scale}{element_zeropoint}") - +def generate_type_signature(kernel_types: TypeConfig): + return str("".join([ + VLLMDataTypeNames[getattr(kernel_types, field.name)] + for field in fields(TypeConfig) + ])) -# non-unique shorter type_name -def generate_terse_type_signature(kernel_type_config: TypeConfig): - element_a = VLLMDataTypeNames[kernel_type_config.element_a] - element_b = VLLMDataTypeNames[kernel_type_config.element_b] - return f"{element_a}{element_b}" +def generate_type_option_name(kernel_types: TypeConfig): + return ", ".join([ + f"{field.name.replace('b_', 'with_')+'_type'}=" + + VLLMDataTypeNames[getattr(kernel_types, field.name)] + for field in fields(TypeConfig) + ]) def is_power_of_two(n): @@ -263,13 +345,36 @@ def _to_cute_constant(value: int): return _to_cute_constant(value) +def unique_schedules(impl_configs: List[ImplConfig]): + return list( + set(sch for impl_config in impl_configs + for sch in impl_config.schedules)) + + +def unsigned_type_with_bitwidth(num_bits): + return { + 4: DataType.u4, + 8: DataType.u8, + 16: DataType.u16, + 32: DataType.u32, + 64: DataType.u64, + }[num_bits] + + template_globals = { + "void": DataType.void, "DataTypeTag": VLLMDataTypeTag, + "VLLMScalarTypeTag": VLLMDataTypeVLLMScalarTypeTag, + "TorchTypeTag": VLLMDataTypeTorchDataTypeTag, "KernelScheduleTag": VLLMKernelScheduleTag, "EpilogueScheduleTag": EpilogueScheduleTag, "TileSchedulerTag": TileSchedulerTag, "to_cute_constant": to_cute_constant, - "gen_sch_name": generate_terse_schedule_name, + "gen_sch_sig": generate_terse_sch_sig, + "gen_type_sig": generate_type_signature, + "unique_schedules": unique_schedules, + "unsigned_type_with_bitwidth": unsigned_type_with_bitwidth, + "gen_type_option_name": generate_type_option_name } @@ -284,42 +389,82 @@ def create_template(template_str): prepack_dispatch_template = create_template(PREPACK_TEMPLATE) -def create_sources(impl_config: ImplConfig, num_impl_files=1): +def create_sources(impl_configs: List[ImplConfig], num_impl_files=8): sources = [] - type_name = generate_type_signature(impl_config.type_config) - terse_type_name = generate_terse_type_signature(impl_config.type_config) - sources.append(( - f"machete_mm_{terse_type_name}", - mm_dispatch_template.render(type_name=type_name, - type_config=impl_config.type_config, - schedules=impl_config.schedule_configs, - heuristic=impl_config.heuristic), + "machete_mm_dispatch", + mm_dispatch_template.render(impl_configs=impl_configs), )) + prepack_types = [] + for impl_config in impl_configs: + convert_type = impl_config.types.a \ + if impl_config.types.b_group_scale == DataType.void \ + else impl_config.types.b_group_scale + prepack_types.append( + PrepackTypeConfig( + a=impl_config.types.a, + b_num_bits=VLLMDataTypeSize[impl_config.types.b], + convert=convert_type, + accumulator=impl_config.types.accumulator, + )) + + def prepacked_type_key(prepack_type: PrepackTypeConfig): + # For now we we can just use the first accumulator type seen since + # the tensor core shapes/layouts don't vary based on accumulator + # type so we can generate less code this way + return (prepack_type.a, prepack_type.b_num_bits, prepack_type.convert) + + unique_prepack_types = [] + prepack_types_seen = set() + for prepack_type in prepack_types: + key = prepacked_type_key(prepack_type) + if key not in prepack_types_seen: + unique_prepack_types.append(prepack_type) + prepack_types_seen.add(key) + sources.append(( - f"machete_prepack_{terse_type_name}", - prepack_dispatch_template.render( - type_name=type_name, - type_config=impl_config.type_config, - ), + "machete_prepack", + prepack_dispatch_template.render(types=unique_prepack_types, ), )) - num_schedules = len(impl_config.schedule_configs) - schedules_per_file = math.ceil(num_schedules / num_impl_files) - for part, i in enumerate(range(0, num_schedules, schedules_per_file)): - file_schedules = impl_config.schedule_configs[i:i + schedules_per_file] + # Split up impls across files + num_impls = reduce(lambda x, y: x + len(y.schedules), impl_configs, 0) + num_impls_per_file = math.ceil(num_impls / num_impl_files) + + files_impls: List[List[ImplConfig]] = [[]] + + curr_num_impls_assigned = 0 + curr_impl_in_file = 0 + curr_impl_configs = deepcopy(list(reversed(impl_configs))) + + while curr_num_impls_assigned < num_impls: + room_left_in_file = num_impls_per_file - curr_impl_in_file + if room_left_in_file == 0: + files_impls.append([]) + room_left_in_file = num_impls_per_file + curr_impl_in_file = 0 + + curr_ic = curr_impl_configs[-1] + if len(curr_ic.schedules) >= room_left_in_file: + # Break apart the current impl config + tmp_ic = deepcopy(curr_ic) + tmp_ic.schedules = curr_ic.schedules[:room_left_in_file] + curr_ic.schedules = curr_ic.schedules[room_left_in_file:] + files_impls[-1].append(tmp_ic) + else: + files_impls[-1].append(curr_ic) + curr_impl_configs.pop() + curr_num_impls_assigned += len(files_impls[-1][-1].schedules) + curr_impl_in_file += len(files_impls[-1][-1].schedules) + for part, file_impls in enumerate(files_impls): sources.append(( - f"machete_mm_{terse_type_name}_impl_part{part}", - mm_impl_template.render( - type_name=type_name, - type_config=impl_config.type_config, - schedules=file_schedules, - specializations=impl_config.specializations, - ), + f"machete_mm_impl_part{part+1}", + mm_impl_template.render(impl_configs=file_impls), )) + return sources @@ -328,187 +473,169 @@ def generate(): # about how this works SCRIPT_DIR = os.path.dirname(__file__) - schedule_common_params = dict( + sch_common_params = dict( kernel_schedule=TmaMI, epilogue_schedule=TmaCoop, tile_scheduler=TileSchedulerType.StreamK, ) - # For now we use the same heuristic for all types - # Heuristic is currently tuned for H100s - default_heuristic = [ + # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + default_tile_heuristic_config = { #### M = 257+ - ( - "M > 256 && K <= 16384 && N <= 4096", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 256", - ScheduleConfig( - tile_shape_mn=(128, 256), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + "M > 256": ((128, 256), (2, 1, 1)), #### M = 129-256 - ( - "M > 128 && K <= 4096 && N <= 4096", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 128 && K <= 8192 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 128", - ScheduleConfig( - tile_shape_mn=(128, 256), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + "M > 128": ((128, 256), (2, 1, 1)), #### M = 65-128 - ( - "M > 64 && K <= 4069 && N <= 4069", - ScheduleConfig( - tile_shape_mn=(128, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64 && K <= 4069 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64 && K >= 8192 && N >= 12288", - ScheduleConfig( - tile_shape_mn=(256, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 64", - ScheduleConfig( - tile_shape_mn=(128, 128), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + "M > 64": ((128, 128), (2, 1, 1)), #### M = 33-64 - ( - "M > 32 && K <= 6144 && N <= 6144", - ScheduleConfig( - tile_shape_mn=(128, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 32 && K >= 16384 && N >= 12288", - ScheduleConfig( - tile_shape_mn=(256, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 32", - ScheduleConfig( - tile_shape_mn=(128, 64), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + "M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + "M > 32": ((128, 64), (2, 1, 1)), #### M = 17-32 - ( - "M > 16 && K <= 12288 && N <= 8192", - ScheduleConfig( - tile_shape_mn=(128, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), - ( - "M > 16", - ScheduleConfig( - tile_shape_mn=(256, 32), - cluster_shape_mnk=(2, 1, 1), - **schedule_common_params # type: ignore - )), + "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + "M > 16": ((256, 32), (2, 1, 1)), #### M = 1-16 - ( - "N >= 26624", - ScheduleConfig( - tile_shape_mn=(256, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), - ( - None, - ScheduleConfig( - tile_shape_mn=(128, 16), - cluster_shape_mnk=(1, 1, 1), - **schedule_common_params # type: ignore - )), + "N >= 26624": ((256, 16), (1, 1, 1)), + None: ((128, 16), (1, 1, 1)), + } + + # For now we use the same heuristic for all types + # Heuristic is currently tuned for H100s + default_heuristic = [ + (cond, ScheduleConfig(*tile_config, + **sch_common_params)) # type: ignore + for cond, tile_config in default_tile_heuristic_config.items() ] - # Do not use schedules = list(set(...)) because we need to make sure - # the output list is deterministic; otherwise the generated kernel file - # will be non-deterministic and causes ccache miss. - schedules = [] - for _, schedule_config in default_heuristic: - if schedule_config not in schedules: - schedules.append(schedule_config) + def get_unique_schedules(heuristic: Dict[str, ScheduleConfig]): + # Do not use schedules = list(set(...)) because we need to make sure + # the output list is deterministic; otherwise the generated kernel file + # will be non-deterministic and causes ccache miss. + schedules = [] + for _, schedule_config in heuristic: + if schedule_config not in schedules: + schedules.append(schedule_config) + return schedules impl_configs = [] GPTQ_kernel_type_configs = list( - (TypeConfig( - element_a=element_a, - element_b=element_b, - element_b_scale=element_a, - element_b_zeropoint=element_a, - element_d=element_a, + TypeConfig( + a=a, + b=b, + b_group_scale=a, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.void, + a_token_scale=DataType.void, + out=a, accumulator=DataType.f32, - ) for element_b in (VLLMDataType.u4b8, VLLMDataType.u8b128) - for element_a in (DataType.f16, DataType.bf16))) - - GPTQ_kernel_specializations = [ - Specialization(with_C=False, with_zeropoints=False, with_scales=True) - ] + ) for b in (VLLMDataType.u4b8, VLLMDataType.u8b128) + for a in (DataType.f16, DataType.bf16)) impl_configs += [ - ImplConfig(x[0], x[1], x[2], x[3]) - for x in zip(GPTQ_kernel_type_configs, itertools.repeat(schedules), - itertools.repeat(GPTQ_kernel_specializations), + ImplConfig(x[0], x[1], x[2]) + for x in zip(GPTQ_kernel_type_configs, + itertools.repeat(get_unique_schedules(default_heuristic)), itertools.repeat(default_heuristic)) ] AWQ_kernel_type_configs = list( - (TypeConfig( - element_a=element_a, - element_b=element_b, - element_b_scale=element_a, - element_b_zeropoint=element_a, - element_d=element_a, + TypeConfig( + a=a, + b=b, + b_group_scale=a, + b_group_zeropoint=a, + b_channel_scale=DataType.void, + a_token_scale=DataType.void, + out=a, accumulator=DataType.f32, - ) for element_b in (DataType.u4, DataType.u8) - for element_a in (DataType.f16, DataType.bf16))) + ) for b in (DataType.u4, DataType.u8) + for a in (DataType.f16, DataType.bf16)) + + impl_configs += [ + ImplConfig(x[0], x[1], x[2]) + for x in zip(AWQ_kernel_type_configs, + itertools.repeat(get_unique_schedules(default_heuristic)), + itertools.repeat(default_heuristic)) + ] - AWQ_kernel_specializations = [ - Specialization(with_C=False, with_zeropoints=True, with_scales=True) + # Stored as "condition": ((tile_shape_mn), (cluster_shape_mnk)) + # TODO (LucasWilkinson): Further tuning required + qqq_tile_heuristic_config = { + #### M = 257+ + # ((128, 256), (2, 1, 1)) Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + # "M > 256 && K <= 16384 && N <= 4096": ((128, 128), (2, 1, 1)), + # "M > 256": ((128, 256), (2, 1, 1)), + "M > 256": ((128, 128), (2, 1, 1)), + #### M = 129-256 + "M > 128 && K <= 4096 && N <= 4096": ((128, 64), (2, 1, 1)), + "M > 128 && K <= 8192 && N <= 8192": ((128, 128), (2, 1, 1)), + # ((128, 256), (2, 1, 1)) Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + # "M > 128": ((128, 256), (2, 1, 1)), + "M > 128": ((128, 128), (2, 1, 1)), + #### M = 65-128 + "M > 64 && K <= 4069 && N <= 4069": ((128, 32), (2, 1, 1)), + "M > 64 && K <= 4069 && N <= 8192": ((128, 64), (2, 1, 1)), + "M > 64 && K >= 8192 && N >= 12288": ((256, 128), (2, 1, 1)), + "M > 64": ((128, 128), (2, 1, 1)), + #### M = 33-64 + "M > 32 && K <= 6144 && N <= 6144": ((128, 16), (1, 1, 1)), + # Broken for QQQ types + # TODO (LucasWilkinson): Investigate further + #"M > 32 && K >= 16384 && N >= 12288": ((256, 64), (2, 1, 1)), + "M > 32": ((128, 64), (2, 1, 1)), + #### M = 17-32 + "M > 16 && K <= 12288 && N <= 8192": ((128, 32), (2, 1, 1)), + "M > 16": ((256, 32), (2, 1, 1)), + #### M = 1-16 + "N >= 26624": ((256, 16), (1, 1, 1)), + None: ((128, 16), (1, 1, 1)), + } + + # For now we use the same heuristic for all types + # Heuristic is currently tuned for H100s + qqq_heuristic = [ + (cond, ScheduleConfig(*tile_config, + **sch_common_params)) # type: ignore + for cond, tile_config in qqq_tile_heuristic_config.items() + ] + + QQQ_kernel_types = [ + *(TypeConfig( + a=DataType.s8, + b=VLLMDataType.u4b8, + b_group_scale=b_group_scale, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.f32, + a_token_scale=DataType.f32, + out=DataType.f16, + accumulator=DataType.s32, + ) for b_group_scale in (DataType.f16, DataType.void)), + *(TypeConfig( + a=DataType.e4m3, + b=VLLMDataType.u4b8, + b_group_scale=b_group_scale, + b_group_zeropoint=DataType.void, + b_channel_scale=DataType.f32, + a_token_scale=DataType.f32, + out=DataType.f16, + accumulator=DataType.f32, + ) for b_group_scale in (DataType.f16, DataType.void)), ] impl_configs += [ - ImplConfig(x[0], x[1], x[2], x[3]) - for x in zip(AWQ_kernel_type_configs, itertools.repeat(schedules), - itertools.repeat(AWQ_kernel_specializations), - itertools.repeat(default_heuristic)) + ImplConfig(x[0], x[1], x[2]) + for x in zip(QQQ_kernel_types, + itertools.repeat(get_unique_schedules(qqq_heuristic)), + itertools.repeat(qqq_heuristic)) ] output_dir = os.path.join(SCRIPT_DIR, "generated") @@ -521,12 +648,11 @@ def generate(): os.makedirs(output_dir) # Render each group of configurations into separate files - for impl_config in impl_configs: - for filename, code in create_sources(impl_config): - filepath = os.path.join(output_dir, f"{filename}.cu") - with open(filepath, "w") as output_file: - output_file.write(code) - print(f"Rendered template to {filepath}") + for filename, code in create_sources(impl_configs): + filepath = os.path.join(output_dir, f"{filename}.cu") + with open(filepath, "w") as output_file: + output_file.write(code) + print(f"Rendered template to {filepath}") if __name__ == "__main__": diff --git a/vllm/csrc/quantization/machete/machete_mainloop.cuh b/vllm/csrc/quantization/machete/machete_mainloop.cuh index e8e7b14de..816f33a10 100644 --- a/vllm/csrc/quantization/machete/machete_mainloop.cuh +++ b/vllm/csrc/quantization/machete/machete_mainloop.cuh @@ -171,6 +171,10 @@ struct MacheteCollectiveMma { make_shape(size<0>(TileShape_MNK{}), size<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutACopy = decltype(GmemLayoutA::TVbNbKL_to_offset_copy( + make_shape(size<0>(TileShape_MNK{}), size<2>(TileShape_MNK{}), + Int{}))); + using SmemLayoutAtomARowMajor = decltype(rs_smem_selector(TileShape_MNK{})), @@ -288,14 +292,7 @@ struct MacheteCollectiveMma { static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must evenly divide tile k shape."); - // Tile along modes in a way that maximizes the TMA box size. - using SmemLayoutACopy = decltype(tile_to_shape( - SmemLayoutAtomARowMajor{}, - make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), - Int{}), - conditional_t<::cutlass::gemm::detail::is_major<0, StrideA>(), - Step<_2, _1, _3>, Step<_1, _2, _3>>{})); - + // Tile along modes in a way that maximizes the TMA box size using SmemLayoutB = decltype(tile_to_shape( SmemLayoutAtomB{}, make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), @@ -428,12 +425,12 @@ struct MacheteCollectiveMma { // clang-format on // ((athrid, val), (BlocksM, BlockK), L) -> (storage_idx) - using PrepackedStrideA = decltype(stride(GmemLayoutA::TVbNbKL_to_offset( + using PrepackedStrideA = decltype(stride(GmemLayoutA::TVbNbKL_to_offset_copy( make_shape(int32_t(0), int32_t(0), int32_t(0))))); using ATensor = decltype(make_tensor( get_logical_ptr(static_cast(nullptr)), - shape(GmemLayoutA::TVbNbKL_to_offset( + shape(GmemLayoutA::TVbNbKL_to_offset_copy( make_shape(int32_t(0), int32_t(0), int32_t(0)))), PrepackedStrideA{})); @@ -450,8 +447,8 @@ struct MacheteCollectiveMma { static constexpr auto make_tma_copy_A(ATensor tensor_a = ATensor{}) { return make_tma_copy( - GmemTiledCopyA{}, tensor_a, SmemLayoutA{}(_, _, cute::Int<0>{}), - shape(SmemLayoutA{}(_, _, cute::Int<0>{})), + GmemTiledCopyA{}, tensor_a, SmemLayoutACopy{}(_, _, cute::Int<0>{}), + shape(SmemLayoutACopy{}(_, _, cute::Int<0>{})), size<1>(ClusterShape{})); // mcast along N mode for this M load, if any } @@ -584,7 +581,7 @@ struct MacheteCollectiveMma { typename Params::TMA_Scale tma_load_scale; typename Params::TMA_Zero tma_load_zero; - auto layout = GmemLayoutA::TVbNbKL_to_offset(make_shape(M, K, L)); + auto layout = GmemLayoutA::TVbNbKL_to_offset_copy(make_shape(M, K, L)); tma_load_a = make_tma_copy_A( make_logical_tensor(ptr_A, shape(layout), stride(layout))); @@ -722,7 +719,7 @@ struct MacheteCollectiveMma { // (TILE_V,TILE_B,m,k,l) auto make_gA_mkl = [&]() { // ((athrid, val), (BlocksM, BlockK), L) -> (storage_idx) - auto layout = GmemLayoutA::TVbNbKL_to_offset(make_shape(M, K, L)); + auto layout = GmemLayoutA::TVbNbKL_to_offset_copy(make_shape(M, K, L)); Tensor mA_mkl = mainloop_params.tma_load_a.get_tma_tensor(shape(layout)); return local_tile(mA_mkl, make_shape(size<0>(layout), PPBlocksPerTile_MK{}), diff --git a/vllm/csrc/quantization/machete/machete_mm_kernel.cuh b/vllm/csrc/quantization/machete/machete_mm_kernel.cuh index 4d41b8d29..d4d19ae5d 100644 --- a/vllm/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/vllm/csrc/quantization/machete/machete_mm_kernel.cuh @@ -21,6 +21,8 @@ #include "cutlass_extensions/cute_utils.cuh" #include "cutlass_extensions/vllm_numeric_conversion.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "cutlass_extensions/torch_utils.hpp" #include "machete_collective_builder.cuh" #include "machete_prepacked_layout.cuh" #include "machete_interleaving_utils.cuh" @@ -37,27 +39,42 @@ using namespace cute; // W is quantized, in this situation or right-hand operand is quantized so // we compute the transpose to move it to the left-hand side. template + typename AccumulatorT, typename GroupScaleT, typename GroupZeroT, + typename ChannelScaleT, typename TokenScaleT, class KernelSchedule, + typename ScheduleConfig> struct MacheteKernelTemplate { + static constexpr bool with_C = false; // not ever used + static constexpr bool with_group_scales = !std::is_same_v; + static constexpr bool with_group_zeropoints = + !std::is_same_v; + static constexpr bool with_channel_scales = + !std::is_same_v; + static constexpr bool with_token_scales = !std::is_same_v; + using MmaType = ElementA_; using ElementA = ElementA_; using ElementB = ElementB_; using ElementD = ElementD_; using ElementC = cute::conditional_t; - using ElementZ = ZeroT; - using ElementS = ScaleT; - - using ElementAccumulator = - AccumulatorT; // Element type for internal accumulation + using ElementAccumulator = AccumulatorT; using ElementCompute = AccumulatorT; // For Epilogue + // Use dummy values when we don't have scales or zeropoints + using ElementZGroup = + cute::conditional_t; + using ElementSGroup = + cute::conditional_t; + using ElementConvertGroup = + cute::conditional_t; + using ElementSChannel = + cute::conditional_t; + using ElementSToken = + cute::conditional_t; using BTypeTuple = cute::conditional_t< - with_scales, - cute::conditional_t, - cute::tuple>, + with_group_scales, + cute::conditional_t, + cute::tuple>, ElementB>; using LayoutA = cutlass::layout::RowMajor; @@ -71,8 +88,8 @@ struct MacheteKernelTemplate { using StrideA = cutlass::detail::TagToStrideA_t; using StrideC = cutlass::detail::TagToStrideA_t; using StrideD = cutlass::detail::TagToStrideA_t; - using StrideS = cutlass::detail::TagToStrideA_t; - using StrideZ = StrideS; + using StrideSGroup = cutlass::detail::TagToStrideA_t; + using StrideZGroup = StrideSGroup; using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; @@ -85,8 +102,8 @@ struct MacheteKernelTemplate { using OperatorClass = cutlass::arch::OpClassTensorOp; using PrepackedLayoutB = - PrepackedLayoutBTemplate; + PrepackedLayoutBTemplate; static int constexpr TileShapeK = 128 * 8 / cutlass::sizeof_bits::value; @@ -103,12 +120,42 @@ struct MacheteKernelTemplate { using EpilogueTileType = typename ScheduleConfig::EpilogueTileType; using TileScheduler = typename ScheduleConfig::TileScheduler; + static_assert( + (!with_channel_scales && !with_token_scales) || + ((with_channel_scales && with_token_scales) && + std::is_same_v), + "Currently token and channel scales (if present) must be the same type"); + + using EpilogueDescriptor = + cutlass::epilogue::collective::detail::EpilogueDescriptor< + TileShape, cutlass::epilogue::collective::EpilogueTileAuto, ElementD, + ElementD, EpilogueSchedule>; + + // Currently only supports float scales + using ChTokScalesEpilogue = + typename vllm::c3x::ScaledEpilogue; + static_assert((with_channel_scales || with_token_scales) || + (std::is_same_v && + std::is_same_v), + "Currently token and channel scales (if present) must be float " + "(and if one is present the other must be too)"); + + using StoreEpilogueCompute = typename cutlass::epilogue::fusion::Sm90EVT< + cutlass::epilogue::fusion::Sm90AccFetch>; + + using EVTCompute = + std::conditional_t; + + // EVTCompute using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ArchTag, OperatorClass, TileShape, ClusterShape, EpilogueTileType, - ElementAccumulator, ElementAccumulator, ElementC, LayoutC_Transpose, - AlignmentC, ElementD, LayoutD_Transpose, AlignmentD, - EpilogueSchedule>::CollectiveOp; + ElementAccumulator, ElementSChannel, ElementC, LayoutC_Transpose, + AlignmentC, ElementD, LayoutD_Transpose, AlignmentD, EpilogueSchedule, + EVTCompute>::CollectiveOp; using CollectiveMainloop = typename cutlass::gemm::collective::VLLMCollectiveBuilder< @@ -131,26 +178,44 @@ struct MacheteKernelTemplate { using MainloopArguments = typename GemmKernel::MainloopArguments; using EpilogueArguments = typename GemmKernel::EpilogueArguments; - template static Arguments create_arguments( cudaStream_t stream, - ElementA const* A_ptr, // A is an MxK matrix - Layout const& layout_A, - ElementB const* B_ptr, // B is an KxN prepacked matrix - ElementD* D_ptr, // D is an MxN matrix - Layout const& layout_D, - ElementC const* C_ptr, // C is an MxN matrix - std::optional> const& layout_C, - ElementS const* S_ptr, // S is an scale_KxN matrix - std::optional> const& layout_S, - ElementZ const* Z_ptr, // Z is an scale_KxN matrix - std::optional> const& layout_Z, - ElementCompute alpha, ElementCompute beta, - std::optional maybe_group_size) { - static_assert(!with_zeropoints || with_scales); - - int M = size<0>(layout_A), N = size<1>(layout_D), K = size<1>(layout_A); + torch::Tensor const& A, // MxK matrix + torch::Tensor const& B, // KxN prepacked matrix + torch::Tensor& D, // MxN matrix + c10::optional const& maybe_g_scales, // scale_KxN matrix + c10::optional const& maybe_g_zeros, // scale_KxN matrix + c10::optional maybe_group_size, + c10::optional const& maybe_ch_scales, // len N vector + c10::optional const& maybe_tok_scales) // len M vector + { + static_assert(!with_group_zeropoints || with_group_scales); + + int M = A.size(0), N = B.size(1), K = A.size(1); + TORCH_CHECK(D.size(0) == M && D.size(1) == N); + + auto layout_A = make_cute_layout(A, "A"); + auto layout_D = make_cute_layout(D, "D"); + auto layout_S_group = + maybe_make_cute_layout(maybe_g_scales, "group_scales"); + auto layout_Z_group = + maybe_make_cute_layout(maybe_g_zeros, "group_zeros"); + int64_t numel_S_channel = maybe_ch_scales ? maybe_ch_scales->numel() : 0; + int64_t numel_S_token = maybe_tok_scales ? maybe_tok_scales->numel() : 0; + + auto unwrap = [](auto const& t) { + return t ? t->const_data_ptr() : nullptr; + }; + auto A_ptr = static_cast(A.const_data_ptr()); + auto B_ptr = static_cast(B.const_data_ptr()); + auto D_ptr = static_cast(D.mutable_data_ptr()); + auto S_group_ptr = + static_cast(unwrap(maybe_g_scales)); + auto Z_group_ptr = static_cast(unwrap(maybe_g_zeros)); + auto S_channel_ptr = + static_cast(unwrap(maybe_ch_scales)); + auto S_token_ptr = + static_cast(unwrap(maybe_tok_scales)); int const group_size = maybe_group_size == -1 ? K : maybe_group_size.value_or(K); @@ -159,26 +224,28 @@ struct MacheteKernelTemplate { TORCH_CHECK(size<0>(layout_A) == M && size<1>(layout_A) == K); TORCH_CHECK(size<0>(layout_D) == M && size<1>(layout_D) == N); - if constexpr (with_C) { - TORCH_CHECK(C_ptr && layout_C); + if constexpr (with_group_scales) { + TORCH_CHECK(S_group_ptr && layout_S_group); + TORCH_CHECK((size<0>(*layout_S_group) == scale_k && + size<1>(*layout_S_group) == N)); } else { - TORCH_CHECK(!C_ptr, "C not supported"); + TORCH_CHECK(!S_group_ptr, "Scales not supported"); } - if constexpr (with_scales) { - TORCH_CHECK(S_ptr && layout_S); - TORCH_CHECK((size<0>(*layout_S) == scale_k && size<1>(*layout_S) == N)); + if constexpr (with_group_zeropoints) { + TORCH_CHECK(Z_group_ptr && layout_Z_group); + TORCH_CHECK((size<0>(*layout_Z_group) == scale_k && + size<1>(*layout_Z_group) == N)); + TORCH_CHECK(layout_S_group && *layout_Z_group == *layout_S_group, + "Scales and zeros must have the same layout"); } else { - TORCH_CHECK(!S_ptr, "Scales not supported"); + TORCH_CHECK(!Z_group_ptr, "Zeropoints not supported"); } - if constexpr (with_zeropoints) { - TORCH_CHECK(Z_ptr && layout_Z); - TORCH_CHECK((size<0>(*layout_Z) == scale_k && size<1>(*layout_Z) == N)); - TORCH_CHECK(layout_S && *layout_Z == *layout_S, - "Scales and zeros must have the same layout"); - } else { - TORCH_CHECK(!Z_ptr, "Zeropoints not supported"); + if constexpr (with_channel_scales || with_token_scales) { + TORCH_CHECK( + (maybe_ch_scales->numel() == N || maybe_ch_scales->numel() == 1) && + (maybe_tok_scales->numel() == M || maybe_tok_scales->numel() == 1)); } // Transpose A and D @@ -186,24 +253,33 @@ struct MacheteKernelTemplate { // for B (which is At) auto stride_At = layout_A.stride(); auto stride_Dt = permute_layout<1, 0, 2>(layout_D).stride(); - auto stride_Ct = stride_Dt; - if (layout_C) { - stride_Ct = permute_layout<1, 0, 2>(*layout_C).stride(); - } MainloopArguments mainloop_arguments{}; - EpilogueArguments epilogue_arguments{ - {alpha, beta}, C_ptr, stride_Ct, D_ptr, stride_Dt}; + // {Accum, C, C_layout, D, D} + EpilogueArguments epilogue_arguments{}; + + if constexpr (with_channel_scales || with_token_scales) { + epilogue_arguments = + EpilogueArguments{ChTokScalesEpilogue::prepare_args( + *maybe_ch_scales, *maybe_tok_scales), + nullptr, + {}, + D_ptr, + stride_Dt}; + } else { + epilogue_arguments = EpilogueArguments{{}, nullptr, {}, D_ptr, stride_Dt}; + } - if constexpr (with_scales && with_zeropoints) { - auto stride_S = permute_layout<1, 0, 2>(*layout_S).stride(); - mainloop_arguments = - MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At, - S_ptr, stride_S, group_size, Z_ptr}; - } else if constexpr (with_scales) { - auto stride_S = permute_layout<1, 0, 2>(*layout_S).stride(); + if constexpr (with_group_scales && with_group_zeropoints) { + auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride(); mainloop_arguments = MainloopArguments{ - B_ptr, _StrideB{}, A_ptr, stride_At, S_ptr, stride_S, group_size}; + B_ptr, _StrideB{}, A_ptr, stride_At, + S_group_ptr, stride_S_group, group_size, Z_group_ptr}; + } else if constexpr (with_group_scales) { + auto stride_S_group = permute_layout<1, 0, 2>(*layout_S_group).stride(); + mainloop_arguments = + MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At, + S_group_ptr, stride_S_group, group_size}; } else { mainloop_arguments = MainloopArguments{B_ptr, _StrideB{}, A_ptr, stride_At}; diff --git a/vllm/csrc/quantization/machete/machete_mm_launcher.cuh b/vllm/csrc/quantization/machete/machete_mm_launcher.cuh index 60a4ed605..4b0da5b30 100644 --- a/vllm/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/vllm/csrc/quantization/machete/machete_mm_launcher.cuh @@ -5,73 +5,61 @@ #include "machete_mm_kernel.cuh" #include "cutlass_extensions/torch_utils.hpp" +#include "core/scalar_type.hpp" namespace machete { -struct PyTorchArguments { +struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; - c10::optional const& scales; - c10::optional const& zeros; - c10::optional group_size; - c10::optional const& C; - c10::optional alpha; - c10::optional beta; - c10::optional schedule; + vllm::ScalarType const& b_type; + c10::optional const& maybe_out_type; + c10::optional const& maybe_group_scales; + c10::optional const& maybe_group_zeros; + c10::optional maybe_group_size; + c10::optional const& maybe_channel_scales; + c10::optional const& maybe_token_scales; + c10::optional maybe_schedule; }; +struct SupportedSchedulesArgs { + at::ScalarType a_type; + vllm::ScalarType b_type; + c10::optional maybe_group_scales_type; + c10::optional maybe_group_zeros_type; + c10::optional maybe_channel_scales_type; + c10::optional maybe_token_scales_type; + c10::optional maybe_out_type; +}; + +torch::Tensor mm_dispatch(MMArgs args); + +std::vector supported_schedules_dispatch( + SupportedSchedulesArgs args); + template -torch::Tensor run_impl(PyTorchArguments args) { +torch::Tensor run_impl(MMArgs args) { const at::cuda::OptionalCUDAGuard device_guard(device_of(args.A)); auto device = args.A.device(); auto stream = at::cuda::getCurrentCUDAStream(device.index()); - using EleA = typename MacheteKernel::ElementA; - using EleB = typename MacheteKernel::ElementB; - using EleC = typename MacheteKernel::ElementC; - using EleD = typename MacheteKernel::ElementD; - using EleScale = typename MacheteKernel::ElementS; - using EleZero = typename MacheteKernel::ElementZ; - - using StrideA = typename MacheteKernel::StrideA; - using StrideC = typename MacheteKernel::StrideC; - using StrideD = typename MacheteKernel::StrideD; - using StrideS = typename MacheteKernel::StrideS; - using StrideZ = typename MacheteKernel::StrideZ; - int M = args.A.size(0); int N = args.B.size(1); int K = args.A.size(1); // Allocate output - torch::Tensor D = - torch::empty({M, N}, torch::TensorOptions() - .dtype(equivalent_scalar_type_v) - .device(device)); - - auto const &A = args.A, &B = args.B; - auto const &C = args.C, &scales = args.scales, &zeros = args.zeros; - - auto layout_A = make_cute_layout(A, "A"); - auto layout_D = make_cute_layout(D, "D"); - auto layout_C = maybe_make_cute_layout(C, "C"); - auto layout_S = maybe_make_cute_layout(scales, "scales"); - auto layout_Z = maybe_make_cute_layout(zeros, "zeros"); - - auto A_ptr = static_cast(A.const_data_ptr()); - auto B_ptr = static_cast(B.const_data_ptr()); - auto D_ptr = static_cast(D.mutable_data_ptr()); - auto C_ptr = static_cast(C ? C->const_data_ptr() : nullptr); - auto S_ptr = - static_cast(scales ? scales->const_data_ptr() : nullptr); - auto Z_ptr = - static_cast(zeros ? zeros->const_data_ptr() : nullptr); + torch::Tensor D = torch::empty( + {M, N}, + torch::TensorOptions() + .dtype(equivalent_scalar_type_v) + .device(device)); auto arguments = MacheteKernel::create_arguments( - stream, A_ptr, layout_A, B_ptr, D_ptr, layout_D, C_ptr, layout_C, S_ptr, - layout_S, Z_ptr, layout_Z, args.alpha.value_or(1), args.beta.value_or(0), - args.group_size); + stream, // + args.A, args.B, D, args.maybe_group_scales, args.maybe_group_zeros, + args.maybe_group_size, args.maybe_channel_scales, + args.maybe_token_scales); TORCH_CHECK(MacheteKernel::can_implement(arguments), "Machete kernel cannot be run with these arguments"); @@ -84,12 +72,4 @@ torch::Tensor run_impl(PyTorchArguments args) { return D; }; -template -struct GemmDispatcher { - static torch::Tensor dispatch(PyTorchArguments args); - static std::vector supported_schedules(); -}; - }; // namespace machete \ No newline at end of file diff --git a/vllm/csrc/quantization/machete/machete_prepack_kernel.cuh b/vllm/csrc/quantization/machete/machete_prepack_kernel.cuh index f23483f92..d002355ca 100644 --- a/vllm/csrc/quantization/machete/machete_prepack_kernel.cuh +++ b/vllm/csrc/quantization/machete/machete_prepack_kernel.cuh @@ -6,31 +6,49 @@ namespace machete { -template -static __global__ void prepack_B_kernel(BInTensor B_in, - BTiledOutTensor B_tiled_out) { - auto tB_in = local_tile(B_in, TileShapeNKL{}, - make_coord(blockIdx.x, blockIdx.y, blockIdx.z)); - auto tB_out = B_tiled_out(make_coord(_, _), - make_coord(blockIdx.x, blockIdx.y), blockIdx.z); +template +static __global__ void prepack_B_kernel(BInTensor B_in, ElementB* B_out_ptr) { + auto constexpr block_size = + Int{}; + auto constexpr eles_per_thread = Int{}; + static_assert(block_size % threads == 0, + "block_size must be divisible by the number of threads"); - auto tiled_copy = make_tiled_copy(Copy_Atom{}, - Layout, Stride<_32, _1>>{}, - Layout>{}); + // Which pre-packed are we responsible for + auto blk_coord = make_coord(blockIdx.x, blockIdx.y, blockIdx.z); + auto tB_in = local_tile( + B_in, append(typename PrepackedLayoutB::PPBlockShape_NK{}, _1{}), + blk_coord); - auto thr_copy = tiled_copy.get_thread_slice(threadIdx.x); + // Find the start offset in the output for this pre-packed block + auto bNbKL_to_offset = PrepackedLayoutB::bNbKL_to_offset(shape(B_in)); - Tensor thr_tile_S = thr_copy.partition_S(tB_in); - Tensor thr_tile_D = thr_copy.partition_D(tB_out); + // Tensor representing a 1:1 mapping to the output space in 1D + auto tB_out_linear = + make_tensor(get_logical_ptr(B_out_ptr) + bNbKL_to_offset(blk_coord), + make_layout(make_shape(block_size))); + // Mapping from output space (1D) to input space + auto tB_in_linear = make_tensor( + tB_in.data(), + tB_in.layout() + .compose(right_inverse(PrepackedLayoutB::ppblock_ilvd_NK_to_offset())) + .with_shape(make_shape(block_size))); + + // Tile for this specific thread (could have used a TiledCopy but these work + // best with 2d layouts, this is a simple 1d layout so local_tile is enough, + // we are also not that concerned with performance for this kernel) + auto thr_tB_in_linear = + local_tile(tB_in_linear, make_shape(eles_per_thread), threadIdx.x); + auto thr_tB_out_linear = + local_tile(tB_out_linear, make_shape(eles_per_thread), threadIdx.x); // Construct a register-backed Tensor with the same shape as each thread's // partition - auto fragment = make_tensor(shape(thr_tile_D)); + auto fragment = make_tensor(shape(thr_tB_in_linear)); - // Copy from GMEM to RMEM and from RMEM to GMEM - copy(tiled_copy, thr_tile_S, fragment); - copy(Copy_Atom{}, fragment, thr_tile_D); + copy(thr_tB_in_linear, fragment); + copy(Copy_Atom{}, fragment, thr_tB_out_linear); } template @@ -44,18 +62,15 @@ static void prepack_B_template( TORCH_CHECK(size<0>(B_layout) % size<0>(TileShapeNKL{}) == 0); TORCH_CHECK(size<1>(B_layout) % size<1>(TileShapeNKL{}) == 0); - TORCH_CHECK(size<2>(B_layout) % size<2>(TileShapeNKL{}) == 0); auto N_tiles = size<0>(B_layout) / size<0>(TileShapeNKL{}); auto K_tiles = size<1>(B_layout) / size<1>(TileShapeNKL{}); - auto L_tiles = size<2>(B_layout) / size<2>(TileShapeNKL{}); + auto L_tiles = size<2>(B_layout); auto B_in = make_tensor(get_logical_ptr(B_in_ptr), B_layout); - auto B_tiled_out = - make_tensor(get_logical_ptr(B_out_ptr), ilvd_NKbNbKL_to_offset); - prepack_B_kernel - <<>>(B_in, B_tiled_out); + prepack_B_kernel<128, PrepackedLayoutB> + <<>>(B_in, B_out_ptr); } }; // namespace machete \ No newline at end of file diff --git a/vllm/csrc/quantization/machete/machete_prepack_launcher.cuh b/vllm/csrc/quantization/machete/machete_prepack_launcher.cuh index a33d8f948..3486d28be 100644 --- a/vllm/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/vllm/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -2,9 +2,17 @@ #include "machete_prepack_kernel.cuh" #include "cutlass_extensions/torch_utils.hpp" +#include "core/scalar_type.hpp" namespace machete { +struct PrepackBArgs { + torch::Tensor const& B; + at::ScalarType a_type; + vllm::ScalarType b_type; + c10::optional maybe_group_scales_type; +}; + template torch::Tensor prepack_impl(torch::Tensor const B) { const at::cuda::OptionalCUDAGuard device_guard(device_of(B)); @@ -61,11 +69,6 @@ torch::Tensor prepack_impl(torch::Tensor const B) { return D; }; -template -struct PrepackBDispatcher { - static torch::Tensor dispatch(torch::Tensor B); -}; +torch::Tensor prepack_B_dispatch(PrepackBArgs args); }; // namespace machete \ No newline at end of file diff --git a/vllm/csrc/quantization/machete/machete_prepacked_layout.cuh b/vllm/csrc/quantization/machete/machete_prepacked_layout.cuh index 78e2cc5ee..680a858a8 100644 --- a/vllm/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/vllm/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -41,7 +41,7 @@ struct IlvBlkLayoutAuto {}; // The contract here is that the `TiledMma` determined below matches the one // ultimately used in the kernel. (this is also why the other element types are // required along with the kernel schedule) -template // clang-format on @@ -49,20 +49,27 @@ struct PrepackedLayoutBTemplate { using MmaType = ElementA_; using ElementA = ElementA_; using ElementB = ElementB_; - using ElementD = ElementD_; - using ElementAccumulator = - AccumulatorT; // Element type for internal accumulation + using ElementAccumulator = AccumulatorT; using ElementMma = MmaType; - // Only use interleaved layouts for subbyte weights, prmt instructions makes - // non-interleaved layouts for 8bit+ weights efficient enough we don't need - // iterleaved layouts + // Interleave for 4bit bit types when we are not upconverting to fp8 or int8, + // in those cases case we use a LUT using prmt instructions to upconvert and + // is more efficient if the data is not interleaved For 8bit+ prmt + // instructions makes non-interleaved layouts efficient enough we don't need + // iterleaved layouts (and can reuse more of the existing cutlass converts) + static constexpr bool should_interleave = + sizeof_bits_v <= 4 && + !std::is_same_v && + !std::is_same_v; + + // Only use interleaved layouts for subbyte weights, using IlvdBlkLayout = std::conditional_t< std::is_same_v, - std::conditional_t <= 4, - decltype(get_interleaved_blk_layout< - ElementB, sizeof_bits_v, 32>()), - void>, + std::conditional_t< + should_interleave, + decltype(get_interleaved_blk_layout< + ElementB, sizeof_bits_v, 32>()), + void>, IlvBlkLayout_>; // TODO (LucasWilkinson): compare the performance for other sizes @@ -135,7 +142,8 @@ struct PrepackedLayoutBTemplate { // then ((IlvBlk), FrgB) is {A, C, B, D, C, G, D, H} auto frgV = get<1, 0>(layout_no_interleave); auto ilvdBlk = IlvdBlkLayout{}; - static_assert(size(frgV) % 4 == 0, "FrgV must be divisible by 4"); + static_assert(size(frgV) % size(ilvdBlk) == 0, + "FrgV must be divisible by size(ilvdBlk)"); auto ilvd_FrgV = make_layout( make_shape(shape(ilvdBlk), Int{}), make_stride(stride(ilvdBlk), size(ilvdBlk))); @@ -175,6 +183,15 @@ struct PrepackedLayoutBTemplate { return group<1, 3>(result(_, repeat(result)>(_))); } + // ((athrid_val), (BlocksN, BlocksK, L)) -> (N, K, L) + template + CUTE_HOST_DEVICE static constexpr auto TVbNbKL_to_offset_copy( + Shape_NKL shape_mkl) { + auto layout = TVbNbKL_to_offset(shape_mkl); + return make_layout(coalesce(get<0>(layout)), get<1>(layout), + get<2>(layout)); + } + // ((BlockN, BlockK), (BlocksN, BlocksK), L) -> (storage_idx) template CUTE_HOST_DEVICE static constexpr auto ilvd_NKbNbKL_to_offset( @@ -197,6 +214,19 @@ struct PrepackedLayoutBTemplate { return group<1, 3>(result(_, repeat(result)>(_))); } + // (BlocksN, BlocksK, L) -> (storage_idx) + template + CUTE_HOST_DEVICE static constexpr auto bNbKL_to_offset(Shape_NKL shape_mkl) { + // (BlocksN, BlocksK, L) + auto blocks_shape = + cute::transform(shape_mkl, append(PPBlockShape_NK{}, _1{}), + [](auto x, auto y) { return x / y; }); + auto stride = size(PPBlockShape_NK{}); + + // (BlocksN, BlocksK, L) -> (storage_idx) + return make_layout(blocks_shape, compact_col_major(blocks_shape, stride)); + } + // ((athrid, val), (BlocksN, BlocksK, L)) -> (N, K, L) template CUTE_HOST_DEVICE static auto TVbNbK_to_NKL(Shape_NKL shape_mkl) { diff --git a/vllm/csrc/quantization/machete/machete_pytorch.cu b/vllm/csrc/quantization/machete/machete_pytorch.cu index 9f9073ded..da2c2fb0d 100644 --- a/vllm/csrc/quantization/machete/machete_pytorch.cu +++ b/vllm/csrc/quantization/machete/machete_pytorch.cu @@ -8,89 +8,61 @@ namespace machete { using namespace vllm; -// -// Utils (type dispatching) -// - -template -static auto scalar_type_dispatch(ScalarType const& type, Fn fn) { - if (type == vllm::kU4) { - return fn(cutlass::uint4b_t{}); - } else if (type == vllm::kU8) { - return fn(cutlass::uint8_t{}); - } else if (type == vllm::kU4B8) { - return fn(cutlass::vllm_uint4b8_t{}); - } else if (type == vllm::kU8B128) { - return fn(cutlass::vllm_uint8b128_t{}); - } else { - TORCH_CHECK(false, "Unsupported type ", type.str()); - } -} - -#define AT_DISPATCH_CASE_SUPPORTED_COMPUTE_TYPES(...) \ - AT_DISPATCH_CASE_REDUCED_FLOATING_TYPES(__VA_ARGS__) - -#define AT_DISPATCH_SUPPORTED_COMPUTE_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, \ - AT_DISPATCH_CASE_SUPPORTED_COMPUTE_TYPES(__VA_ARGS__)) - -// -// Interface -// - -std::vector supported_schedules(ScalarTypeId const btype_id) { -#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 - vllm::ScalarType b_type = ScalarType::from_id(btype_id); - return scalar_type_dispatch(b_type, [&](auto BType) { - return GemmDispatcher::supported_schedules(); +std::vector supported_schedules( + at::ScalarType a_type, int64_t b_type_id, + c10::optional maybe_group_scales_type, + c10::optional maybe_group_zeros_type, + c10::optional maybe_channel_scales_type, + c10::optional maybe_token_scales_type, + c10::optional maybe_out_type) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return supported_schedules_dispatch({ + .a_type = a_type, + .b_type = b_type, + .maybe_group_scales_type = maybe_group_scales_type, + .maybe_group_zeros_type = maybe_group_zeros_type, + .maybe_channel_scales_type = maybe_channel_scales_type, + .maybe_token_scales_type = maybe_token_scales_type, + .maybe_out_type = maybe_out_type, }); -#else - TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); -#endif } -torch::Tensor gemm(torch::Tensor const& A, torch::Tensor const& B, - ScalarTypeId const btype_id, - c10::optional const& scales, - c10::optional const& zeros, - c10::optional group_size, - c10::optional const& C, - c10::optional alpha, c10::optional beta, - c10::optional schedule) { -#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 12 - ScalarType const btype = ScalarType::from_id(btype_id); - auto args = PyTorchArguments{.A = A, - .B = B, - .scales = scales, - .zeros = zeros, - .group_size = group_size, - .C = C, - .alpha = alpha, - .beta = beta, - .schedule = schedule}; - - return scalar_type_dispatch(btype, [&](auto BType) { - return AT_DISPATCH_SUPPORTED_COMPUTE_TYPES( - A.scalar_type(), "machete_gemm", [&] { - using ComputeType = equivalent_cutlass_type_t; - return GemmDispatcher::dispatch(args); - }); - }); -#else - TORCH_CHECK(false, "Machete requires CUDA 12.0 or later"); -#endif +torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, + int64_t b_type_id, + c10::optional const& maybe_out_type, + c10::optional const& maybe_group_scales, + c10::optional const& maybe_group_zeros, + c10::optional maybe_group_size, + c10::optional const& maybe_channel_scales, + c10::optional const& maybe_token_scales, + c10::optional maybe_schedule) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return mm_dispatch({.A = A, + .B = B, + .b_type = b_type, + .maybe_out_type = maybe_out_type, + .maybe_group_scales = maybe_group_scales, + .maybe_group_zeros = maybe_group_zeros, + .maybe_group_size = maybe_group_size, + .maybe_channel_scales = maybe_channel_scales, + .maybe_token_scales = maybe_token_scales, + .maybe_schedule = maybe_schedule}); } -torch::Tensor prepack_B(torch::Tensor const& B, ScalarTypeId const btype_id) { - ScalarType const btype = ScalarType::from_id(btype_id); - return scalar_type_dispatch(btype, [&](auto BType) { - return PrepackBDispatcher::dispatch(B); - }); +torch::Tensor prepack_B( + torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, + c10::optional const& maybe_group_scales_type) { + ScalarType const b_type = ScalarType::from_id(b_type_id); + return prepack_B_dispatch( + {.B = B, + .a_type = a_type, + .b_type = b_type, + .maybe_group_scales_type = maybe_group_scales_type}); } TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { m.impl("machete_prepack_B", &prepack_B); - m.impl("machete_gemm", &gemm); + m.impl("machete_mm", &mm); } // use CatchAll since supported_schedules has no tensor arguments diff --git a/vllm/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/vllm/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu index a33e2660d..178373513 100644 --- a/vllm/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu +++ b/vllm/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu @@ -296,13 +296,9 @@ __global__ void Marlin_24( // We use a different scale layout for grouped and column-wise quantization as // we scale a `half2` tile in column-major layout in the former and in // row-major in the latter case. - if (group_blocks != -1) { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } else { - s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + - (threadIdx.x % 32) / 4; - } + s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + + (threadIdx.x % 32) / 4; // Note that in the original Marlin kernel + // this is (threadIdx.x % 32) / 4 // Precompute which thread should not read memory in which iterations; this is // needed if there are more threads than required for a certain tilesize or @@ -910,13 +906,16 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, // than better compute utilization thread_k = 128; thread_m = 128; - } else if (prob_n <= 256) { + } else { thread_k = 64; thread_m = 256; - } else { - thread_k = 32; - thread_m = 512; } + // Also had + // if prob_n > 256 + // thread_k = 32; + // thread_m = 512; + // but this is broken, + // TODO(Lucas, Alex M): figure out why } int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction @@ -1079,6 +1078,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, // Verify A device and strides TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); + TORCH_CHECK(a.dtype() == torch::kFloat16, + "A is not float16, currently only float16 is supported"); // Verify B device and strides TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); @@ -1091,6 +1092,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, // Verify scales device and strides TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); + TORCH_CHECK(b_scales.dtype() == torch::kFloat16, + "A is not float16, currently only float16 is supported"); // Alloc C matrix const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); diff --git a/vllm/csrc/torch_bindings.cpp b/vllm/csrc/torch_bindings.cpp index b8185c24d..4e64b9c92 100644 --- a/vllm/csrc/torch_bindings.cpp +++ b/vllm/csrc/torch_bindings.cpp @@ -101,7 +101,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( - "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> " "()"); ops.impl("rms_norm", torch::kCUDA, &rms_norm); @@ -111,6 +111,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Layernorm-quant + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, " + "Tensor scale, float epsilon) -> " + "()"); + ops.impl("rms_norm_static_fp8_quant", torch::kCUDA, + &rms_norm_static_fp8_quant); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, " + "Tensor! residual, Tensor weight, " + "Tensor scale, float epsilon) -> ()"); + ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA, + &fused_add_rms_norm_static_fp8_quant); + // Rotary embedding // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. ops.def( @@ -186,13 +203,36 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // conditionally compiled so impl in source file // Machete (Dense) Optimized Mixed Precision GEMM for Hopper. - ops.def("machete_supported_schedules(int btype) -> str[]"); ops.def( - "machete_gemm(Tensor A, Tensor B, int btype, " - " Tensor? scales, Tensor? zeros, int? group_size, " - " Tensor? C, float? alpha, float? beta, str? schedule)" - "-> Tensor"); - ops.def("machete_prepack_B(Tensor B, int btype) -> Tensor"); + "machete_supported_schedules(" + " ScalarType a_type," + " int b_type," + " ScalarType? maybe_group_scales_type," + " ScalarType? maybe_group_zeros_type," + " ScalarType? maybe_channel_scales_type," + " ScalarType? maybe_token_scales_type," + " ScalarType? maybe_out_type" + ") -> str[]"); + ops.def( + "machete_mm(" + " Tensor A," + " Tensor B," + " int b_type," + " ScalarType? out_type," + " Tensor? group_scales," + " Tensor? group_zeros," + " int? group_size," + " Tensor? channel_scales," + " Tensor? token_scales," + " str? schedule" + ") -> Tensor"); + ops.def( + "machete_prepack_B(" + " Tensor B," + " ScalarType a_type," + " int b_type," + " ScalarType? group_scales_type" + ") -> Tensor"); // conditionally compiled so impl registration is in source file ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor"); @@ -204,7 +244,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, " "int b_q_type, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " - "bool has_zp, bool use_fp32_reduce) -> Tensor"); + "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor"); // conditionally compiled so impl registration is in source file // gptq_marlin repack from GPTQ. @@ -218,6 +258,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "SymInt size_n, int num_bits) -> Tensor"); // conditionally compiled so impl registrations are in source file +#endif // Dequantization for GGML. ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor"); @@ -234,6 +275,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor"); ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); +#ifndef USE_ROCM // fp8_marlin Optimized Quantized GEMM for FP8 weight-only. ops.def( "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " @@ -322,18 +364,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute FP8 quantized tensor for given scaling factor. ops.def( - "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); + "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> " + "()"); ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant); // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor. ops.def( - "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> " + "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) " + "-> " "()"); ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant); // Compute dynamic-per-token FP8 quantized tensor and scaling factor. ops.def( - "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, " + "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, " "Tensor! scale, Tensor? scale_ub) -> " "()"); ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, @@ -341,13 +385,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( - "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," + "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale," "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " + "dynamic_scaled_int8_quant(Tensor! result, Tensor input, Tensor! scale, " "Tensor!? azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); @@ -411,27 +455,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { // Custom all-reduce kernels custom_ar.def( - "init_custom_ar(Tensor meta, Tensor rank_data, " - "str[] handles, int[] offsets, int rank, " - "bool full_nvlink) -> int"); + "init_custom_ar(int[] ipc_tensors, Tensor rank_data, " + "int rank, bool full_nvlink) -> int"); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); - - custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); - custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg); - custom_ar.def( - "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> " - "()"); - custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); + "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, " + "int reg_buffer_sz_bytes) -> ()"); + custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce); custom_ar.def("dispose", &dispose); custom_ar.def("meta_size", &meta_size); - custom_ar.def( - "register_buffer(int fa, Tensor t, str[] handles, " - "int[] offsets) -> ()"); - custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer); - + custom_ar.def("register_buffer", ®ister_buffer); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("register_graph_buffers", ®ister_graph_buffers); } diff --git a/vllm/csrc/type_convert.cuh b/vllm/csrc/type_convert.cuh new file mode 100644 index 000000000..21b9d0ae5 --- /dev/null +++ b/vllm/csrc/type_convert.cuh @@ -0,0 +1,165 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include + #include +#else + #include + #include + +using __nv_bfloat16 = __hip_bfloat16; +using __nv_bfloat162 = __hip_bfloat162; +#endif + +namespace vllm { +/* Converter structs for the conversion from torch types to HIP/CUDA types, + and the associated type conversions within HIP/CUDA. These helpers need + to be implemented for now because the relevant type conversion + operators/constructors are not consistently implemented by HIP/CUDA, so + a generic conversion via type casts cannot be implemented. + + Each struct should have the member static constexpr bool `exists`: + If false, the optimized kernel is not used for the corresponding torch type. + If true, the struct should be fully defined as shown in the examples below. + */ +template +struct _typeConvert { + static constexpr bool exists = false; +}; + +#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +// CUDA < 12.0 runs into issues with packed type conversion +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __half; + using packed_hip_type = __half2; + + __device__ static inline float convert(hip_type x) { return __half2float(x); } + __device__ static inline float2 convert(packed_hip_type x) { + return __half22float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2half_rn(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22half2_rn(x); + } +}; + + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +// CUDA_ARCH < 800 does not have BF16 support +// TODO: Add in ROCm support once public headers handle bf16 maturely +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __nv_bfloat16; + using packed_hip_type = __nv_bfloat162; + + __device__ static inline float convert(hip_type x) { + return __bfloat162float(x); + } + __device__ static inline float2 convert(packed_hip_type x) { + return __bfloat1622float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2bfloat16(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22bfloat162_rn(x); + } +}; + #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= + // 12000)) + +/* Vector POD struct to generate vectorized and packed FP16/BF16 ops + for appropriate specializations of fused_add_rms_norm_kernel. + Only functions that are necessary in that kernel are implemented. + Alignment to 16 bytes is required to use 128-bit global memory ops. + */ +template +struct alignas(16) _f16Vec { + /* Not theoretically necessary that width is a power of 2 but should + almost always be the case for optimization purposes */ + static_assert(width > 0 && (width & (width - 1)) == 0, + "Width is not a positive power of 2!"); + using Converter = _typeConvert; + using T1 = typename Converter::hip_type; + using T2 = typename Converter::packed_hip_type; + T1 data[width]; + + __device__ _f16Vec& operator+=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp += T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] += other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp *= T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] *= other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const float scale) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); + temp_f.x *= scale; + temp_f.y *= scale; + T2 temp = Converter::convert(temp_f); + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float temp = Converter::convert(data[i]) * scale; + data[i] = Converter::convert(temp); + } + } + return *this; + } + + __device__ float sum_squares() const { + float result = 0.0f; + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + result += z.x * z.x + z.y * z.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float x = Converter::convert(data[i]); + result += x * x; + } + } + return result; + } +}; +} // namespace vllm \ No newline at end of file diff --git a/vllm/docs/requirements-docs.txt b/vllm/docs/requirements-docs.txt index d58f22613..5c80645b4 100644 --- a/vllm/docs/requirements-docs.txt +++ b/vllm/docs/requirements-docs.txt @@ -12,6 +12,9 @@ pydantic >= 2.8 torch py-cpuinfo transformers -mistral_common >= 1.3.4 +mistral_common >= 1.5.0 +aiohttp +starlette openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args -partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args \ No newline at end of file +partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args +requests diff --git a/vllm/docs/source/_static/custom.js b/vllm/docs/source/_static/custom.js index f475be71f..18b502c78 100644 --- a/vllm/docs/source/_static/custom.js +++ b/vllm/docs/source/_static/custom.js @@ -9,6 +9,8 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; diff --git a/vllm/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png b/vllm/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png new file mode 100644 index 000000000..bbf46286c Binary files /dev/null and b/vllm/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png differ diff --git a/vllm/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png b/vllm/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png new file mode 100644 index 000000000..ade1d602a Binary files /dev/null and b/vllm/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png differ diff --git a/vllm/docs/source/assets/design/hierarchy.png b/vllm/docs/source/assets/design/hierarchy.png new file mode 100644 index 000000000..6a1b4ba95 Binary files /dev/null and b/vllm/docs/source/assets/design/hierarchy.png differ diff --git a/vllm/docs/source/automatic_prefix_caching/details.md b/vllm/docs/source/automatic_prefix_caching/details.md index 2d3214e28..17f806217 100644 --- a/vllm/docs/source/automatic_prefix_caching/details.md +++ b/vllm/docs/source/automatic_prefix_caching/details.md @@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. -# Generalized Caching Policy +## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. diff --git a/vllm/docs/source/community/meetups.rst b/vllm/docs/source/community/meetups.rst index a3962e96e..c87f01aa2 100644 --- a/vllm/docs/source/community/meetups.rst +++ b/vllm/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ - `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ - `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. `[Slides] `__ diff --git a/vllm/docs/source/community/sponsors.md b/vllm/docs/source/community/sponsors.md index 52fbf9a57..c6f83b3a9 100644 --- a/vllm/docs/source/community/sponsors.md +++ b/vllm/docs/source/community/sponsors.md @@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/vllm/docs/source/conf.py b/vllm/docs/source/conf.py index 8435129e7..e9d9ac68c 100644 --- a/vllm/docs/source/conf.py +++ b/vllm/docs/source/conf.py @@ -10,11 +10,13 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. +import inspect import logging import os import sys from typing import List +import requests from sphinx.ext import autodoc logger = logging.getLogger(__name__) @@ -34,6 +36,7 @@ extensions = [ "sphinx.ext.napoleon", "sphinx.ext.viewcode", + "sphinx.ext.linkcode", "sphinx.ext.intersphinx", "sphinx_copybutton", "sphinx.ext.autodoc", @@ -94,9 +97,71 @@ def setup(app): generate_examples() +_cached_base: str = "" +_cached_branch: str = "" + + +def get_repo_base_and_branch(pr_number): + global _cached_base, _cached_branch + if _cached_base and _cached_branch: + return _cached_base, _cached_branch + + url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}" + response = requests.get(url) + if response.status_code == 200: + data = response.json() + _cached_base = data['head']['repo']['full_name'] + _cached_branch = data['head']['ref'] + return _cached_base, _cached_branch + else: + logger.error("Failed to fetch PR details: %s", response) + return None, None + + +def linkcode_resolve(domain, info): + if domain != 'py': + return None + if not info['module']: + return None + filename = info['module'].replace('.', '/') + module = info['module'] + + # try to determine the correct file and line number to link to + obj = sys.modules[module] + + # get as specific as we can + lineno: int = 0 + filename: str = "" + try: + for part in info['fullname'].split('.'): + obj = getattr(obj, part) + + if not (inspect.isclass(obj) or inspect.isfunction(obj) + or inspect.ismethod(obj)): + obj = obj.__class__ # Get the class of the instance + + lineno = inspect.getsourcelines(obj)[1] + filename = (inspect.getsourcefile(obj) + or f"{filename}.py").split("vllm/", 1)[1] + except Exception: + # For some things, like a class member, won't work, so + # we'll use the line number of the parent (the class) + pass + + if filename.startswith("checkouts/"): + # a PR build on readthedocs + pr_number = filename.split("/")[1] + filename = filename.split("/", 2)[2] + base, branch = get_repo_base_and_branch(pr_number) + if base and branch: + return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}" + + # Otherwise, link to the source file on the main branch + return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}" + + # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ - "aiohttp", "compressed_tensors", "cpuinfo", "cv2", @@ -113,10 +178,12 @@ def setup(app): "tensorizer", "pynvml", "outlines", + "xgrammar," "librosa", "soundfile", "gguf", "lark", + "decord", ] for mock_target in autodoc_mock_imports: @@ -143,6 +210,7 @@ def add_line(self, line: str, source: str, *lineno: int) -> None: "python": ("https://docs.python.org/3", None), "typing_extensions": ("https://typing-extensions.readthedocs.io/en/latest", None), + "aiohttp": ("https://docs.aiohttp.org/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None), "numpy": ("https://numpy.org/doc/stable", None), "torch": ("https://pytorch.org/docs/stable", None), diff --git a/vllm/docs/source/dev/dockerfile/dockerfile.rst b/vllm/docs/source/contributing/dockerfile/dockerfile.rst similarity index 100% rename from vllm/docs/source/dev/dockerfile/dockerfile.rst rename to vllm/docs/source/contributing/dockerfile/dockerfile.rst diff --git a/vllm/docs/source/contributing/overview.rst b/vllm/docs/source/contributing/overview.rst new file mode 100644 index 000000000..4cea0afda --- /dev/null +++ b/vllm/docs/source/contributing/overview.rst @@ -0,0 +1,164 @@ +Contributing to vLLM +===================== + +Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: + +- Identify and report any issues or bugs. +- Request or add support for a new model. +- Suggest or implement new features. +- Improve documentation or contribute a how-to guide. + +We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. + +Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! + +License +------- + +See `LICENSE `_. + +Developing +---------- + +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. + +Testing +------- + +.. code-block:: bash + + pip install -r requirements-dev.txt + + # linting and formatting + bash format.sh + # Static type checking + mypy + # Unit tests + pytest tests/ + +.. note:: Currently, the repository does not pass the ``mypy`` tests. + +Contribution Guidelines +======================= + +Issues +------ + +If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. important:: + If you discover a security vulnerability, please follow the instructions `here `_. + +Pull Requests & Code Reviews +---------------------------- + +Thank you for your contribution to vLLM! Before submitting the pull request, +please ensure the PR meets the following criteria. This helps vLLM maintain the +code quality and improve the efficiency of the review process. + +DCO and Signed-off-by +^^^^^^^^^^^^^^^^^^^^^ + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +PR Title and Classification +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Only specific types of PRs will be reviewed. The PR title is prefixed +appropriately to indicate the type of change. Please use one of the following: + +- ``[Bugfix]`` for bug fixes. +- ``[CI/Build]`` for build or continuous integration improvements. +- ``[Doc]`` for documentation fixes and improvements. +- ``[Model]`` for adding a new model or improving an existing model. Model name + should appear in the title. +- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, + ``LLM`` class, etc.) +- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. +- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, + ``AsyncLLMEngine``, ``Scheduler``, etc.) +- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., ``[Hardware][AMD]``). +- ``[Misc]`` for PRs that do not fit the above categories. Please use this + sparingly. + +.. note:: + If the PR spans more than one category, please include all relevant prefixes. + +Code Quality +^^^^^^^^^^^^ + +The PR needs to meet the following code quality standards: + +- We adhere to `Google Python style guide + `_ and `Google C++ style guide + `_. +- Pass all linter checks. Please use `format.sh + `_ to format your + code. +- The code needs to be well-documented to ensure future contributors can easily + understand the code. +- Include sufficient tests to ensure the project stays correct and robust. This + includes both unit tests and integration tests. +- Please add documentation to ``docs/source/`` if the PR modifies the + user-facing behaviors of vLLM. It helps vLLM users understand and utilize the + new features or changes. + +Adding or Changing Kernels +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. + +- Make sure custom ops are registered following PyTorch guidelines: + `Custom C++ and CUDA Operators `_ + and `The Custom Operators Manual `_. +- Custom operations that return ``Tensors`` require meta-functions. + Meta-functions should be implemented and registered in Python so that dynamic + dims can be handled automatically. See above documents for a description of + meta-functions. +- Use `torch.library.opcheck() `_ + to test the function registration and meta-function for any registered ops. + See ``tests/kernels`` for examples. +- When changing the C++ signature of an existing op, the schema must be updated + to reflect the changes. +- If a new custom type is needed, see the following document: + `Custom Class Support in PT2 `_. + +Notes for Large Changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Please keep the changes as concise as possible. For major architectural changes +(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue +(RFC) discussing the technical design and justification. Otherwise, we will tag +it with ``rfc-required`` and might not go through the PR. + +What to Expect for the Reviews +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The goal of the vLLM team is to be a *transparent reviewing machine*. We would +like to make the review process transparent and efficient and make sure no +contributor feels confused or frustrated. However, the vLLM team is small, so we +need to prioritize some PRs over others. Here is what you can expect from the +review process: + +- After the PR is submitted, the PR will be assigned to a reviewer. Every + reviewer will pick up the PRs based on their expertise and availability. +- After the PR is assigned, the reviewer will provide status updates every 2-3 + days. If the PR is not reviewed within 7 days, please feel free to ping the + reviewer or the vLLM team. +- After the review, the reviewer will put an ``action-required`` label on the PR + if there are changes required. The contributor should address the comments and + ping the reviewer to re-review the PR. +- Please respond to all comments within a reasonable time frame. If a comment + isn't clear or you disagree with a suggestion, feel free to ask for + clarification or discuss the suggestion. + +Thank You +--------- + +Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. +All of your contributions help make vLLM a great tool and community for everyone! diff --git a/vllm/docs/source/dev/profiling/profiling_index.rst b/vllm/docs/source/contributing/profiling/profiling_index.rst similarity index 97% rename from vllm/docs/source/dev/profiling/profiling_index.rst rename to vllm/docs/source/contributing/profiling/profiling_index.rst index 9e8b2f181..a422b1fcd 100644 --- a/vllm/docs/source/dev/profiling/profiling_index.rst +++ b/vllm/docs/source/contributing/profiling/profiling_index.rst @@ -1,5 +1,6 @@ -Profiling vLLM -================================= +============== +Profiling vLLM +============== We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` diff --git a/vllm/docs/source/design/arch_overview.rst b/vllm/docs/source/design/arch_overview.rst new file mode 100644 index 000000000..bc3f509f0 --- /dev/null +++ b/vllm/docs/source/design/arch_overview.rst @@ -0,0 +1,274 @@ +.. _arch_overview: + +Architecture Overview +====================== + +This document provides an overview of the vLLM architecture. + +.. contents:: Table of Contents + :local: + :depth: 2 + +Entrypoints +----------- + +vLLM provides a number of entrypoints for interacting with the system. The +following diagram shows the relationship between them. + +.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png + :alt: Entrypoints Diagram + +LLM Class +^^^^^^^^^ + +The LLM class provides the primary Python interface for doing offline inference, +which is interacting with a model without using a separate model inference +server. + +Here is a sample of `LLM` class usage: + +.. code-block:: python + + from vllm import LLM, SamplingParams + + # Define a list of input prompts + prompts = [ + "Hello, my name is", + "The capital of France is", + "The largest ocean is", + ] + + # Define sampling parameters + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + # Initialize the LLM engine with the OPT-125M model + llm = LLM(model="facebook/opt-125m") + + # Generate outputs for the input prompts + outputs = llm.generate(prompts, sampling_params) + + # Print the generated outputs + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +More API details can be found in the :doc:`Offline Inference +` section of the API docs. + +The code for the `LLM` class can be found in `vllm/entrypoints/llm.py +`_. + +OpenAI-compatible API server +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The second primary interface to vLLM is via its OpenAI-compatible API server. +This server can be started using the `vllm serve` command. + +.. code-block:: bash + + vllm serve + +The code for the `vllm` CLI can be found in `vllm/scripts.py +`_. + +Sometimes you may see the API server entrypoint used directly instead of via the +`vllm` CLI command. For example: + +.. code-block:: bash + + python -m vllm.entrypoints.openai.api_server --model + +That code can be found in `vllm/entrypoints/openai/api_server.py +`_. + +More details on the API server can be found in the :doc:`OpenAI Compatible +Server ` document. + +LLM Engine +---------- + +The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of +the vLLM system, handling model inference and asynchronous request processing. + +.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png + :alt: LLMEngine Diagram + +LLMEngine +^^^^^^^^^ + +The `LLMEngine` class is the core component of the vLLM engine. It is +responsible for receiving requests from clients and generating outputs from the +model. The `LLMEngine` includes input processing, model execution (possibly +distributed across multiple hosts and/or GPUs), scheduling, and output +processing. + +- **Input Processing**: Handles tokenization of input text using the specified + tokenizer. + +- **Scheduling**: Chooses which requests are processed in each step. + +- **Model Execution**: Manages the execution of the language model, including + distributed execution across multiple GPUs. + +- **Output Processing**: Processes the outputs generated by the model, decoding the + token IDs from a language model into human-readable text. + +The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_. + +.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py + +AsyncLLMEngine +^^^^^^^^^^^^^^ + +The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class. +It uses `asyncio` to create a background loop that continuously processes +incoming requests. The `AsyncLLMEngine` is designed for online serving, where it +can handle multiple concurrent requests and stream outputs to clients. + +The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo +API server that serves as a simpler example in +`vllm/entrypoints/api_server.py`_. + +.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py + +The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_. + +.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py + +Worker +------ + +A worker is a process that runs the model inference. vLLM follows the common +practice of using one process to control one accelerator device, such as GPUs. +For example, if we use tensor parallelism of size 2 and pipeline parallelism of +size 2, we will have 4 workers in total. Workers are identified by their +``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while +``local_rank`` is mainly used for assigning the accelerator device and accessing +local resources such as the file system and shared memory. + +Model Runner +------------ + +Every worker has one model runner object, responsible for loading and running +the model. Much of the model execution logic resides here, such as preparing +input tensors and capturing cudagraphs. + +Model +----- + +Every model runner object has one model object, which is the actual +``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various +configurations affect the class we ultimately get. + +Class Hierarchy +--------------- + +The following figure shows the class hierarchy of vLLM: + + .. figure:: /assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object +containing all the necessary information. The `VllmConfig +`__ +class is the main configuration object that is passed around. The class +hierarchy is quite deep, and every class needs to read the configuration it is +interested in. By encapsulating all configurations in one object, we can easily +pass the configuration object around and access the configuration we need. +Suppose we want to add a new feature (this is often the case given how fast the +field of LLM inference is evolving) that only touches the model runner. We will +have to add a new configuration option in the `VllmConfig` class. Since we pass +the whole config object around, we only need to add the configuration option to +the `VllmConfig` class, and the model runner can access it directly. We don't +need to change the constructor of the engine, worker, or model class to pass the +new configuration option. + +2. **Uniformity**: The model runner needs a unified interface to create and +initialize the model. vLLM supports more than 50 types of popular open-source +models. Each model has its own initialization logic. If the constructor +signature varies with models, the model runner does not know how to call the +constructor accordingly, without complicated and error-prone inspection logic. +By making the constructor of the model class uniform, the model runner can +easily create and initialize the model without knowing the specific model type. +This is also useful for composing models. Vision-language models often consist +of a vision model and a language model. By making the constructor uniform, we +can easily create a vision model and a language model and compose them into a +vision-language model. + +.. note:: + + To support this change, all vLLM models' signatures have been updated to: + + .. code-block:: python + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + .. code-block:: python + + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + + This way, the model can work with both old and new versions of vLLM. + +3. **Sharding and Quantization at Initialization**: Certain features require +changing the model weights. For example, tensor parallelism needs to shard the +model weights, and quantization needs to quantize the model weights. There are +two possible ways to implement this feature. One way is to change the model +weights after the model is initialized. The other way is to change the model +weights during the model initialization. vLLM chooses the latter. The first +approach is not scalable to large models. Suppose we want to run a 405B model +(with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should +only load 50GB weights. If we change the model weights after the model is +initialized, we need to load the full 810GB weights to every GPU and then shard +the weights, leading to a huge memory overhead. Instead, if we shard the weights +during the model initialization, every layer will only create a shard of the +weights it needs, leading to a much smaller memory overhead. The same idea +applies to quantization. Note that we also add an additional argument ``prefix`` +to the model's constructor so that the model can initialize itself differently +based on the prefix. This is useful for non-uniform quantization, where +different parts of the model are quantized differently. The ``prefix`` is +usually an empty string for the top-level model and a string like ``"vision"`` +or ``"language"`` for the sub-models. In general, it matches the name of the +module's state dict in the checkpoint file. + +One disadvantage of this design is that it is hard to write unit tests for +individual components in vLLM because every component needs to be initialized by +a complete config object. We solve this problem by providing a default +initialization function that creates a default config object with all fields set +to ``None``. If the component we want to test only cares about a few fields in +the config object, we can create a default config object and set the fields we +care about. This way, we can test the component in isolation. Note that many +tests in vLLM are end-to-end tests that test the whole system, so this is not a +big problem. + +In summary, the complete config object ``VllmConfig`` can be treated as an +engine-level global state that is shared among all vLLM classes. diff --git a/vllm/docs/source/design/huggingface_integration.rst b/vllm/docs/source/design/huggingface_integration.rst new file mode 100644 index 000000000..e6c1cea60 --- /dev/null +++ b/vllm/docs/source/design/huggingface_integration.rst @@ -0,0 +1,40 @@ +.. _huggingface_integration: + +Integration with HuggingFace +=================================== + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``. + +Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. + +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. Within this process: + + - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + + - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. + + - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. + +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. + +3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: + + - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. + + - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. + +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. + +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/vllm/docs/source/dev/input_processing/input_processing_pipeline.rst b/vllm/docs/source/design/input_processing/input_processing_pipeline.rst similarity index 100% rename from vllm/docs/source/dev/input_processing/input_processing_pipeline.rst rename to vllm/docs/source/design/input_processing/input_processing_pipeline.rst diff --git a/vllm/docs/source/dev/input_processing/model_inputs_index.rst b/vllm/docs/source/design/input_processing/model_inputs_index.rst similarity index 100% rename from vllm/docs/source/dev/input_processing/model_inputs_index.rst rename to vllm/docs/source/design/input_processing/model_inputs_index.rst diff --git a/vllm/docs/source/dev/kernel/paged_attention.rst b/vllm/docs/source/design/kernel/paged_attention.rst similarity index 100% rename from vllm/docs/source/dev/kernel/paged_attention.rst rename to vllm/docs/source/design/kernel/paged_attention.rst diff --git a/vllm/docs/source/dev/multimodal/adding_multimodal_plugin.rst b/vllm/docs/source/design/multimodal/adding_multimodal_plugin.rst similarity index 100% rename from vllm/docs/source/dev/multimodal/adding_multimodal_plugin.rst rename to vllm/docs/source/design/multimodal/adding_multimodal_plugin.rst diff --git a/vllm/docs/source/dev/multimodal/multimodal_index.rst b/vllm/docs/source/design/multimodal/multimodal_index.rst similarity index 88% rename from vllm/docs/source/dev/multimodal/multimodal_index.rst rename to vllm/docs/source/design/multimodal/multimodal_index.rst index e112b43aa..c6d47f90b 100644 --- a/vllm/docs/source/dev/multimodal/multimodal_index.rst +++ b/vllm/docs/source/design/multimodal/multimodal_index.rst @@ -7,7 +7,7 @@ Multi-Modality vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. -Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` +Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities @@ -15,9 +15,6 @@ by following :ref:`this guide `. Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here `. -.. - TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported - Guides ++++++ @@ -53,7 +50,7 @@ Base Classes .. autodata:: vllm.multimodal.MultiModalDataDict -.. autoclass:: vllm.multimodal.MultiModalInputs +.. autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: diff --git a/vllm/docs/source/design/plugin_system.rst b/vllm/docs/source/design/plugin_system.rst new file mode 100644 index 000000000..5a96cc8b3 --- /dev/null +++ b/vllm/docs/source/design/plugin_system.rst @@ -0,0 +1,62 @@ +.. _plugin_system: + +vLLM's Plugin System +==================== + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +How Plugins Work in vLLM +------------------------ + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`arch_overview`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. + +How vLLM Discovers Plugins +-------------------------- + +vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: + +.. code-block:: python + + # inside `setup.py` file + from setuptools import setup + + setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + + # inside `vllm_add_dummy_model.py` file + def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") + +For more information on adding entry points to your package, please check the `official documentation `__. + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. + +2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. + +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. + +What Can Plugins Do? +-------------------- + +Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. + +Guidelines for Writing Plugins +------------------------------ + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +Compatibility Guarantee +----------------------- + +vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. diff --git a/vllm/docs/source/dev/pooling_params.rst b/vllm/docs/source/dev/pooling_params.rst new file mode 100644 index 000000000..334e0287a --- /dev/null +++ b/vllm/docs/source/dev/pooling_params.rst @@ -0,0 +1,5 @@ +Pooling Parameters +================== + +.. autoclass:: vllm.PoolingParams + :members: diff --git a/vllm/docs/source/getting_started/amd-installation.rst b/vllm/docs/source/getting_started/amd-installation.rst index 301337aeb..ece5d785e 100644 --- a/vllm/docs/source/getting_started/amd-installation.rst +++ b/vllm/docs/source/getting_started/amd-installation.rst @@ -13,8 +13,6 @@ Requirements * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * ROCm 6.2 -Note: PyTorch 2.5+/ROCm6.2 dropped the support for python 3.8. - Installation options: #. :ref:`Build from source with docker ` diff --git a/vllm/docs/source/getting_started/arm-installation.rst b/vllm/docs/source/getting_started/arm-installation.rst new file mode 100644 index 000000000..7b457df92 --- /dev/null +++ b/vllm/docs/source/getting_started/arm-installation.rst @@ -0,0 +1,50 @@ +.. _installation_arm: + +Installation for ARM CPUs +========================= + +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: + +* CPU backend inference capabilities +* Relevant runtime environment variables +* Performance optimization tips + +ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes. +Contents: + +1. :ref:`Requirements ` +2. :ref:`Quick Start with Dockerfile ` +3. :ref:`Building from Source ` + +.. _arm_backend_requirements: + +Requirements +------------ + +* **Operating System**: Linux or macOS +* **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +* **Instruction Set Architecture (ISA)**: NEON support is required + +.. _arm_backend_quick_start_dockerfile: + +Quick Start with Dockerfile +--------------------------- + +You can quickly set up vLLM on ARM using Docker: + +.. code-block:: console + + $ docker build -f Dockerfile.arm -t vllm-cpu-env --shm-size=4g . + $ docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env + +.. _build_arm_backend_from_source: + +Building from Source +-------------------- + +To build vLLM from source on Ubuntu 22.04 or other Linux distributions, follow a similar process as with x86. Testing has been conducted on AWS Graviton3 instances for compatibility. diff --git a/vllm/docs/source/getting_started/cpu-installation.rst b/vllm/docs/source/getting_started/cpu-installation.rst index d12aeebbb..649de1cd9 100644 --- a/vllm/docs/source/getting_started/cpu-installation.rst +++ b/vllm/docs/source/getting_started/cpu-installation.rst @@ -3,13 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: -- Tensor Parallel (``-tp = N``) -- Quantization (``INT8 W8A8, AWQ``) - -.. note:: - FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. +- Tensor Parallel +- Model Quantization (``INT8 W8A8, AWQ``) +- Chunked-prefill +- Prefix-caching +- FP8-E5M2 KV-Caching (TODO) Table of contents: @@ -72,8 +72,6 @@ Build from source $ VLLM_TARGET_DEVICE=cpu python setup.py install .. note:: - - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support. - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. diff --git a/vllm/docs/source/getting_started/debugging.rst b/vllm/docs/source/getting_started/debugging.rst index 91978065f..0c1afcbd7 100644 --- a/vllm/docs/source/getting_started/debugging.rst +++ b/vllm/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. @@ -75,11 +79,13 @@ If GPU/CPU communication cannot be established, you can use the following Python print("PyTorch GLOO is successful!") + if world_size <= 1: + exit() + # Test vLLM NCCL, with cuda graph from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank) - pynccl.disabled = False s = torch.cuda.Stream() with torch.cuda.stream(s): @@ -119,6 +125,8 @@ If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes` If the script runs successfully, you should see the message ``sanity check is successful!``. +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. + .. note:: A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: diff --git a/vllm/docs/source/getting_started/gaudi-installation.rst b/vllm/docs/source/getting_started/gaudi-installation.rst new file mode 100644 index 000000000..249e08278 --- /dev/null +++ b/vllm/docs/source/getting_started/gaudi-installation.rst @@ -0,0 +1,402 @@ +Installation with Intel® Gaudi® AI Accelerators +=============================================== + +This README provides instructions on running vLLM with Intel Gaudi devices. + +Requirements and Installation +----------------------------- + +Please follow the instructions provided in the `Gaudi Installation +Guide `__ +to set up the execution environment. To achieve the best performance, +please follow the methods outlined in the `Optimizing Training Platform +Guide `__. + +Requirements +~~~~~~~~~~~~ + +- OS: Ubuntu 22.04 LTS +- Python: 3.10 +- Intel Gaudi accelerator +- Intel Gaudi software version 1.18.0 + + +Quick start using Dockerfile +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. code:: console + + $ docker build -f Dockerfile.hpu -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + + +.. tip:: + If you're observing the following error: ``docker: Error response from daemon: Unknown runtime specified habana.``, please refer to "Install Using Containers" section of `Intel Gaudi Software Stack and Driver Installation `__. Make sure you have ``habana-container-runtime`` package installed and that ``habana`` container runtime is registered. + + +Build from source +~~~~~~~~~~~~~~~~~ + +Environment verification +^^^^^^^^^^^^^^^^^^^^^^^^ + +To verify that the Intel Gaudi software was correctly installed, run: + +.. code:: console + + $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible + $ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed + $ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed + $ pip list | grep neural # verify that neural_compressor is installed + +Refer to `Intel Gaudi Software Stack +Verification `__ +for more details. + +Run Docker Image +^^^^^^^^^^^^^^^^ + +It is highly recommended to use the latest Docker image from Intel Gaudi +vault. Refer to the `Intel Gaudi +documentation `__ +for more details. + +Use the following commands to run a Docker image: + +.. code:: console + + $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest + +Build and Install vLLM +^^^^^^^^^^^^^^^^^^^^^^ + +To build and install vLLM from source, run: + +.. code:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python setup.py develop + + +Currently, the latest features and performance optimizations are developed in Gaudi's `vLLM-fork `__ and we periodically upstream them to vLLM main repo. To install latest `HabanaAI/vLLM-fork `__, run the following: + +.. code:: console + + $ git clone https://github.com/HabanaAI/vllm-fork.git + $ cd vllm-fork + $ git checkout habana_main + $ python setup.py develop + + +Supported Features +------------------ + +- `Offline batched + inference `__ +- Online inference via `OpenAI-Compatible + Server `__ +- HPU autodetection - no need to manually select device within vLLM +- Paged KV cache with algorithms enabled for Intel Gaudi accelerators +- Custom Intel Gaudi implementations of Paged Attention, KV cache ops, + prefill attention, Root Mean Square Layer Normalization, Rotary + Positional Encoding +- Tensor parallelism support for multi-card inference +- Inference with `HPU Graphs `__ + for accelerating low-batch latency and throughput +- Attention with Linear Biases (ALiBi) + +Unsupported Features +-------------------- + +- Beam search +- LoRA adapters +- Quantization +- Prefill chunking (mixed-batch inferencing) + +Supported Configurations +------------------------ + +The following configurations have been validated to be function with +Gaudi2 devices. Configurations that are not listed may or may not work. + +- `meta-llama/Llama-2-7b `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-7b-chat-hf `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-8B-Instruct `__ + on single HPU, or with tensor parallelism on 2x and 8x HPU, BF16 + datatype with random or greedy sampling +- `meta-llama/Llama-2-70b `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Llama-2-70b-chat-hf `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling +- `meta-llama/Meta-Llama-3.1-70B-Instruct `__ + with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling + +Performance Tuning +------------------ + +Execution modes +~~~~~~~~~~~~~~~ + +Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via ``PT_HPU_LAZY_MODE`` environment variable), and ``--enforce-eager`` flag. + +.. list-table:: vLLM execution modes + :widths: 25 25 50 + :header-rows: 1 + + * - ``PT_HPU_LAZY_MODE`` + - ``enforce_eager`` + - execution mode + * - 0 + - 0 + - torch.compile + * - 0 + - 1 + - PyTorch eager mode + * - 1 + - 0 + - HPU Graphs + * - 1 + - 1 + - PyTorch lazy mode + +.. warning:: + In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. + + +Bucketing mechanism +~~~~~~~~~~~~~~~~~~~ + +Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. `Intel Gaudi Graph Compiler `__ is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. +In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - ``batch_size`` and ``sequence_length``. + +.. note:: + Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase. + +Bucketing ranges are determined with 3 parameters - ``min``, ``step`` and ``max``. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: + +.. code-block:: + + INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + +``min`` determines the lowest value of the bucket. ``step`` determines the interval between buckets, and ``max`` determines the upper bound of the bucket. Furthermore, interval between ``min`` and ``step`` has special handling - ``min`` gets multiplied by consecutive powers of two, until ``step`` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. + +Example (with ramp-up) + +.. code-block:: + + min = 2, step = 32, max = 64 + => ramp_up = (2, 4, 8, 16) + => stable = (32, 64) + => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) + +Example (without ramp-up) + +.. code-block:: + + min = 128, step = 128, max = 512 + => ramp_up = () + => stable = (128, 256, 384, 512) + => buckets = ramp_up + stable => (128, 256, 384, 512) + + +In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. + +.. warning:: + If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. + +As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as ``(4, 512)`` prefill bucket, as ``batch_size`` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as ``(4, 512)`` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a ``(2, 512)`` bucket, or context length increases above 512 tokens, in which case it will become ``(4, 640)`` bucket. + +.. note:: + Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. + +Warmup +~~~~~~ + +Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. Each warmup step is logged during vLLM startup: + +.. code-block:: + + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB + INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB + INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB + ... + INFO 08-01 22:26:59 hpu_model_runner.py:1066] [Warmup][Prompt][24/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][1/48] batch_size:4 seq_len:2048 free_mem:55.43 GiB + INFO 08-01 22:27:00 hpu_model_runner.py:1066] [Warmup][Decode][2/48] batch_size:4 seq_len:1920 free_mem:55.43 GiB + INFO 08-01 22:27:01 hpu_model_runner.py:1066] [Warmup][Decode][3/48] batch_size:4 seq_len:1792 free_mem:55.43 GiB + ... + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size:2 seq_len:128 free_mem:55.43 GiB + INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + +This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. + +.. tip:: + Compiling all the buckets might take some time and can be turned off with ``VLLM_SKIP_WARMUP=true`` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. + +HPU Graph capture +~~~~~~~~~~~~~~~~~ + +`HPU Graphs `__ are currently the most performant execution method of vLLM on Intel Gaudi. When HPU Graphs are enabled, execution graphs will be traced (recorded) ahead of time (after performing warmup), to be later replayed during inference, significantly reducing host overheads. Recording can take large amounts of memory, which needs to be taken into account when allocating KV cache. Enabling HPU Graphs will impact the number of available KV cache blocks, but vLLM provides user-configurable variables to control memory management. + + +When HPU Graphs are being used, they share the common memory pool ("usable memory") as KV cache, determined by ``gpu_memory_utilization`` flag (``0.9`` by default). +Before KV cache gets allocated, model weights are loaded onto the device, and a forward pass of the model is executed on dummy data, to estimate memory usage. +Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable. +Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured. +Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture. +With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache. +Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.3``), both stages have equal memory constraints. +Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs. + +.. note:: + ``gpu_memory_utilization`` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, ``gpu_memory_utilization`` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory. + +User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented: +- ``max_bs`` - graph capture queue will sorted in descending order by their batch sizes. Buckets with equal batch sizes are sorted by sequence length in ascending order (e.g. ``(64, 128)``, ``(64, 256)``, ``(32, 128)``, ``(32, 256)``, ``(1, 128)``, ``(1,256)``), default strategy for decode +- ``min_tokens`` - graph capture queue will be sorted in ascending order by the number of tokens each graph processes (``batch_size*sequence_length``), default strategy for prompt + +When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by ``max_bs`` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in ``min_tokens`` strategy. + + +.. note:: + ``VLLM_GRAPH_PROMPT_RATIO`` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * ``VLLM_GRAPH_PROMPT_RATIO``) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below. + + +Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): + +.. code-block:: + + INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] + INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] + INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] + INFO 08-02 17:37:44 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:37:52 hpu_model_runner.py:430] Pre-loading model weights on hpu:0 took 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:438] Wrapping in HPU Graph took 0 B of device memory (14.97 GiB/94.62 GiB used) and -252 KiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:52 hpu_model_runner.py:442] Loading model weights took in total 14.97 GiB of device memory (14.97 GiB/94.62 GiB used) and 2.95 GiB of host memory (475.2 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:134] Model profiling run took 504 MiB of device memory (15.46 GiB/94.62 GiB used) and 180.9 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_worker.py:158] Free device memory: 79.16 GiB, 39.58 GiB usable (gpu_memory_utilization=0.5), 15.83 GiB reserved for HPUGraphs (VLLM_GRAPH_RESERVED_MEM=0.4), 23.75 GiB reserved for KV cache + INFO 08-02 17:37:54 hpu_executor.py:85] # HPU blocks: 1519, # CPU blocks: 0 + INFO 08-02 17:37:54 hpu_worker.py:190] Initializing cache engine took 23.73 GiB of device memory (39.2 GiB/94.62 GiB used) and -1.238 MiB of host memory (475.4 GiB/1007 GiB used) + INFO 08-02 17:37:54 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:55.43 GiB + ... + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB + INFO 08-02 17:38:22 hpu_model_runner.py:1159] Using 15.85 GiB/55.43 GiB of free device memory for HPUGraphs, 7.923 GiB for prompt and 7.923 GiB for decode (VLLM_GRAPH_PROMPT_RATIO=0.3) + INFO 08-02 17:38:22 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][1/24] batch_size:1 seq_len:128 free_mem:55.43 GiB + ... + INFO 08-02 17:38:26 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][11/24] batch_size:1 seq_len:896 free_mem:48.77 GiB + INFO 08-02 17:38:27 hpu_model_runner.py:1066] [Warmup][Graph/Decode][1/48] batch_size:4 seq_len:128 free_mem:47.51 GiB + ... + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Decode][48/48] batch_size:1 seq_len:2048 free_mem:47.35 GiB + INFO 08-02 17:38:41 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][12/24] batch_size:4 seq_len:256 free_mem:47.35 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][13/24] batch_size:2 seq_len:512 free_mem:45.91 GiB + INFO 08-02 17:38:42 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][14/24] batch_size:1 seq_len:1024 free_mem:44.48 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1066] [Warmup][Graph/Prompt][15/24] batch_size:2 seq_len:640 free_mem:43.03 GiB + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Prompt captured:15 (62.5%) used_mem:14.03 GiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (4, 128), (4, 256)] + INFO 08-02 17:38:43 hpu_model_runner.py:1128] Graph/Decode captured:48 (100.0%) used_mem:161.9 MiB buckets:[(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] + INFO 08-02 17:38:43 hpu_model_runner.py:1206] Warmup finished in 49 secs, allocated 14.19 GiB of device memory + INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of device memory (53.39 GiB/94.62 GiB used) and 57.86 MiB of host memory (475.4 GiB/1007 GiB used) + + +Recommended vLLM Parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- We recommend running inference on Gaudi 2 with ``block_size`` of 128 + for BF16 data type. Using default values (16, 32) might lead to + sub-optimal performance due to Matrix Multiplication Engine + under-utilization (see `Gaudi + Architecture `__). +- For max throughput on Llama 7B, we recommend running with batch size + of 128 or 256 and max context length of 2048 with HPU Graphs enabled. + If you encounter out-of-memory issues, see troubleshooting section. + +Environment variables +~~~~~~~~~~~~~~~~~~~~~ + +**Diagnostic and profiling knobs:** + +- ``VLLM_PROFILER_ENABLED``: if ``true``, high level profiler will be enabled. Resulting JSON traces can be viewed in `perfetto.habana.ai `__. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION``: if ``true``, will log graph compilations per each vLLM engine step, only when there was any - highly recommended to use alongside ``PT_HPU_METRICS_GC_DETAILS=1``. Disabled by default. +- ``VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL``: if ``true``, will log graph compilations per each vLLM engine step, always, even if there were none. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS``: if ``true``, will log cpu fallbacks per each vLLM engine step, only when there was any. Disabled by default. +- ``VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL``: if ``true``, will log cpu fallbacks per each vLLM engine step, always, even if there were none. Disabled by default. + +**Performance tuning knobs:** + +- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default +- ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default +- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.3`` by default +- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default +- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default +- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism + + - ``{phase}`` is either ``PROMPT`` or ``DECODE`` + - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK`` + - ``{param}`` is either ``MIN``, ``STEP`` or ``MAX`` + - Default values: + + - Prompt: + - batch size min (``VLLM_PROMPT_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_PROMPT_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_PROMPT_BS_BUCKET_MAX``): ``min(max_num_seqs, 64)`` + - sequence length min (``VLLM_PROMPT_SEQ_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_PROMPT_SEQ_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_PROMPT_SEQ_BUCKET_MAX``): ``max_model_len`` + + - Decode: + - batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``1`` + - batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)`` + - batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs`` + - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``block_size`` + - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``block_size`` + - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)`` + + +Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: + +- ``PT_HPU_LAZY_MODE``: if ``0``, PyTorch Eager backend for Gaudi will be used, if ``1`` PyTorch Lazy backend for Gaudi will be used, ``1`` is default +- ``PT_HPU_ENABLE_LAZY_COLLECTIVES``: required to be ``true`` for tensor parallel inference with HPU Graphs + +Troubleshooting: Tweaking HPU Graphs +------------------------------------ + +If you experience device out-of-memory issues or want to attempt +inference at higher batch sizes, try tweaking HPU Graphs by following +the below: + +- Tweak ``gpu_memory_utilization`` knob. It will decrease the + allocation of KV cache, leaving some headroom for capturing graphs + with larger batch size. By default ``gpu_memory_utilization`` is set + to 0.9. It attempts to allocate ~90% of HBM left for KV cache after + short profiling run. Note that decreasing reduces the number of KV + cache blocks you have available, and therefore reduces the effective + maximum number of tokens you can handle at a given time. + +- If this method is not efficient, you can disable ``HPUGraph`` + completely. With HPU Graphs disabled, you are trading latency and + throughput at lower batches for potentially higher throughput on + higher batches. You can do that by adding ``--enforce-eager`` flag to + server (for online inference), or by passing ``enforce_eager=True`` + argument to LLM constructor (for offline inference). diff --git a/vllm/docs/source/getting_started/installation.rst b/vllm/docs/source/getting_started/installation.rst index a706b285e..9b6cb0e80 100644 --- a/vllm/docs/source/getting_started/installation.rst +++ b/vllm/docs/source/getting_started/installation.rst @@ -10,7 +10,7 @@ Requirements ============ * OS: Linux -* Python: 3.8 - 3.12 +* Python: 3.9 -- 3.12 * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Install released versions @@ -21,7 +21,7 @@ You can install vLLM using pip: .. code-block:: console $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y + $ conda create -n myenv python=3.12 -y $ conda activate myenv $ # Install vLLM with CUDA 12.1. @@ -66,14 +66,14 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. Another way to access the latest code is to use the docker images: .. code-block:: console $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch - $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT} + $ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. @@ -89,45 +89,24 @@ Build from source Python-only build (without compilation) --------------------------------------- -If you only need to change Python code, you can simply build vLLM without compilation. - -The first step is to install the latest vLLM wheel: - -.. code-block:: console - - pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl - -You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. - -After verifying that the installation is successful, you can use `the following script `_: +If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag `_, changes you make to the code will be reflected when you run vLLM: .. code-block:: console $ git clone https://github.com/vllm-project/vllm.git $ cd vllm - $ python python_only_dev.py - -The script will: - -* Find the installed vLLM package in the current environment. -* Copy built files to the current directory. -* Rename the installed vLLM package. -* Symbolically link the current directory to the installed vLLM package. + $ VLLM_USE_PRECOMPILED=1 pip install --editable . -Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM. +This will download the latest nightly wheel and use the compiled libraries from there in the install. -Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script `_ with the ``--quit-dev`` (or ``-q`` for short) flag: +The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel `_: .. code-block:: console - $ python python_only_dev.py --quit-dev - -The ``--quit-dev`` flag will: - -* Remove the symbolic link from the current directory to the vLLM package. -* Restore the original vLLM package from the backup. + $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl + $ pip install --editable . -If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again. +You can find more information about vLLM's wheels `above <#install-the-latest-code>`_. .. note:: @@ -148,9 +127,13 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T .. tip:: Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. - For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . + + For example, you can install `ccache `_ using ``conda install ccache`` or ``apt install ccache`` . As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster. + `sccache `_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments. + The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``. + Use an existing PyTorch installation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -170,6 +153,18 @@ To build vLLM using an existing PyTorch installation: $ pip install -e . --no-build-isolation +Use the local cutlass for compilation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Currently, before starting the build process, vLLM fetches cutlass code from GitHub. However, there may be scenarios where you want to use a local version of cutlass instead. +To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. + +.. code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . + + Troubleshooting ~~~~~~~~~~~~~~~ @@ -181,8 +176,8 @@ to be run simultaneously, via the environment variable ``MAX_JOBS``. For example $ export MAX_JOBS=6 $ pip install -e . -This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. -A side effect is a much slower build process. +This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default `_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory. +A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. @@ -209,7 +204,7 @@ Here is a sanity check to verify that the CUDA Toolkit is correctly installed: Unsupported OS build -------------------- -vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. +vLLM can fully run only on Linux but for development purposes, you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems. Simply disable the ``VLLM_TARGET_DEVICE`` environment variable before installing: diff --git a/vllm/docs/source/getting_started/neuron-installation.rst b/vllm/docs/source/getting_started/neuron-installation.rst index ec99fc013..025ba6ef7 100644 --- a/vllm/docs/source/getting_started/neuron-installation.rst +++ b/vllm/docs/source/getting_started/neuron-installation.rst @@ -11,7 +11,7 @@ Requirements ------------ * OS: Linux -* Python: 3.8 -- 3.11 +* Python: 3.9 -- 3.11 * Accelerator: NeuronCore_v2 (in trn1/inf2 instances) * Pytorch 2.0.1/2.1.1 * AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) diff --git a/vllm/docs/source/getting_started/quickstart.rst b/vllm/docs/source/getting_started/quickstart.rst index f0e6cddf0..0c0491c86 100644 --- a/vllm/docs/source/getting_started/quickstart.rst +++ b/vllm/docs/source/getting_started/quickstart.rst @@ -12,7 +12,7 @@ This guide will help you quickly get started with vLLM to: Prerequisites -------------- - OS: Linux -- Python: 3.8 - 3.12 +- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Installation @@ -138,10 +138,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep A more detailed client example can be found `here `__. -OpenAI Chat API with vLLM -~~~~~~~~~~~~~~~~~~~~~~~~~~ +OpenAI Chat Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +vLLM is designed to also support the OpenAI Chat Completions API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. You can use the `create chat completion `_ endpoint to interact with the model: @@ -157,7 +157,7 @@ You can use the `create chat completion `_. +For more information on the TPU versions supported with vLLM, see: + +* `TPU v6e `_ +* `TPU v5e `_ +* `TPU v5p `_ +* `TPU v4 `_ + +These TPU versions allow you to configure the physical arrangements of the TPU +chips. This can improve throughput and networking performance. For more +information see: + +* `TPU v6e topologies `_ +* `TPU v5e topologies `_ +* `TPU v5p topologies `_ +* `TPU v4 topologies `_ + +In order for you to use Cloud TPUs you need to have TPU quota granted to your +Google Cloud Platform project. TPU quotas specify how many TPUs you can use in a +GPC project and are specified in terms of TPU version, the number of TPU you +want to use, and quota type. For more information, see `TPU quota `_. + +For TPU pricing information, see `Cloud TPU pricing `_. + +You may need additional persistent storage for your TPU VMs. For more +information, see `Storage options for Cloud TPU data `_. Requirements ------------ -* Google Cloud TPU VM (single & multi host) -* TPU versions: v5e, v5p, v4 -* Python: 3.10 - -Installation options: +* Google Cloud TPU VM +* TPU versions: v6e, v5e, v5p, v4 +* Python: 3.10 or newer -1. :ref:`Build a docker image with Dockerfile `. -2. :ref:`Build from source `. +Provision Cloud TPUs +==================== -.. _build_docker_tpu: +You can provision Cloud TPUs using the `Cloud TPU API `_ +or the `queued resources `_ +API. This section shows how to create TPUs using the queued resource API. For +more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. -Build a docker image with :code:`Dockerfile.tpu` ------------------------------------------------- +.. note:: + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. -`Dockerfile.tpu `_ is provided to build a docker image with TPU support. +Provision a Cloud TPU with the queued resource API +-------------------------------------------------- +Create a TPU v5e with 4 TPU chips: .. code-block:: console - $ docker build -f Dockerfile.tpu -t vllm-tpu . + gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ + --node-id TPU_NAME \ + --project PROJECT_ID \ + --zone ZONE \ + --accelerator-type ACCELERATOR_TYPE \ + --runtime-version RUNTIME_VERSION \ + --service-account SERVICE_ACCOUNT + +.. list-table:: Parameter descriptions + :header-rows: 1 -You can run the docker image with the following command: + * - Parameter name + - Description + * - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. + * - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. + * - PROJECT_ID + - Your Google Cloud project + * - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ + * - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. + * - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. + * - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` -.. code-block:: console +Connect to your TPU using SSH: - $ # Make sure to add `--privileged --net host --shm-size=16G`. - $ docker run --privileged --net host --shm-size=16G -it vllm-tpu +.. code-block:: bash + gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE -.. _build_from_source_tpu: +Install Miniconda -Build from source ------------------ +.. code-block:: bash -You can also build and install the TPU backend from source. + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + bash Miniconda3-latest-Linux-x86_64.sh + source ~/.bashrc -First, install the dependencies: +Create and activate a Conda environment for vLLM: -.. code-block:: console +.. code-block:: bash + + conda create -n vllm python=3.10 -y + conda activate vllm + +Clone the vLLM repository and go to the vLLM directory: + +.. code-block:: bash - $ # (Recommended) Create a new conda environment. - $ conda create -n myenv python=3.10 -y - $ conda activate myenv + git clone https://github.com/vllm-project/vllm.git && cd vllm - $ # Clean up the existing torch and torch-xla packages. - $ pip uninstall torch torch-xla -y +Uninstall the existing `torch` and `torch_xla` packages: - $ # Install PyTorch and PyTorch XLA. - $ export DATE="20240828" - $ export TORCH_VERSION="2.5.0" - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl +.. code-block:: bash - $ # Install JAX and Pallas. - $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html - $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + pip uninstall torch torch-xla -y - $ # Install other build dependencies. - $ pip install -r requirements-tpu.txt +Install build dependencies: +.. code-block:: bash -Next, build vLLM from source. This will only take a few seconds: + pip install -r requirements-tpu.txt + sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev + +Run the setup script: + +.. code-block:: bash + + VLLM_TARGET_DEVICE="tpu" python setup.py develop + + +Provision Cloud TPUs with GKE +----------------------------- + +For more information about using TPUs with GKE, see +https://cloud.google.com/kubernetes-engine/docs/how-to/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/tpus +https://cloud.google.com/kubernetes-engine/docs/concepts/plan-tpus + +.. _build_docker_tpu: + +Build a docker image with :code:`Dockerfile.tpu` +------------------------------------------------ + +You can use `Dockerfile.tpu `_ +to build a Docker image with TPU support. .. code-block:: console - $ VLLM_TARGET_DEVICE="tpu" python setup.py develop + $ docker build -f Dockerfile.tpu -t vllm-tpu . +Run the Docker image with the following command: -.. note:: +.. code-block:: console - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. - The compilation time may take 20~30 minutes in the first run. - However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + $ # Make sure to add `--privileged --net host --shm-size=16G`. + $ docker run --privileged --net host --shm-size=16G -it vllm-tpu + +.. note:: + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape. The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). .. tip:: @@ -90,10 +188,11 @@ Next, build vLLM from source. This will only take a few seconds: .. code-block:: console from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory - Please install OpenBLAS with the following command: + Install OpenBLAS with the following command: .. code-block:: console diff --git a/vllm/docs/source/getting_started/xpu-installation.rst b/vllm/docs/source/getting_started/xpu-installation.rst index 151ebb5f1..b1868acbc 100644 --- a/vllm/docs/source/getting_started/xpu-installation.rst +++ b/vllm/docs/source/getting_started/xpu-installation.rst @@ -60,3 +60,21 @@ Build from source - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. + +Distributed inference and serving +--------------------------------- + +XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: + +.. code-block:: console + + $ python -m vllm.entrypoints.openai.api_server \ + $ --model=facebook/opt-13b \ + $ --dtype=bfloat16 \ + $ --device=xpu \ + $ --max_model_len=1024 \ + $ --distributed-executor-backend=ray \ + $ --pipeline-parallel-size=2 \ + $ -tp=8 + +By default, a ray instance will be launched automatically if no existing one is detected in system, with ``num-gpus`` equals to ``parallel_config.world_size``. We recommend properly starting a ray cluster before execution, referring helper `script `_. diff --git a/vllm/docs/source/index.rst b/vllm/docs/source/index.rst index c328c049b..86b1eed2d 100644 --- a/vllm/docs/source/index.rst +++ b/vllm/docs/source/index.rst @@ -43,7 +43,7 @@ vLLM is flexible and easy to use with: * Tensor parallelism and pipeline parallelism support for distributed inference * Streaming outputs * OpenAI-compatible API server -* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. +* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators. * Prefix caching support * Multi-lora support @@ -66,6 +66,8 @@ Documentation getting_started/amd-installation getting_started/openvino-installation getting_started/cpu-installation + getting_started/gaudi-installation + getting_started/arm-installation getting_started/neuron-installation getting_started/tpu-installation getting_started/xpu-installation @@ -83,12 +85,8 @@ Documentation serving/deploying_with_nginx serving/distributed_serving serving/metrics - serving/env_vars - serving/usage_stats serving/integrations serving/tensorizer - serving/compatibility_matrix - serving/faq .. toctree:: :maxdepth: 1 @@ -97,11 +95,21 @@ Documentation models/supported_models models/adding_model models/enabling_multimodal_inputs - models/engine_args - models/lora - models/vlm - models/spec_decode - models/performance + +.. toctree:: + :maxdepth: 1 + :caption: Usage + + usage/lora + usage/multimodal_inputs + usage/structured_outputs + usage/spec_decode + usage/compatibility_matrix + usage/performance + usage/faq + usage/engine_args + usage/env_vars + usage/usage_stats .. toctree:: :maxdepth: 1 @@ -125,29 +133,52 @@ Documentation .. toctree:: :maxdepth: 1 - :caption: Performance benchmarks + :caption: Performance + + performance/benchmarks + +.. Community: User community resources + +.. toctree:: + :maxdepth: 1 + :caption: Community + + community/meetups + community/sponsors - performance_benchmark/benchmarks +.. API Documentation: API reference aimed at vllm library usage .. toctree:: :maxdepth: 2 - :caption: Developer Documentation + :caption: API Documentation dev/sampling_params + dev/pooling_params dev/offline_inference/offline_index dev/engine/engine_index - dev/kernel/paged_attention - dev/input_processing/model_inputs_index - dev/multimodal/multimodal_index - dev/dockerfile/dockerfile - dev/profiling/profiling_index + +.. Design: docs about vLLM internals .. toctree:: - :maxdepth: 1 - :caption: Community + :maxdepth: 2 + :caption: Design - community/meetups - community/sponsors + design/arch_overview + design/huggingface_integration + design/plugin_system + design/input_processing/model_inputs_index + design/kernel/paged_attention + design/multimodal/multimodal_index + +.. For Developers: contributing to the vLLM project + +.. toctree:: + :maxdepth: 2 + :caption: For Developers + + contributing/overview + contributing/profiling/profiling_index + contributing/dockerfile/dockerfile Indices and tables ================== diff --git a/vllm/docs/source/models/adding_model.rst b/vllm/docs/source/models/adding_model.rst index c6d88cc38..df06d736c 100644 --- a/vllm/docs/source/models/adding_model.rst +++ b/vllm/docs/source/models/adding_model.rst @@ -38,41 +38,70 @@ For instance, vLLM's `OPT model Union[Tuple, CausalLMOutputWithPast]: - + positions: torch.Tensor, - + kv_caches: List[torch.Tensor], - + attn_metadata: AttentionMetadata, - + ) -> Optional[SamplerOutput]: - -1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. -2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture. +To ensure compatibility with vLLM, your model must meet the following requirements: + +Initialization Code +^^^^^^^^^^^^^^^^^^^ + +All vLLM modules within the model must include a ``prefix`` argument in their constructor. This ``prefix`` is typically the full name of the module in the model's state dictionary and is crucial for: + +* Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +* Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the ``prefix`` during initialization, vLLM can match the current layer's ``prefix`` with the quantization configuration to determine if the layer should be initialized in quantized mode. + +The initialization code should look like this: + +.. code-block:: python + + from torch import nn + from vllm.config import VllmConfig + from vllm.attention import Attention + + class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + + class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + + class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + + class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") + +Computation Code +^^^^^^^^^^^^^^^^ + +Rewrite the :meth:`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat ``input_ids`` and ``positions`` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +.. code-block:: python + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + ... .. note:: Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +For reference, check out the `LLAMA model `__. vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the `vLLM models `__ directory for more examples. 3. (Optional) Implement tensor parallelism and quantization support ------------------------------------------------------------------- @@ -102,11 +131,11 @@ This method should load the weights from the HuggingFace's checkpoint file and a Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration --------------------------------------------- +-------------------------------- -We also provide a way to integrate a model without modifying the vLLM codebase. Step 2, 3, 4 are still required, but you can skip step 1 and 5. +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. -Just add the following lines in your code: +To register the model, use the following code: .. code-block:: python @@ -114,7 +143,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: .. code-block:: python @@ -123,19 +152,8 @@ If your model imports modules that initialize CUDA, consider instead lazy-import ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") .. important:: - If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that :ref:`here `. -If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - - if __name__ == '__main__': - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') - -Save the above code in a file and run it with :code:`python your_file.py `. +.. note:: + Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. diff --git a/vllm/docs/source/models/enabling_multimodal_inputs.rst b/vllm/docs/source/models/enabling_multimodal_inputs.rst index 3d0d1aec6..5c1236e1a 100644 --- a/vllm/docs/source/models/enabling_multimodal_inputs.rst +++ b/vllm/docs/source/models/enabling_multimodal_inputs.rst @@ -3,7 +3,7 @@ Enabling Multimodal Inputs ========================== -This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal ` inputs. +This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal inputs `. .. seealso:: :ref:`adding_a_new_model` @@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i 3. Register maximum number of multi-modal tokens ------------------------------------------------ -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. .. code-block:: diff diff --git a/vllm/docs/source/models/supported_models.rst b/vllm/docs/source/models/supported_models.rst index ff893b613..5b416e04d 100644 --- a/vllm/docs/source/models/supported_models.rst +++ b/vllm/docs/source/models/supported_models.rst @@ -139,6 +139,11 @@ Text Generation - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. - ✅︎ - ✅︎ + * - :code:`GlmForCausalLM` + - GLM-4 + - :code:`THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ * - :code:`GPT2LMHeadModel` - GPT-2 - :code:`gpt2`, :code:`gpt2-xl`, etc. @@ -160,13 +165,13 @@ Text Generation - - ✅︎ * - :code:`GraniteForCausalLM` - - PowerLM - - :code:`ibm/PowerLM-3b` etc. + - Granite 3.0, PowerLM + - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.0-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - ✅︎ - ✅︎ * - :code:`GraniteMoeForCausalLM` - - PowerMoE - - :code:`ibm/PowerMoE-3b` etc. + - Granite 3.0 MoE, PowerMoE + - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - ✅︎ - ✅︎ * - :code:`InternLMForCausalLM` @@ -177,7 +182,7 @@ Text Generation * - :code:`InternLM2ForCausalLM` - InternLM2 - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - + - ✅︎ - ✅︎ * - :code:`JAISLMHeadModel` - Jais @@ -234,6 +239,11 @@ Text Generation - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - ✅︎ + * - :code:`OLMo2ForCausalLM` + - OLMo2 + - :code:`allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ * - :code:`OLMoEForCausalLM` - OLMoE - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. @@ -277,11 +287,11 @@ Text Generation * - :code:`QWenLMHeadModel` - Qwen - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2ForCausalLM` - Qwen2 - - :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc. + - :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ * - :code:`Qwen2MoeForCausalLM` @@ -304,6 +314,11 @@ Text Generation - :code:`upstage/solar-pro-preview-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`TeleChat2ForCausalLM` + - TeleChat2 + - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ * - :code:`XverseForCausalLM` - XVERSE - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. @@ -325,21 +340,55 @@ Text Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`BertModel` + - BERT-based + - :code:`BAAI/bge-base-en-v1.5`, etc. + - + - * - :code:`Gemma2Model` - Gemma2-based - :code:`BAAI/bge-multilingual-gemma2`, etc. - - ✅︎ - * - :code:`MistralModel` - - Mistral-based + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. + - Llama-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - - ✅︎ + - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - ✅︎ + - ✅︎ + * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` + - RoBERTa-based + - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. + - + - + * - :code:`XLMRobertaModel` + - XLM-RoBERTa-based + - :code:`intfloat/multilingual-e5-large`, etc. + - + - .. important:: Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + +.. note:: + :code:`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. + You should manually set mean pooling by passing :code:`--override-pooler-config '{"pooling_type": "MEAN"}'`. + +.. note:: + Unlike base Qwen2, :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. + You can set :code:`--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. + + On the other hand, its 1.5B variant (:code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`) uses causal attention + despite being described otherwise on its model card. + Reward Modeling --------------- @@ -352,14 +401,23 @@ Reward Modeling - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlamaForCausalLM` + - Llama-based + - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. + - ✅︎ + - ✅︎ * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - - ✅︎ + - ✅︎ + +.. important:: + For process-supervised reward models such as :code:`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, + e.g.: :code:`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. .. note:: - As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. + As an interim measure, these models are supported in both offline and online inference via Embeddings API. Classification --------------- @@ -376,12 +434,44 @@ Classification * - :code:`Qwen2ForSequenceClassification` - Qwen2-based - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - + - ✅︎ - ✅︎ .. note:: - As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + As an interim measure, these models are supported in both offline and online inference via Embeddings API. +Sentence Pair Scoring +--------------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`BertForSequenceClassification` + - BERT-based + - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + - + - + * - :code:`RobertaForSequenceClassification` + - RoBERTa-based + - :code:`cross-encoder/quora-roberta-base`, etc. + - + - + * - :code:`XLMRobertaForSequenceClassification` + - XLM-RoBERTa-based + - :code:`BAAI/bge-reranker-v2-m3`, etc. + - + - + +.. note:: + These models are supported in both offline and online inference via Score API. + +.. _supported_mm_models: Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -401,8 +491,6 @@ On the other hand, modalities separated by :code:`/` are mutually exclusive. - e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. -.. _supported_vlms: - Text Generation --------------- @@ -416,6 +504,12 @@ Text Generation - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`AriaForConditionalGeneration` + - Aria + - T + I + - :code:`rhymes-ai/Aria` + - + - ✅︎ * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - T + I\ :sup:`E` @@ -438,8 +532,20 @@ Text Generation - GLM-4V - T + I - :code:`THUDM/glm-4v-9b` etc. + - ✅︎ + - ✅︎ + * - :code:`H2OVLChatModel` + - H2OVL + - T + I\ :sup:`E+` + - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - ✅︎ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` @@ -466,7 +572,7 @@ Text Generation - ✅︎ * - :code:`LlavaOnevisionForConditionalGeneration` - LLaVA-Onevision - - T + I\ :sup:`+` + V + - T + I\ :sup:`+` + V\ :sup:`+` - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ @@ -478,7 +584,7 @@ Text Generation - ✅︎ * - :code:`MllamaForConditionalGeneration` - Llama 3.2 - - T + I + - T + I\ :sup:`+` - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - @@ -516,7 +622,7 @@ Text Generation - Qwen-VL - T + I\ :sup:`E+` - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - + - ✅︎ - ✅︎ * - :code:`Qwen2AudioForConditionalGeneration` - Qwen2-Audio @@ -526,9 +632,9 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - + - ✅︎ - ✅︎ * - :code:`UltravoxModel` - Ultravox @@ -540,8 +646,26 @@ Text Generation | :sup:`E` Pre-computed embeddings can be inputted for this modality. | :sup:`+` Multiple items can be inputted per text prompt for this modality. +.. important:: + To enable multiple multi-modal items per text prompt, you have to set :code:`limit_mm_per_prompt` (offline inference) + or :code:`--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + + .. code-block:: python + + llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, + ) + + .. code-block:: bash + + vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 + +.. note:: + vLLM currently only supports adding LoRA to the language backbone of multimodal models. + .. note:: - For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. + The official :code:`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 Multimodal Embedding @@ -569,11 +693,20 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + Model Support Policy ===================== @@ -583,6 +716,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. +.. tip:: + When comparing the output of :code:`model.generate` from HuggingFace Transformers with the output of :code:`llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., `generation_config.json `__) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. diff --git a/vllm/docs/source/models/vlm.rst b/vllm/docs/source/models/vlm.rst deleted file mode 100644 index a47902ab4..000000000 --- a/vllm/docs/source/models/vlm.rst +++ /dev/null @@ -1,255 +0,0 @@ -.. _vlm: - -Using VLMs -========== - -vLLM provides experimental support for Vision Language Models (VLMs). See the :ref:`list of supported VLMs here `. -This document shows you how to run and serve these models using vLLM. - -.. note:: - We are actively iterating on VLM support. See `this RFC `_ for upcoming changes, - and `open an issue on GitHub `_ if you have any feedback or feature requests. - -Offline Inference ------------------ - -Single-image input -^^^^^^^^^^^^^^^^^^ - -The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. - -.. code-block:: python - - llm = LLM(model="llava-hf/llava-1.5-7b-hf") - -To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: - -* ``prompt``: The prompt should follow the format that is documented on HuggingFace. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. - -.. code-block:: python - - # Refer to the HuggingFace repo for the correct format to use - prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" - - # Load the image using PIL.Image - image = PIL.Image.open(...) - - # Single prompt inference - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input - image_embeds = torch.load(...) # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": {"image": image_embeds}, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Inference with image embeddings as input with additional parameters - # Specifically, we are conducting a trial run of Qwen2VL and MiniCPM-V with the new input format, which utilizes additional parameters. - mm_data = {} - - image_embeds = torch.load(...) # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) - # For Qwen2VL, image_grid_thw is needed to calculate positional encoding. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_grid_thw": torch.load(...) # torch.Tensor of shape (1, 3), - } - # For MiniCPM-V, image_size_list is needed to calculate details of the sliced image. - mm_data['image'] = { - "image_embeds": image_embeds, - "image_size_list": [image.size] # list of image sizes - } - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": mm_data, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - # Batch inference - image_1 = PIL.Image.open(...) - image_2 = PIL.Image.open(...) - outputs = llm.generate( - [ - { - "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_1}, - }, - { - "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", - "multi_modal_data": {"image": image_2}, - } - ] - ) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language.py `_. - -Multi-image input -^^^^^^^^^^^^^^^^^ - -Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. - -To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. - -.. code-block:: python - - llm = LLM( - model="microsoft/Phi-3.5-vision-instruct", - trust_remote_code=True, # Required to load Phi-3.5-vision - max_model_len=4096, # Otherwise, it may not fit in smaller GPUs - limit_mm_per_prompt={"image": 2}, # The maximum number to accept - ) - -Instead of passing in a single image, you can pass in a list of images. - -.. code-block:: python - - # Refer to the HuggingFace repo for the correct format to use - prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" - - # Load the images using PIL.Image - image1 = PIL.Image.open(...) - image2 = PIL.Image.open(...) - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": { - "image": [image1, image2] - }, - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. - -Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: - -.. code-block:: python - - # Specify the maximum number of frames per video to be 4. This can be changed. - llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) - - # Create the request payload. - video_frames = ... # load your video making sure it only has the number of frames specified earlier. - message = { - "role": "user", - "content": [ - {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, - ], - } - for i in range(len(video_frames)): - base64_image = encode_image(video_frames[i]) # base64 encoding. - new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} - message["content"].append(new_image) - - # Perform inference and log output. - outputs = llm.chat([message]) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - -Online Inference ----------------- - -OpenAI Vision API -^^^^^^^^^^^^^^^^^ - -You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. - -Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. - -.. code-block:: bash - - vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ - --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 - -.. important:: - Since OpenAI Vision API is based on `Chat Completions `_ API, - a chat template is **required** to launch the API server. - - Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. - The chat template can be inferred based on the documentation on the model's HuggingFace repo. - For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. - -To consume the server, you can use the OpenAI client like in the example below: - -.. code-block:: python - - from openai import OpenAI - - openai_api_key = "EMPTY" - openai_api_base = "http://localhost:8000/v1" - - client = OpenAI( - api_key=openai_api_key, - base_url=openai_api_base, - ) - - # Single-image input inference - image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - # NOTE: The prompt formatting with the image token `` is not needed - # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What’s in this image?"}, - {"type": "image_url", "image_url": {"url": image_url}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - - # Multi-image input inference - image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" - image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" - - chat_response = client.chat.completions.create( - model="microsoft/Phi-3.5-vision-instruct", - messages=[{ - "role": "user", - "content": [ - {"type": "text", "text": "What are the animals in these images?"}, - {"type": "image_url", "image_url": {"url": image_url_duck}}, - {"type": "image_url", "image_url": {"url": image_url_lion}}, - ], - }], - ) - print("Chat completion output:", chat_response.choices[0].message.content) - - -A full code example can be found in `examples/openai_api_client_for_multimodal.py `_. - -.. note:: - - By default, the timeout for fetching images through http url is ``5`` seconds. You can override this by setting the environment variable: - - .. code-block:: console - - $ export VLLM_IMAGE_FETCH_TIMEOUT= - -.. note:: - There is no need to format the prompt in the API request since it will be handled by the server. diff --git a/vllm/docs/source/performance/benchmarks.rst b/vllm/docs/source/performance/benchmarks.rst new file mode 100644 index 000000000..6d4d7b544 --- /dev/null +++ b/vllm/docs/source/performance/benchmarks.rst @@ -0,0 +1,33 @@ +.. _benchmarks: + +================ +Benchmark Suites +================ + +vLLM contains two sets of benchmarks: + ++ :ref:`Performance benchmarks ` ++ :ref:`Nightly benchmarks ` + + +.. _performance_benchmarks: + +Performance Benchmarks +---------------------- + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public `vLLM Performance Dashboard `_. + +More information on the performance benchmarks and their parameters can be found `here `__. + +.. _nightly_benchmarks: + +Nightly Benchmarks +------------------ + +These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. + +The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. + +More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/vllm/docs/source/performance_benchmark/benchmarks.rst b/vllm/docs/source/performance_benchmark/benchmarks.rst deleted file mode 100644 index e5c8d6a55..000000000 --- a/vllm/docs/source/performance_benchmark/benchmarks.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. _benchmarks: - -Benchmark suites of vLLM -======================== - - - -vLLM contains two sets of benchmarks: - -+ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. - -+ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. - - -Trigger a benchmark -------------------- - -The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. - - -.. note:: - - Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. diff --git a/vllm/docs/source/quantization/fp8_e5m2_kvcache.rst b/vllm/docs/source/quantization/fp8_e5m2_kvcache.rst index 9ae07bcd3..b2d824427 100644 --- a/vllm/docs/source/quantization/fp8_e5m2_kvcache.rst +++ b/vllm/docs/source/quantization/fp8_e5m2_kvcache.rst @@ -4,7 +4,7 @@ FP8 E5M2 KV Cache ================== The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bflaot16 and fp8 to each other. +The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. Here is an example of how to enable this feature: diff --git a/vllm/docs/source/quantization/supported_hardware.rst b/vllm/docs/source/quantization/supported_hardware.rst index 9bf0cdb80..09f8e7112 100644 --- a/vllm/docs/source/quantization/supported_hardware.rst +++ b/vllm/docs/source/quantization/supported_hardware.rst @@ -27,7 +27,7 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✅︎ - ✗ - - ✗ + - ✅︎ - ✅︎ - ✗ - ✗ @@ -38,8 +38,8 @@ The table below shows the compatibility of various quantization implementations - ✅︎ - ✅︎ - ✗ - - ✗ - - ✗ + - ✅︎ + - ✅︎ - ✗ - ✗ * - Marlin (GPTQ/AWQ/FP8) @@ -129,4 +129,4 @@ Notes: Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. -For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. \ No newline at end of file +For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory `_ or consult with the vLLM development team. diff --git a/vllm/docs/source/serving/distributed_serving.rst b/vllm/docs/source/serving/distributed_serving.rst index fcb2646df..4d57206e5 100644 --- a/vllm/docs/source/serving/distributed_serving.rst +++ b/vllm/docs/source/serving/distributed_serving.rst @@ -22,7 +22,7 @@ After adding enough GPUs and nodes to hold the model, you can run vLLM first, wh Details for Distributed Inference and Serving ---------------------------------------------- -vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We also support pipeline parallel as a beta feature for online serving. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. +vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with either `Ray `_ or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured :code:`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the :code:`LLM` class :code:`distributed-executor-backend` argument or :code:`--distributed-executor-backend` API server argument. Set it to :code:`mp` for multiprocessing or :code:`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. @@ -49,9 +49,6 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip $ --tensor-parallel-size 4 \ $ --pipeline-parallel-size 2 -.. note:: - Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models. - Multi-Node Inference and Serving -------------------------------- diff --git a/vllm/docs/source/serving/integrations.rst b/vllm/docs/source/serving/integrations.rst index 7882e14f3..f39997e0e 100644 --- a/vllm/docs/source/serving/integrations.rst +++ b/vllm/docs/source/serving/integrations.rst @@ -13,3 +13,4 @@ Integrations deploying_with_dstack serving_with_langchain serving_with_llamaindex + serving_with_llamastack diff --git a/vllm/docs/source/serving/metrics.rst b/vllm/docs/source/serving/metrics.rst index 15e57bd3f..231111cd7 100644 --- a/vllm/docs/source/serving/metrics.rst +++ b/vllm/docs/source/serving/metrics.rst @@ -2,9 +2,34 @@ Production Metrics ================== vLLM exposes a number of metrics that can be used to monitor the health of the -system. These metrics are exposed via the `/metrics` endpoint on the vLLM +system. These metrics are exposed via the ``/metrics`` endpoint on the vLLM OpenAI compatible API server. +You can start the server using Python, or using [Docker](deploying_with_docker.rst): + +.. code-block:: console + + $ vllm serve unsloth/Llama-3.2-1B-Instruct + +Then query the endpoint to get the latest metrics from the server: + +.. code-block:: console + + $ curl http://0.0.0.0:8000/metrics + + # HELP vllm:iteration_tokens_total Histogram of number of tokens per engine_step. + # TYPE vllm:iteration_tokens_total histogram + vllm:iteration_tokens_total_sum{model_name="unsloth/Llama-3.2-1B-Instruct"} 0.0 + vllm:iteration_tokens_total_bucket{le="1.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="8.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="16.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="32.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="64.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="128.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="256.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-Instruct"} 3.0 + ... + The following metrics are exposed: .. literalinclude:: ../../../vllm/engine/metrics.py diff --git a/vllm/docs/source/serving/openai_compatible_server.md b/vllm/docs/source/serving/openai_compatible_server.md index 413c87ab2..d75e90807 100644 --- a/vllm/docs/source/serving/openai_compatible_server.md +++ b/vllm/docs/source/serving/openai_compatible_server.md @@ -26,13 +26,168 @@ print(completion.choices[0].message) ``` ## API Reference -Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-reference) for more information on the API. We support all parameters except: -- Chat: `tools`, and `tool_choice`. -- Completions: `suffix`. -vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst). +We currently support the following OpenAI APIs: + +- [Completions API](https://platform.openai.com/docs/api-reference/completions) + - *Note: `suffix` parameter is not supported.* +- [Chat Completions API](https://platform.openai.com/docs/api-reference/chat) + - [Vision](https://platform.openai.com/docs/guides/vision)-related parameters are supported; see [Multimodal Inputs](../usage/multimodal_inputs.rst). + - *Note: `image_url.detail` parameter is not supported.* + - We also support `audio_url` content type for audio files. + - Refer to [vllm.entrypoints.chat_utils](https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/chat_utils.py) for the exact schema. + - *TODO: Support `input_audio` content type as defined [here](https://github.com/openai/openai-python/blob/v1.52.2/src/openai/types/chat/chat_completion_content_part_input_audio_param.py).* + - *Note: `parallel_tool_calls` and `user` parameters are ignored.* +- [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings) + - Instead of `inputs`, you can pass in a list of `messages` (same schema as Chat Completions API), + which will be treated as a single prompt to the model according to its chat template. + - This enables multi-modal inputs to be passed to embedding models, see [this page](../usage/multimodal_inputs.rst) for details. + - *Note: You should run `vllm serve` with `--task embedding` to ensure that the model is being run in embedding mode.* + +## Score API for Cross Encoder Models + +vLLM supports *cross encoders models* at the **/v1/score** endpoint, which is not an OpenAI API standard endpoint. You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +A ***Cross Encoder*** takes exactly two sentences / texts as input and either predicts a score or label for this sentence pair. It can for example predict the similarity of the sentence pair on a scale of 0 … 1. + +### Example of usage for a pair of a string and a list of texts + +In this case, the model will compare the first given text to each of the texts containing the list. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "text_1": "What is the capital of France?", + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693570, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 0.001094818115234375 + ] + }, + { + "index": 1, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` + +### Example of usage for a pair of two lists of texts + +In this case, the model will compare the one by one, making pairs by same index correspondent in each list. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": [ + "What is the capital of Brazil?", + "What is the capital of France?" + ], + "text_2": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris." + ] +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 1 + ] + }, + { + "index": 1, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` + +### Example of usage for a pair of two strings + +In this case, the model will compare the strings of texts. + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/score' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-v2-m3", + "encoding_format": "float", + "text_1": "What is the capital of France?", + "text_2": "The capital of France is Paris." +}' +``` + +Response: + +```bash +{ + "id": "score-request-id", + "object": "list", + "created": 693447, + "model": "BAAI/bge-reranker-v2-m3", + "data": [ + { + "index": 0, + "object": "score", + "score": [ + 1 + ] + } + ], + "usage": {} +} +``` ## Extra Parameters + vLLM supports a set of parameters that are not part of the OpenAI API. In order to use them, you can pass them as extra parameters in the OpenAI client. Or directly merge them into the JSON payload if you are using HTTP call directly. @@ -49,7 +204,52 @@ completion = client.chat.completions.create( ) ``` -### Extra Parameters for Chat API +### Extra HTTP Headers + +Only `X-Request-Id` HTTP request header is supported for now. + +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } +) +print(completion._request_id) + +completion = client.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } +) +print(completion._request_id) +``` + +### Extra Parameters for Completions API + +The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-sampling-params +:end-before: end-completion-sampling-params +``` + +The following extra parameters are supported: + +```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-completion-extra-params +:end-before: end-completion-extra-params +``` + +### Extra Parameters for Chat Completions API + The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py @@ -66,21 +266,22 @@ The following extra parameters are supported: :end-before: end-chat-completion-extra-params ``` -### Extra Parameters for Completions API -The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. +### Extra Parameters for Embeddings API + +The following [pooling parameters (click through to see documentation)](../dev/pooling_params.rst) are supported. ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-sampling-params -:end-before: end-completion-sampling-params +:start-after: begin-embedding-pooling-params +:end-before: end-embedding-pooling-params ``` The following extra parameters are supported: ```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python -:start-after: begin-completion-extra-params -:end-before: end-completion-extra-params +:start-after: begin-embedding-extra-params +:end-before: end-embedding-extra-params ``` ## Chat Template @@ -103,7 +304,7 @@ vllm serve --chat-template ./path-to-chat-template.jinja vLLM community provides a set of chat templates for popular models. You can find them in the examples directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) -With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: ```python completion = client.chat.completions.create( @@ -113,12 +314,20 @@ completion = client.chat.completions.create( ] ) ``` -Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like -`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which -format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify -between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match -this, unless explicitly specified. +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the +request. vLLM provides best-effort support to detect this automatically, which is logged as a string like +*"Detected the chat template content format to be..."*, and internally converts incoming requests to match +the detected format, which can be one of: + +- `"string"`: A string. + - Example: `"Hello world"` +- `"openai"`: A list of dictionaries, similar to OpenAI schema. + - Example: `[{"type": "text", "text": "Hello world!"}]` + +If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument +to override which format to use. ## Command line arguments for the server @@ -127,20 +336,13 @@ this, unless explicitly specified. :func: create_parser_for_docs :prog: vllm serve ``` -## Tool Calling in the Chat Completion API -### Named Function Calling -vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is -enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a -high-quality one. -To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and -specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Config file The `serve` module can also accept arguments from a config file in -`yaml` format. The arguments in the yaml must be specified using the -long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): +`yaml` format. The arguments in the yaml must be specified using the +long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): For example: @@ -156,56 +358,70 @@ uvicorn-log-level: "info" $ vllm serve SOME_MODEL --config config.yaml ``` --- -**NOTE** +**NOTE** In case an argument is supplied simultaneously using command line and the config file, the value from the commandline will take precedence. The order of priorities is `command line > config file values > defaults`. --- ## Tool calling in the chat completion API -vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. +vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but on the roadmap. It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. +Please see below for recommended configuration and chat templates to use when function calling is to be used with the different models. + + +### Named Function Calling +vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. + ### Automatic Function Calling To enable this feature, you should set the following flags: -* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. -* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages -that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their -`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat +* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their +`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) -If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! +If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + #### Hermes Models (`hermes`) + All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` -_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge -step in their creation_. +_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge +step in their creation_. Flags: `--tool-call-parser hermes` + #### Mistral Models (`mistral`) + Supported models: * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. Known issues: -1. Mistral 7B struggles to generate parallel tool calls correctly. -2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is -much shorter than what vLLM generates. Since an exception is thrown when this condition +1. Mistral 7B struggles to generate parallel tool calls correctly. +2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is +much shorter than what vLLM generates. Since an exception is thrown when this condition is not met, the following additional chat templates are provided: * `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that @@ -216,18 +432,20 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + #### Llama Models (`llama3_json`) + Supported models: * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported. +1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -236,7 +454,24 @@ it works better with vLLM. Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` +#### IBM Granite + +Supported models: +* `ibm-granite/granite-3.0-8b-instruct` + +Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` + +`examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. + +* `ibm-granite/granite-20b-functioncalling` + +Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + + #### InternLM Models (`internlm`) + Supported models: * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well @@ -246,6 +481,7 @@ Known issues: Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` + #### Jamba Models (`jamba`) AI21's Jamba-1.5 models are supported. * `ai21labs/AI21-Jamba-1.5-Mini` @@ -255,6 +491,34 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` +#### Models with Pythonic Tool Calls (`pythonic`) + +A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. + +As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: +```python +[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] +``` + +Limitations: +* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +* Llama's smaller models struggle to use tools effectively. + +Example supported models: +* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) + +Flags: `--tool-call-parser pythonic --chat-template {see_above}` + +--- +**WARNING** +Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary. + +--- + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -312,5 +576,5 @@ Then you can use this plugin in the command line like this. --tool-parser-plugin --tool-call-parser example \ --chat-template \ -``` +``` diff --git a/vllm/docs/source/serving/run_on_sky.rst b/vllm/docs/source/serving/run_on_sky.rst index 674b14a87..227e6fd2a 100644 --- a/vllm/docs/source/serving/run_on_sky.rst +++ b/vllm/docs/source/serving/run_on_sky.rst @@ -109,7 +109,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 .. raw:: html @@ -129,7 +129,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. @@ -255,7 +255,7 @@ This will scale the service up to when the QPS exceeds 2 for each replica. messages: - role: user content: Hello! What is your name? - max_tokens: 1 + max_completion_tokens: 1 resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. diff --git a/vllm/docs/source/serving/serving_with_llamastack.rst b/vllm/docs/source/serving/serving_with_llamastack.rst new file mode 100644 index 000000000..8ef96c4e5 --- /dev/null +++ b/vllm/docs/source/serving/serving_with_llamastack.rst @@ -0,0 +1,42 @@ +.. _run_on_llamastack: + +Serving with Llama Stack +============================ + +vLLM is also available via `Llama Stack `_ . + +To install Llama Stack, run + +.. code-block:: console + + $ pip install llama-stack -q + +Inference using OpenAI Compatible API +------------------------------------- + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +.. code-block:: yaml + + inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 + +Please refer to `this guide `_ for more details on this remote vLLM provider. + +Inference via Embedded vLLM +--------------------------- + +An `inline vLLM provider +`_ +is also available. This is a sample of configuration using that method: + +.. code-block:: yaml + + inference + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 diff --git a/vllm/docs/source/serving/compatibility_matrix.rst b/vllm/docs/source/usage/compatibility_matrix.rst similarity index 90% rename from vllm/docs/source/serving/compatibility_matrix.rst rename to vllm/docs/source/usage/compatibility_matrix.rst index cac0605ca..a93632ff3 100644 --- a/vllm/docs/source/serving/compatibility_matrix.rst +++ b/vllm/docs/source/usage/compatibility_matrix.rst @@ -39,12 +39,13 @@ Feature x Feature - :abbr:`prmpt adptr (Prompt Adapter)` - :ref:`SD ` - CUDA graph + - :abbr:`emd (Embedding Models)` - :abbr:`enc-dec (Encoder-Decoder Models)` - :abbr:`logP (Logprobs)` - :abbr:`prmpt logP (Prompt Logprobs)` - :abbr:`async output (Async Output Processing)` - multi-step - - :abbr:`MM (Multimodal)` + - :abbr:`mm (Multimodal)` - best-of - beam-search - :abbr:`guided dec (Guided Decoding)` @@ -64,6 +65,7 @@ Feature x Feature - - - + - * - :ref:`APC ` - ✅ - @@ -80,6 +82,7 @@ Feature x Feature - - - + - * - :ref:`LoRA ` - `✗ `__ - ✅ @@ -96,6 +99,7 @@ Feature x Feature - - - + - * - :abbr:`prmpt adptr (Prompt Adapter)` - ✅ - ✅ @@ -112,8 +116,9 @@ Feature x Feature - - - + - * - :ref:`SD ` - - ✗ + - ✅ - ✅ - ✗ - ✅ @@ -128,6 +133,7 @@ Feature x Feature - - - + - * - CUDA graph - ✅ - ✅ @@ -144,6 +150,24 @@ Feature x Feature - - - + - + * - :abbr:`emd (Embedding Models)` + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ + - + - + - + - + - + - + - + - + - + - * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✗ - `✗ `__ @@ -151,6 +175,7 @@ Feature x Feature - ✗ - `✗ `__ - ✅ + - ✅ - - - @@ -166,7 +191,8 @@ Feature x Feature - ✅ - ✅ - ✅ - - ✅ + - ✅ + - ✗ - ✅ - - @@ -183,7 +209,8 @@ Feature x Feature - ✅ - `✗ `__ - ✅ - - ✅ + - ✗ + - ✅ - ✅ - - @@ -199,6 +226,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - ✅ @@ -215,6 +243,7 @@ Feature x Feature - ✅ - ✗ - ✅ + - ✗ - ✗ - ✅ - `✗ `__ @@ -224,14 +253,15 @@ Feature x Feature - - - - * - :abbr:`MM (Multimodal)` - - `✗ `__ + * - :abbr:`mm (Multimodal)` + - ✅ - `✗ `__ - `✗ `__ - ? - ? - ✅ - - ✗ + - ✅ + - ✅ - ✅ - ✅ - ✅ @@ -247,6 +277,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -263,6 +294,7 @@ Feature x Feature - ✅ - `✗ `__ - ✅ + - ✗ - ✅ - ✅ - ✅ @@ -279,11 +311,12 @@ Feature x Feature - ? - ✅ - ✅ + - ✗ - ? - ✅ - ✅ - ✅ - - ✗ + - `✗ `__ - ? - ✅ - ✅ @@ -311,7 +344,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ✅ - ✅ * - :ref:`APC ` - `✗ `__ @@ -319,7 +352,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - ✗ + - ✅ - ✅ * - :ref:`LoRA ` - ✅ @@ -353,15 +386,23 @@ Feature x Hardware - ✅ - ✗ - ✅ + * - :abbr:`emd (Embedding Models)` + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ✅ + - ? * - :abbr:`enc-dec (Encoder-Decoder Models)` - ✅ - ✅ - ✅ - ✅ - ✅ - - `✗ `__ + - ✅ - ✗ - * - :abbr:`logP (Logprobs)` + * - :abbr:`mm (Multimodal)` - ✅ - ✅ - ✅ @@ -369,7 +410,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - * - :abbr:`prmpt logP (Prompt Logprobs)` + * - :abbr:`logP (Logprobs)` - ✅ - ✅ - ✅ @@ -377,29 +418,29 @@ Feature x Hardware - ✅ - ✅ - ✅ - * - :abbr:`async output (Async Output Processing)` + * - :abbr:`prmpt logP (Prompt Logprobs)` - ✅ - ✅ - ✅ - ✅ - ✅ - - ✗ - - ✗ - * - multi-step - ✅ - ✅ + * - :abbr:`async output (Async Output Processing)` - ✅ - ✅ - ✅ - - `✗ `__ - ✅ - * - :abbr:`MM (Multimodal)` - ✅ + - ✗ + - ✗ + * - multi-step - ✅ - ✅ - ✅ - ✅ - ✅ + - `✗ `__ - ✅ * - best-of - ✅ diff --git a/vllm/docs/source/models/engine_args.rst b/vllm/docs/source/usage/engine_args.rst similarity index 100% rename from vllm/docs/source/models/engine_args.rst rename to vllm/docs/source/usage/engine_args.rst diff --git a/vllm/docs/source/serving/env_vars.rst b/vllm/docs/source/usage/env_vars.rst similarity index 100% rename from vllm/docs/source/serving/env_vars.rst rename to vllm/docs/source/usage/env_vars.rst diff --git a/vllm/docs/source/serving/faq.rst b/vllm/docs/source/usage/faq.rst similarity index 99% rename from vllm/docs/source/serving/faq.rst rename to vllm/docs/source/usage/faq.rst index 9e858e612..ce327abd5 100644 --- a/vllm/docs/source/serving/faq.rst +++ b/vllm/docs/source/usage/faq.rst @@ -1,3 +1,5 @@ +.. _faq: + Frequently Asked Questions =========================== diff --git a/vllm/docs/source/models/lora.rst b/vllm/docs/source/usage/lora.rst similarity index 99% rename from vllm/docs/source/models/lora.rst rename to vllm/docs/source/usage/lora.rst index ef0177eaf..c2c6fa2ae 100644 --- a/vllm/docs/source/models/lora.rst +++ b/vllm/docs/source/usage/lora.rst @@ -1,7 +1,7 @@ .. _lora: -Using LoRA adapters -=================== +LoRA Adapters +============= This document shows you how to use `LoRA adapters `_ with vLLM on top of a base model. diff --git a/vllm/docs/source/usage/multimodal_inputs.rst b/vllm/docs/source/usage/multimodal_inputs.rst new file mode 100644 index 000000000..c93f65327 --- /dev/null +++ b/vllm/docs/source/usage/multimodal_inputs.rst @@ -0,0 +1,404 @@ +.. _multimodal_inputs: + +Multimodal Inputs +================= + +This page teaches you how to pass multi-modal inputs to :ref:`multi-modal models ` in vLLM. + +.. note:: + We are actively iterating on multi-modal support. See `this RFC `_ for upcoming changes, + and `open an issue on GitHub `_ if you have any feedback or feature requests. + +Offline Inference +----------------- + +To input multi-modal data, follow this schema in :class:`vllm.inputs.PromptType`: + +* ``prompt``: The prompt should follow the format that is documented on HuggingFace. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. + +Image +^^^^^ + +You can pass a single image to the :code:`'image'` field of the multi-modal dictionary, as shown in the following examples: + +.. code-block:: python + + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Load the image using PIL.Image + image = PIL.Image.open(...) + + # Single prompt inference + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + # Batch inference + image_1 = PIL.Image.open(...) + image_2 = PIL.Image.open(...) + outputs = llm.generate( + [ + { + "prompt": "USER: \nWhat is the content of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_1}, + }, + { + "prompt": "USER: \nWhat's the color of this image?\nASSISTANT:", + "multi_modal_data": {"image": image_2}, + } + ] + ) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language.py `_. + +To substitute multiple images inside the same text prompt, you can pass in a list of images instead: + +.. code-block:: python + + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept + ) + + # Refer to the HuggingFace repo for the correct format to use + prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + + # Load the images using PIL.Image + image1 = PIL.Image.open(...) + image2 = PIL.Image.open(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. + +Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL `_ as it supports videos: + +.. code-block:: python + + # Specify the maximum number of frames per video to be 4. This can be changed. + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + + # Create the request payload. + video_frames = ... # load your video making sure it only has the number of frames specified earlier. + message = { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, + ], + } + for i in range(len(video_frames)): + base64_image = encode_image(video_frames[i]) # base64 encoding. + new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} + message["content"].append(new_image) + + # Perform inference and log output. + outputs = llm.chat([message]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +Video +^^^^^ + +You can pass a list of NumPy arrays directly to the :code:`'video'` field of the multi-modal dictionary +instead of using multi-image input. + +Please refer to `examples/offline_inference_vision_language.py `_ for more details. + +Audio +^^^^^ + +You can pass a tuple :code:`(array, sampling_rate)` to the :code:`'audio'` field of the multi-modal dictionary. + +Please refer to `examples/offline_inference_audio_language.py `_ for more details. + +Embedding +^^^^^^^^^ + +To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, +pass a tensor of shape :code:`(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. + +.. code-block:: python + + # Inference with image embeddings as input + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + + # Refer to the HuggingFace repo for the correct format to use + prompt = "USER: \nWhat is the content of this image?\nASSISTANT:" + + # Embeddings for single image + # torch.Tensor of shape (1, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": {"image": image_embeds}, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: + +.. code-block:: python + + # Construct the prompt based on your model + prompt = ... + + # Embeddings for multiple images + # torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) + image_embeds = torch.load(...) + + # Qwen2-VL + llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_grid_thw is needed to calculate positional encoding. + "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), + } + } + + # MiniCPM-V + llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) + mm_data = { + "image": { + "image_embeds": image_embeds, + # image_size_list is needed to calculate details of the sliced image. + "image_size_list": [image.size for image in images], # list of image sizes + } + } + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": mm_data, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +Online Inference +---------------- + +Our OpenAI-compatible server accepts multi-modal data via the `Chat Completions API `_. + +.. important:: + A chat template is **required** to use Chat Completions API. + + Although most models come with a chat template, for others you have to define one yourself. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `__. + +Image +^^^^^ + +Image input is supported according to `OpenAI Vision API `_. +Here is a simple example using Phi-3.5-Vision. + +First, launch the OpenAI-compatible server: + +.. code-block:: bash + + vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ + --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 + +Then, you can use the OpenAI client as follows: + +.. code-block:: python + + from openai import OpenAI + + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Single-image input inference + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + # NOTE: The prompt formatting with the image token `` is not needed + # since the prompt will be processed automatically by the API server. + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + + # Multi-image input inference + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. + +.. tip:: + Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via ``--allowed-local-media-path`` when launching the API server/engine, + and pass the file path as ``url`` in the API request. + +.. tip:: + There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. + In fact, you can place image placeholders in the middle of the text by interleaving text and image content. + +.. note:: + + By default, the timeout for fetching images through HTTP URL is ``5`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_IMAGE_FETCH_TIMEOUT= + +Video +^^^^^ + +Instead of :code:`image_url`, you can pass a video file via :code:`video_url`. + +You can use `these tests `_ as reference. + +.. note:: + + By default, the timeout for fetching videos through HTTP URL url is ``30`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_VIDEO_FETCH_TIMEOUT= + +Audio +^^^^^ + +Instead of :code:`image_url`, you can pass an audio file via :code:`audio_url`. + +A full code example can be found in `examples/openai_chat_completion_client_for_multimodal.py `_. + +.. note:: + + By default, the timeout for fetching audios through HTTP URL is ``10`` seconds. + You can override this by setting the environment variable: + + .. code-block:: console + + $ export VLLM_AUDIO_FETCH_TIMEOUT= + +Embedding +^^^^^^^^^ + +vLLM's Embeddings API is a superset of OpenAI's `Embeddings API `_, +where a list of chat ``messages`` can be passed instead of batched ``inputs``. This enables multi-modal inputs to be passed to embedding models. + +.. tip:: + The schema of ``messages`` is exactly the same as in Chat Completions API. + You can refer to the above tutorials for more details on how to pass each type of multi-modal data. + +Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. +Refer to the examples below for illustration. + +Here is an end-to-end example using VLM2Vec. To serve the model: + +.. code-block:: bash + + vllm serve TIGER-Lab/VLM2Vec-Full --task embedding \ + --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja + +.. important:: + + Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass ``--task embedding`` + to run this model in embedding mode instead of text generation mode. + + The custom chat template is completely different from the original one for this model, + and can be found `here `__. + +Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level ``requests`` library: + +.. code-block:: python + + import requests + + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": "Represent the given image."}, + ], + }], + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + print("Embedding output:", response_json["data"][0]["embedding"]) + +Below is another example, this time using the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. + + Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, which is handled + by `this custom chat template `__. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + +A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. diff --git a/vllm/docs/source/models/performance.rst b/vllm/docs/source/usage/performance.rst similarity index 100% rename from vllm/docs/source/models/performance.rst rename to vllm/docs/source/usage/performance.rst diff --git a/vllm/docs/source/models/spec_decode.rst b/vllm/docs/source/usage/spec_decode.rst similarity index 98% rename from vllm/docs/source/models/spec_decode.rst rename to vllm/docs/source/usage/spec_decode.rst index b02c80aeb..67e8ede76 100644 --- a/vllm/docs/source/models/spec_decode.rst +++ b/vllm/docs/source/usage/spec_decode.rst @@ -1,7 +1,7 @@ .. _spec_decode: -Speculative decoding in vLLM -============================ +Speculative decoding +==================== .. warning:: Please note that speculative decoding in vLLM is not yet optimized and does @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. + titled *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. **Conclusion** @@ -197,7 +197,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the :ref:`FAQs `. Resources for vLLM contributors ------------------------------- diff --git a/vllm/docs/source/usage/structured_outputs.rst b/vllm/docs/source/usage/structured_outputs.rst new file mode 100644 index 000000000..484e1f17d --- /dev/null +++ b/vllm/docs/source/usage/structured_outputs.rst @@ -0,0 +1,267 @@ +.. _structured_outputs: + +Structured Outputs +================== + +vLLM supports the generation of structured outputs using `outlines `_ or `lm-format-enforcer `_ as backends for the guided decoding. +This document shows you some examples of the different options that are available to generate structured outputs. + + +Online Inference (OpenAI API) +----------------------------- + +You can generate structured outputs using the OpenAI's `Completions `_ and `Chat `_ API. + +The following parameters are supported, which must be added as extra parameters: + +- ``guided_choice``: the output will be exactly one of the choices. +- ``guided_regex``: the output will follow the regex pattern. +- ``guided_json``: the output will follow the JSON schema. +- ``guided_grammar``: the output will follow the context free grammar. +- ``guided_whitespace_pattern``: used to override the default whitespace pattern for guided json decoding. +- ``guided_decoding_backend``: used to select the guided decoding backend to use. + +You can see the complete list of supported parameters on the `OpenAI Compatible Server `_ page. + +Now let´s see an example for each of the cases, starting with the ``guided_choice``, as it´s the easiest one: + +.. code-block:: python + + from openai import OpenAI + client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", + ) + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_body={"guided_choice": ["positive", "negative"]}, + ) + print(completion.choices[0].message.content) + + +The next example shows how to use the ``guided_regex``. The idea is to generate an email address, given a simple regex template: + +.. code-block:: python + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an example email address for Alan Turing, who works in Enigma. End in .com and new line. Example result: alan.turing@enigma.com\n", + } + ], + extra_body={"guided_regex": "\w+@\w+\.com\n", "stop": ["\n"]}, + ) + print(completion.choices[0].message.content) + +One of the most relevant features in structured text generation is the option to generate a valid JSON with pre-defined fields and formats. +For this we can use the ``guided_json`` parameter in two different ways: + +- Using directly a `JSON Schema `_ +- Defining a `Pydantic model `_ and then extracting the JSON Schema from it (which is normally an easier option). + +The next example shows how to use the ``guided_json`` parameter with a Pydantic model: + +.. code-block:: python + + from pydantic import BaseModel + from enum import Enum + + class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + + class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + + json_schema = CarDescription.model_json_schema() + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate a JSON with the brand, model and car_type of the most iconic car from the 90's", + } + ], + extra_body={"guided_json": json_schema}, + ) + print(completion.choices[0].message.content) + +.. tip:: + While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. + This can improve the results notably in most cases. + + +Finally we have the ``guided_grammar``, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. +It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: + +.. code-block:: python + + simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ + """ + + completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[ + { + "role": "user", + "content": "Generate an SQL query to show the 'username' and 'email' from the 'users' table.", + } + ], + extra_body={"guided_grammar": simplified_sql_grammar}, + ) + print(completion.choices[0].message.content) + +The complete code of the examples can be found on `examples/openai_chat_completion_structured_outputs.py `_. + +Experimental Automatic Parsing (OpenAI API) +-------------------------------------------- + +This section covers the OpenAI beta wrapper over the ``client.chat.completions.create()`` method that provides richer integrations with Python specific types. + +At the time of writing (``openai==1.54.4``), this is a "beta" feature in the OpenAI client library. Code reference can be found `here `_. + +For the following examples, vLLM was setup using ``vllm serve meta-llama/Llama-3.1-8B-Instruct`` + +Here is a simple example demonstrating how to get structured output using Pydantic models: + +.. code-block:: python + + from pydantic import BaseModel + from openai import OpenAI + + + class Info(BaseModel): + name: str + age: int + + + client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") + completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "My name is Cameron, I'm 28. What's my name and age?"}, + ], + response_format=Info, + extra_body=dict(guided_decoding_backend="outlines"), + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + print("Name:", message.parsed.name) + print("Age:", message.parsed.age) + +Output: + +.. code-block:: console + + ParsedChatCompletionMessage[Testing](content='{"name": "Cameron", "age": 28}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=Testing(name='Cameron', age=28)) + Name: Cameron + Age: 28 + + +Here is a more complex example using nested Pydantic models to handle a step-by-step math solution: + +.. code-block:: python + + from typing import List + from pydantic import BaseModel + from openai import OpenAI + + + class Step(BaseModel): + explanation: str + output: str + + + class MathResponse(BaseModel): + steps: List[Step] + final_answer: str + + + client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy") + completion = client.beta.chat.completions.parse( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful expert math tutor."}, + {"role": "user", "content": "Solve 8x + 31 = 2."}, + ], + response_format=MathResponse, + extra_body=dict(guided_decoding_backend="outlines"), + ) + + message = completion.choices[0].message + print(message) + assert message.parsed + for i, step in enumerate(message.parsed.steps): + print(f"Step #{i}:", step) + print("Answer:", message.parsed.final_answer) + +Output: + +.. code-block:: console + + ParsedChatCompletionMessage[MathResponse](content='{ "steps": [{ "explanation": "First, let\'s isolate the term with the variable \'x\'. To do this, we\'ll subtract 31 from both sides of the equation.", "output": "8x + 31 - 31 = 2 - 31"}, { "explanation": "By subtracting 31 from both sides, we simplify the equation to 8x = -29.", "output": "8x = -29"}, { "explanation": "Next, let\'s isolate \'x\' by dividing both sides of the equation by 8.", "output": "8x / 8 = -29 / 8"}], "final_answer": "x = -29/8" }', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=MathResponse(steps=[Step(explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation.", output='8x + 31 - 31 = 2 - 31'), Step(explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.', output='8x = -29'), Step(explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8.", output='8x / 8 = -29 / 8')], final_answer='x = -29/8')) + Step #0: explanation="First, let's isolate the term with the variable 'x'. To do this, we'll subtract 31 from both sides of the equation." output='8x + 31 - 31 = 2 - 31' + Step #1: explanation='By subtracting 31 from both sides, we simplify the equation to 8x = -29.' output='8x = -29' + Step #2: explanation="Next, let's isolate 'x' by dividing both sides of the equation by 8." output='8x / 8 = -29 / 8' + Answer: x = -29/8 + +Offline Inference +----------------- + +Offline inference allows for the same types of guided decoding. +To use it, we´ll need to configure the guided decoding using the class ``GuidedDecodingParams`` inside ``SamplingParams``. +The main available options inside ``GuidedDecodingParams`` are: + +- ``json`` +- ``regex`` +- ``choice`` +- ``grammar`` +- ``backend`` +- ``whitespace_pattern`` + +These parameters can be used in the same way as the parameters from the Online Inference examples above. +One example for the usage of the ``choices`` parameter is shown below: + +.. code-block:: python + + from vllm import LLM, SamplingParams + from vllm.sampling_params import GuidedDecodingParams + + llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct") + + guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) + sampling_params = SamplingParams(guided_decoding=guided_decoding_params) + outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, + ) + print(outputs[0].outputs[0].text) + +A complete example with all options can be found in `examples/offline_inference_structured_outputs.py `_. diff --git a/vllm/docs/source/serving/usage_stats.md b/vllm/docs/source/usage/usage_stats.md similarity index 100% rename from vllm/docs/source/serving/usage_stats.md rename to vllm/docs/source/usage/usage_stats.md diff --git a/vllm/examples/disaggregated_prefill.sh b/vllm/examples/disaggregated_prefill.sh new file mode 100644 index 000000000..87155273a --- /dev/null +++ b/vllm/examples/disaggregated_prefill.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# This file demonstrates the example usage of disaggregated prefilling +# We will launch 2 vllm instances (1 for prefill and 1 for decode), +# and then transfer the KV cache between them. + +echo "🚧🚧 Warning: The usage of disaggregated prefill is experimental and subject to change 🚧🚧" +sleep 1 + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'cleanup' INT + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 + pkill -f python + echo "Cleanup complete. Exiting." + exit 0 +} + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +# install quart first -- required for disagg prefill proxy serve +if python3 -c "import quart" &> /dev/null; then + echo "Quart is already installed." +else + echo "Quart is not installed. Installing..." + python3 -m pip install quart +fi + +# a function that waits vLLM server to start +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +# You can also adjust --kv-ip and --kv-port for distributed inference. + +# prefilling instance, which is the KV producer +CUDA_VISIBLE_DEVICES=0 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & + +# decoding instance, which is the KV consumer +CUDA_VISIBLE_DEVICES=1 vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8200 \ + --max-model-len 100 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & + +# wait until prefill and decode instances are ready +wait_for_server 8100 +wait_for_server 8200 + +# launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (port 8100), change max_tokens +# to 1 +# - after the prefill vLLM finishes prefill, send the request to decode vLLM +# instance +# NOTE: the usage of this API is subject to change --- in the future we will +# introduce "vllm connect" to connect between prefill and decode instances +python3 ../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +sleep 1 + +# serve two example requests +output1=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "San Francisco is a", +"max_tokens": 10, +"temperature": 0 +}') + +output2=$(curl -X POST -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "Santa Clara is a", +"max_tokens": 10, +"temperature": 0 +}') + + +# Cleanup commands +pgrep python | xargs kill -9 +pkill -f python + +echo "" + +sleep 1 + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "🎉🎉 Successfully finished 2 test requests! 🎉🎉" +echo "" diff --git a/vllm/examples/fp8/quantizer/quantize.py b/vllm/examples/fp8/quantizer/quantize.py index 15f1a06b1..d75cc8b3d 100644 --- a/vllm/examples/fp8/quantizer/quantize.py +++ b/vllm/examples/fp8/quantizer/quantize.py @@ -230,7 +230,7 @@ def calibrate_loop(): def main(args): if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for inference.") + raise OSError("GPU is required for inference.") random.seed(RAND_SEED) np.random.seed(RAND_SEED) @@ -314,7 +314,7 @@ def main(args): # Workaround for wo quantization if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: - with open(f"{export_path}/config.json", 'r') as f: + with open(f"{export_path}/config.json") as f: tensorrt_llm_config = json.load(f) if args.qformat == "int8_wo": tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' diff --git a/vllm/examples/logging_configuration.md b/vllm/examples/logging_configuration.md index 0d278b039..9ac8b13cd 100644 --- a/vllm/examples/logging_configuration.md +++ b/vllm/examples/logging_configuration.md @@ -118,7 +118,7 @@ configuration for the root vLLM logger and for the logger you wish to silence: { "formatters": { "vllm": { - "class": "vllm.logging.NewLineFormatter", + "class": "vllm.logging_utils.NewLineFormatter", "datefmt": "%m-%d %H:%M:%S", "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" } diff --git a/vllm/examples/offline_inference.py b/vllm/examples/offline_inference.py index 9b758fa24..23cc6e853 100644 --- a/vllm/examples/offline_inference.py +++ b/vllm/examples/offline_inference.py @@ -19,4 +19,4 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") \ No newline at end of file diff --git a/vllm/examples/offline_inference_audio_language.py b/vllm/examples/offline_inference_audio_language.py index 37ec667d9..050b791b6 100644 --- a/vllm/examples/offline_inference_audio_language.py +++ b/vllm/examples/offline_inference_audio_language.py @@ -34,11 +34,7 @@ def run_ultravox(question: str, audio_count: int): tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name, - enforce_eager=True, - enable_chunked_prefill=False, - max_model_len=8192, - limit_mm_per_prompt={"audio": audio_count}) + llm = LLM(model=model_name, limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids diff --git a/vllm/examples/offline_inference_cli.py b/vllm/examples/offline_inference_cli.py new file mode 100644 index 000000000..391ac6b9b --- /dev/null +++ b/vllm/examples/offline_inference_cli.py @@ -0,0 +1,80 @@ +from dataclasses import asdict + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def get_prompts(num_prompts: int): + # The default sample prompts. + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + if num_prompts != len(prompts): + prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts] + + return prompts + + +def main(args): + # Create prompts + prompts = get_prompts(args.num_prompts) + + # Create a sampling params object. + sampling_params = SamplingParams(n=args.n, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + max_tokens=args.max_tokens) + + # Create an LLM. + # The default model is 'facebook/opt-125m' + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**asdict(engine_args)) + + # Generate texts from the prompts. + # The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + +if __name__ == '__main__': + parser = FlexibleArgumentParser() + parser = EngineArgs.add_cli_args(parser) + group = parser.add_argument_group("SamplingParams options") + group.add_argument("--num-prompts", + type=int, + default=4, + help="Number of prompts used for inference") + group.add_argument("--max-tokens", + type=int, + default=16, + help="Generated output length for sampling") + group.add_argument('--n', + type=int, + default=1, + help='Number of generated sequences per prompt') + group.add_argument('--temperature', + type=float, + default=0.8, + help='Temperature for text generation') + group.add_argument('--top-p', + type=float, + default=0.95, + help='top_p for text generation') + group.add_argument('--top-k', + type=int, + default=-1, + help='top_k for text generation') + + args = parser.parse_args() + main(args) diff --git a/vllm/examples/offline_inference_embedding.py b/vllm/examples/offline_inference_embedding.py index 7d5ef128b..ae158eef2 100644 --- a/vllm/examples/offline_inference_embedding.py +++ b/vllm/examples/offline_inference_embedding.py @@ -10,7 +10,7 @@ # Create an LLM. model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) -# Generate embedding. The output is a list of EmbeddingRequestOutputs. +# Generate embedding. The output is a list of PoolingRequestOutputs. outputs = model.encode(prompts) # Print the outputs. for output in outputs: diff --git a/vllm/examples/offline_inference_openai.md b/vllm/examples/offline_inference_openai.md index ea34374ed..4c6419797 100644 --- a/vllm/examples/offline_inference_openai.md +++ b/vllm/examples/offline_inference_openai.md @@ -35,8 +35,8 @@ ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` ### Step 2: Run the batch @@ -94,8 +94,8 @@ To follow along with this example, you can download the example batch, or create ``` $ cat openai_example_batch.jsonl -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` Now upload your batch file to your S3 bucket. diff --git a/vllm/examples/offline_inference_structured_outputs.py b/vllm/examples/offline_inference_structured_outputs.py new file mode 100644 index 000000000..00d864606 --- /dev/null +++ b/vllm/examples/offline_inference_structured_outputs.py @@ -0,0 +1,78 @@ +from enum import Enum + +from pydantic import BaseModel + +from vllm import LLM, SamplingParams +from vllm.sampling_params import GuidedDecodingParams + +llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100) + +# Guided decoding by Choice (list of possible options) +guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +outputs = llm.generate( + prompts="Classify this sentiment: vLLM is wonderful!", + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) + +# Guided decoding by Regex +guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n") +sampling_params = SamplingParams(guided_decoding=guided_decoding_params, + stop=["\n"]) +prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") +outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) +print(outputs[0].outputs[0].text) + + +# Guided decoding by JSON using Pydantic schema +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +guided_decoding_params = GuidedDecodingParams(json=json_schema) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") +outputs = llm.generate( + prompts=prompt, + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) + +# Guided decoding by Grammar +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" +guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar) +sampling_params = SamplingParams(guided_decoding=guided_decoding_params) +prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") +outputs = llm.generate( + prompts=prompt, + sampling_params=sampling_params, +) +print(outputs[0].outputs[0].text) diff --git a/vllm/examples/offline_inference_vision_language.py b/vllm/examples/offline_inference_vision_language.py index 83d2548a5..f08f22eec 100644 --- a/vllm/examples/offline_inference_vision_language.py +++ b/vllm/examples/offline_inference_vision_language.py @@ -176,6 +176,31 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +# H2OVL-Mississippi +def run_h2ovl(question: str, modality: str): + assert modality == "image" + + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + ) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + # InternVL def run_internvl(question: str, modality: str): assert modality == "image" @@ -262,10 +287,9 @@ def run_qwen2_vl(question: str, modality: str): model_name = "Qwen/Qwen2-VL-7B-Instruct" - # Tested on L40 llm = LLM( model=model_name, - max_model_len=8192, + max_model_len=4096, max_num_seqs=5, # Note - mm_processor_kwargs can also be passed to generate/chat calls mm_processor_kwargs={ @@ -353,6 +377,48 @@ def run_glm4v(question: str, modality: str): return llm, prompt, stop_token_ids +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + ) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Aria +def run_aria(question: str, modality: str): + assert modality == "image" + model_name = "rhymes-ai/Aria" + + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16") + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -364,6 +430,7 @@ def run_glm4v(question: str, modality: str): "chameleon": run_chameleon, "minicpmv": run_minicpmv, "blip-2": run_blip2, + "h2ovl_chat": run_h2ovl, "internvl_chat": run_internvl, "NVLM_D": run_nvlm_d, "qwen_vl": run_qwen_vl, @@ -372,6 +439,8 @@ def run_glm4v(question: str, modality: str): "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, + "idefics3": run_idefics3, + "aria": run_aria, } diff --git a/vllm/examples/offline_inference_vision_language_multi_image.py b/vllm/examples/offline_inference_vision_language_multi_image.py index e28514bf4..788b604cf 100644 --- a/vllm/examples/offline_inference_vision_language_multi_image.py +++ b/vllm/examples/offline_inference_vision_language_multi_image.py @@ -107,6 +107,40 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ) +def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: + model_name = "h2oai/h2ovl-mississippi-2b" + + llm = LLM( + model=model_name, + trust_remote_code=True, + max_model_len=8192, + limit_mm_per_prompt={"image": len(image_urls)}, + mm_processor_kwargs={"max_dynamic_patch": 4}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "OpenGVLab/InternVL2-2B" @@ -256,13 +290,66 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 2 * 364 + }, + }, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + +def load_aria(question, image_urls: List[str]) -> ModelRequestData: + model_name = "rhymes-ai/Aria" + llm = LLM(model=model_name, + tokenizer_mode="slow", + trust_remote_code=True, + dtype="bfloat16", + limit_mm_per_prompt={"image": len(image_urls)}) + placeholders = "<|img|>\n" * len(image_urls) + prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=stop_token_ids, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None) + + model_example_map = { "phi3_v": load_phi3v, + "h2ovl_chat": load_h2onvl, "internvl_chat": load_internvl, "NVLM_D": load_nvlm_d, "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, + "idefics3": load_idefics3, + "aria": load_aria, } diff --git a/vllm/examples/openai_api_client_for_multimodal.py b/vllm/examples/openai_chat_completion_client_for_multimodal.py similarity index 97% rename from vllm/examples/openai_api_client_for_multimodal.py rename to vllm/examples/openai_chat_completion_client_for_multimodal.py index beb83e494..0ec4f71dd 100644 --- a/vllm/examples/openai_api_client_for_multimodal.py +++ b/vllm/examples/openai_chat_completion_client_for_multimodal.py @@ -53,7 +53,7 @@ def run_text_only() -> None: "content": "What's the capital of France?" }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion.choices[0].message.content @@ -83,7 +83,7 @@ def run_single_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -109,7 +109,7 @@ def run_single_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content @@ -144,7 +144,7 @@ def run_multi_image() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -175,7 +175,7 @@ def run_audio() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_url.choices[0].message.content @@ -201,7 +201,7 @@ def run_audio() -> None: ], }], model=model, - max_tokens=64, + max_completion_tokens=64, ) result = chat_completion_from_base64.choices[0].message.content diff --git a/vllm/examples/openai_chat_completion_structured_outputs.py b/vllm/examples/openai_chat_completion_structured_outputs.py new file mode 100644 index 000000000..8c059c7ca --- /dev/null +++ b/vllm/examples/openai_chat_completion_structured_outputs.py @@ -0,0 +1,94 @@ +from enum import Enum + +from openai import OpenAI +from pydantic import BaseModel + +client = OpenAI( + base_url="http://localhost:8000/v1", + api_key="-", +) + +# Guided decoding by Choice (list of possible options) +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": "Classify this sentiment: vLLM is wonderful!" + }], + extra_body={"guided_choice": ["positive", "negative"]}, +) +print(completion.choices[0].message.content) + +# Guided decoding by Regex +prompt = ("Generate an email address for Alan Turing, who works in Enigma." + "End in .com and new line. Example result:" + "alan.turing@enigma.com\n") + +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={ + "guided_regex": "\w+@\w+\.com\n", + "stop": ["\n"] + }, +) +print(completion.choices[0].message.content) + + +# Guided decoding by JSON using Pydantic schema +class CarType(str, Enum): + sedan = "sedan" + suv = "SUV" + truck = "Truck" + coupe = "Coupe" + + +class CarDescription(BaseModel): + brand: str + model: str + car_type: CarType + + +json_schema = CarDescription.model_json_schema() + +prompt = ("Generate a JSON with the brand, model and car_type of" + "the most iconic car from the 90's") +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_json": json_schema}, +) +print(completion.choices[0].message.content) + +# Guided decoding by Grammar +simplified_sql_grammar = """ + ?start: select_statement + + ?select_statement: "SELECT " column_list " FROM " table_name + + ?column_list: column_name ("," column_name)* + + ?table_name: identifier + + ?column_name: identifier + + ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/ +""" + +prompt = ("Generate an SQL query to show the 'username' and 'email'" + "from the 'users' table.") +completion = client.chat.completions.create( + model="Qwen/Qwen2.5-3B-Instruct", + messages=[{ + "role": "user", + "content": prompt, + }], + extra_body={"guided_grammar": simplified_sql_grammar}, +) +print(completion.choices[0].message.content) diff --git a/vllm/examples/openai_chat_embedding_client_for_multimodal.py b/vllm/examples/openai_chat_embedding_client_for_multimodal.py new file mode 100644 index 000000000..fff82020d --- /dev/null +++ b/vllm/examples/openai_chat_embedding_client_for_multimodal.py @@ -0,0 +1,120 @@ +import argparse +import base64 +import io + +import requests +from PIL import Image + +image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + + +def vlm2vec(): + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }], + "encoding_format": + "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +def dse_qwen2_vl(inp: dict): + # Embedding an Image + if inp["dtype"] == "image": + messages = [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": inp["image_url"], + } + }, { + "type": "text", + "text": "What is shown in this image?" + }] + }] + # Embedding a Text Query + else: + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_placeholder}", + } + }, + { + "type": "text", + "text": f"Query: {inp['content']}" + }, + ] + }] + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "MrLight/dse-qwen2-2b-mrl-v1", + "messages": messages, + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Script to call a specified VLM through the API. Make sure to serve " + "the model with --task embedding before running this.") + parser.add_argument("model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.") + args = parser.parse_args() + + if args.model == "vlm2vec": + vlm2vec() + elif args.model == "dse_qwen2_vl": + dse_qwen2_vl({ + "dtye": "image", + "image_url": image_url, + }) + dse_qwen2_vl({ + "dtype": "text", + "content": "What is the weather like today?", + }) diff --git a/vllm/examples/openai_cross_encoder_score.py b/vllm/examples/openai_cross_encoder_score.py new file mode 100644 index 000000000..8c32eea5d --- /dev/null +++ b/vllm/examples/openai_cross_encoder_score.py @@ -0,0 +1,58 @@ +"""Examples Python client Score for Cross Encoder Models +""" + +import argparse +import json +import pprint + +import requests + + +def post_http_request(prompt: json, api_url: str) -> requests.Response: + headers = {"User-Agent": "Test Client"} + response = requests.post(api_url, headers=headers, json=prompt) + return response + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") + args = parser.parse_args() + api_url = f"http://{args.host}:{args.port}/v1/score" + + model_name = args.model + + text_1 = "What is the capital of France?" + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 is string and text_2 is a list:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) + + text_1 = [ + "What is the capital of Brazil?", "What is the capital of France?" + ] + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 and text_2 are lists:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) + + text_1 = "What is the capital of Brazil?" + text_2 = "The capital of Brazil is Brasilia." + prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} + score_response = post_http_request(prompt=prompt, api_url=api_url) + print("Prompt for text_1 and text_2 are strings:") + pprint.pprint(prompt) + print("Score Response:") + pprint.pprint(score_response.data) \ No newline at end of file diff --git a/vllm/examples/openai_example_batch.jsonl b/vllm/examples/openai_example_batch.jsonl index 5aa7e185c..54ac8c813 100644 --- a/vllm/examples/openai_example_batch.jsonl +++ b/vllm/examples/openai_example_batch.jsonl @@ -1,2 +1,2 @@ -{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} -{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} +{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} diff --git a/vllm/examples/production_monitoring/grafana.json b/vllm/examples/production_monitoring/grafana.json index d1389f539..f76a61bb5 100644 --- a/vllm/examples/production_monitoring/grafana.json +++ b/vllm/examples/production_monitoring/grafana.json @@ -1,33 +1,4 @@ { - "__inputs": [ - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.2" - }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -54,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 1, "links": [], "liveNow": false, "panels": [ @@ -76,6 +47,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -241,6 +213,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -358,6 +331,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -523,6 +497,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -658,6 +633,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -823,6 +799,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -984,7 +961,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1076,7 +1053,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1117,6 +1094,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -1147,8 +1125,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1199,6 +1176,319 @@ ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" } ], "refresh": "", @@ -1207,21 +1497,34 @@ "templating": { "list": [ { - "type": "datasource", - "name": "DS_PROMETHEUS", - "label": "datasource", - "current": {}, + "current": { + "selected": false, + "text": "prometheus", + "value": "edx8memhpd9tsa" + }, "hide": 0, "includeAll": false, + "label": "datasource", "multi": false, + "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false + "skipUrlSync": false, + "type": "datasource" }, { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, "definition": "label_values(model_name)", "hide": 0, "includeAll": false, @@ -1249,7 +1552,6 @@ "timezone": "", "title": "vLLM", "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", - "version": 1, + "version": 8, "weekStart": "" } - diff --git a/vllm/examples/run_cluster.sh b/vllm/examples/run_cluster.sh index 8e4aa59e1..7b4b40b4b 100644 --- a/vllm/examples/run_cluster.sh +++ b/vllm/examples/run_cluster.sh @@ -14,7 +14,7 @@ PATH_TO_HF_HOME="$4" shift 4 # Additional arguments are passed directly to the Docker command -ADDITIONAL_ARGS="$@" +ADDITIONAL_ARGS=("$@") # Validate node type if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then @@ -45,5 +45,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - ${ADDITIONAL_ARGS} \ + "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" diff --git a/vllm/examples/template_dse_qwen2_vl.jinja b/vllm/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 000000000..e7b93fae3 --- /dev/null +++ b/vllm/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/vllm/examples/template_vlm2vec.jinja b/vllm/examples/template_vlm2vec.jinja new file mode 100644 index 000000000..489b99604 --- /dev/null +++ b/vllm/examples/template_vlm2vec.jinja @@ -0,0 +1,16 @@ +{%- if messages | length > 1 -%} + {{ raise_exception('Embedding models should only embed one message at a time') }} +{%- endif -%} + +{% set vars = namespace(parts=[], next_image_id=1) %} +{%- for message in messages -%} + {%- for content in message['content'] -%} + {%- if content['type'] == 'text' -%} + {%- set vars.parts = vars.parts + [content['text']] %} + {%- elif content['type'] == 'image' -%} + {%- set vars.parts = vars.parts + ['<|image_{i:d}|>'.format(i=vars.next_image_id)] %} + {%- set vars.next_image_id = vars.next_image_id + 1 %} + {%- endif -%} + {%- endfor -%} +{%- endfor -%} +{{ vars.parts | join(' ') }} diff --git a/vllm/examples/tool_chat_template_granite.jinja b/vllm/examples/tool_chat_template_granite.jinja new file mode 100644 index 000000000..467dcb2d1 --- /dev/null +++ b/vllm/examples/tool_chat_template_granite.jinja @@ -0,0 +1,36 @@ +{%- if tools %} + {{- '<|start_of_role|>available_tools<|end_of_role|> +' }} + {%- for tool in tools %} + {{- tool | tojson(indent=4) }} + {%- if not loop.last %} + {{- ' + +' }} + {%- endif %} + {%- endfor %} + {{- '<|end_of_text|> +' }} +{%- endif %} + +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'user' %} + {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %} + {{- '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message.tool_calls|map(attribute='function')|list|tojson(indent=4) + '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant' %} + {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %} + {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- endif %} + {%- if loop.last and add_generation_prompt %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} + {%- endif %} +{%- endfor %} diff --git a/vllm/examples/tool_chat_template_granite_20b_fc.jinja b/vllm/examples/tool_chat_template_granite_20b_fc.jinja new file mode 100644 index 000000000..cb52188ec --- /dev/null +++ b/vllm/examples/tool_chat_template_granite_20b_fc.jinja @@ -0,0 +1,130 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + +{%- if not full_function_description is defined %} + {%- set full_function_description = false %} +{%- endif %} + +{%- macro full_description(tool) %} + {{- tool.name + '(' }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- if tool.parameters is defined %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- endif %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} +{%- endmacro %} + +{%- macro simple_description(tool) %} + {{- tool.description }} +{%- endmacro %} + +{%- macro function_description(tool) %} + {%- if full_function_description %} + {{- full_description(tool) }} + {%- else %} + {{- simple_description(tool) }} + {%- endif %} +{%- endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set sys_prompt = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} + {% set sys_prompt = 'You are a helpful assistant with access to the following function calls. Your task is to understand the given conversation with function calls and responses and generate natural language response as the ASSISTANT to continue the conversation. You may use the following function calls to understand how to respond to the user query.' %} +{%- endif %} + +{{ 'SYSTEM: ' + sys_prompt }} +{% if tools is iterable and tools | length > 0 %} +<|function_call_library|> + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + function_description(tool) }} + {{- ', "parameters": ' }} + {%- if not tool.parameters is defined or tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +If none of the functions are relevant or the given question lacks the parameters required by the function, please output \" {\"name\": \"no_function\", \"arguments\": {}}\". +{%- endif %} + + + +{% for message in messages %} + {% if message['role'] == 'user' %} + {{- '\nUSER: ' + message['content'] }} + {% elif message['role'] == 'assistant' and message.tool_calls is defined %} + {{- '\nASSISTANT:' }} + {% for tc in message.tool_calls %} + {{- ' ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|endoftext|>' }} + {% elif message['role'] == 'assistant' %} + {{- '\nASSISTANT: ' + message['content'] + ' <|endoftext|>' }} + {% elif message['role'] == 'tool' %} + {{- ' ' + message['content'] }} + {%- else %} + {{- raise_exception("Unexpected combination of role and message content") }} + {% endif %} + {% if loop.last and add_generation_prompt %} + {{- '\nASSISTANT: ' }} + {% endif %} +{% endfor %} diff --git a/vllm/examples/tool_chat_template_llama3.1_json.jinja b/vllm/examples/tool_chat_template_llama3.1_json.jinja index c24a7e513..033830936 100644 --- a/vllm/examples/tool_chat_template_llama3.1_json.jinja +++ b/vllm/examples/tool_chat_template_llama3.1_json.jinja @@ -19,10 +19,18 @@ {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} - {%- set system_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- if tools is not none %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} {%- endif %} {#- System message #} @@ -33,8 +41,8 @@ {{- "Cutting Knowledge Date: December 2023\n" }} {{- "Today Date: " + date_string + "\n\n" }} {%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -48,7 +56,11 @@ {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} - {%- set first_user_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} @@ -56,7 +68,7 @@ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -67,7 +79,17 @@ {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] | trim}} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|eot_id|>' }} {%- elif 'tool_calls' in message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only supports single tool-calls at once!") }} @@ -81,10 +103,14 @@ {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is mapping %} - {{- message.content | tojson }} - {%- else %} + {%- if message.content is string %} {{- { "output": message.content } | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- { "output": content['text'] } | tojson }} + {%- endif %} + {%- endfor %} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} diff --git a/vllm/examples/tool_chat_template_llama3.2_json.jinja b/vllm/examples/tool_chat_template_llama3.2_json.jinja index 7e2477772..2b290c0ee 100644 --- a/vllm/examples/tool_chat_template_llama3.2_json.jinja +++ b/vllm/examples/tool_chat_template_llama3.2_json.jinja @@ -16,38 +16,62 @@ {%- set tools = none %} {%- endif %} +{#- Find out if there are any images #} +{% set image_ns = namespace(has_images=false) %} +{%- for message in messages %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {%- set image_ns.has_images = true %} + {%- endif %} + {%- endfor %} +{%- endfor %} + {#- This block extracts the system message, so we can slot it into the right place. #} {%- if messages[0]['role'] == 'system' %} - {%- set system_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set system_message = messages[0]['content']|trim %} + {%- else %} + {%- set system_message = messages[0]['content'][0]['text']|trim %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} - {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- if tools is not none %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} + {%- else %} + {%- set system_message = "" %} + {%- endif %} {%- endif %} -{#- System message #} -{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} -{%- if tools is not none %} - {{- "Environment: ipython\n" }} -{%- endif %} -{{- "Cutting Knowledge Date: December 2023\n" }} -{{- "Today Date: " + date_string + "\n\n" }} -{%- if tools is not none and not tools_in_user_message %} - {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} - {{- "Do not use variables.\n\n" }} - {%- for t in tools %} - {{- t | tojson(indent=4) }} - {{- "\n\n" }} - {%- endfor %} +{#- System message if there are no images, if the user supplied one, or if tools are used (default tool system message) #} +{%- if system_message or not image_ns.has_images %} + {{- "<|start_header_id|>system<|end_header_id|>\n\n" }} + {%- if tools is not none %} + {{- "Environment: ipython\n" }} + {%- endif %} + {{- "Cutting Knowledge Date: December 2023\n" }} + {{- "Today Date: " + date_string + "\n\n" }} + {%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call. " }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {%- endif %} + {{- system_message }} + {{- "<|eot_id|>" }} {%- endif %} -{{- system_message }} -{{- "<|eot_id|>" }} {#- Custom tools are passed in a user message with some extra guidance #} {%- if tools_in_user_message and not tools is none %} {#- Extract the first user message so we can plug it in here #} {%- if messages | length != 0 %} - {%- set first_user_message = messages[0]['content']|trim %} + {%- if messages[0]['content'] is string %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- else %} + {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %} + {%- endif %} {%- set messages = messages[1:] %} {%- else %} {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} @@ -55,7 +79,7 @@ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} {{- "Given the following functions, please respond with a JSON for a function call " }} {{- "with its proper arguments that best answers the given prompt.\n\n" }} - {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}. ' }} {{- "Do not use variables.\n\n" }} {%- for t in tools %} {{- t | tojson(indent=4) }} @@ -66,7 +90,19 @@ {%- for message in messages %} {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} - {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }} + {%- if message['content'] is string %} + {{- message['content'] | trim}} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'image' %} + {{- '<|image|>' }} + {%- elif content['type'] == 'text' %} + {{- content['text'] | trim }} + {%- endif %} + {%- endfor %} + {%- endif %} + {{- '<|eot_id|>' }} {%- elif 'tool_calls' in message %} {%- if not message.tool_calls|length == 1 %} {{- raise_exception("This model only supports single tool-calls at once!") }} @@ -80,10 +116,14 @@ {{- "<|eot_id|>" }} {%- elif message.role == "tool" or message.role == "ipython" %} {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} - {%- if message.content is mapping %} - {{- message.content | tojson }} - {%- else %} + {%- if message.content is string %} {{- { "output": message.content } | tojson }} + {%- else %} + {%- for content in message['content'] %} + {%- if content['type'] == 'text' %} + {{- { "output": content['text'] } | tojson }} + {%- endif %} + {%- endfor %} {%- endif %} {{- "<|eot_id|>" }} {%- endif %} diff --git a/vllm/examples/tool_chat_template_llama3.2_pythonic.jinja b/vllm/examples/tool_chat_template_llama3.2_pythonic.jinja new file mode 100644 index 000000000..8c38de6c6 --- /dev/null +++ b/vllm/examples/tool_chat_template_llama3.2_pythonic.jinja @@ -0,0 +1,98 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call functions, please respond with a python list of the calls. " }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a python list for function calls " }} + {{- "with their proper arguments to best answer the given prompt.\n\n" }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/vllm/examples/tool_chat_template_toolace.jinja b/vllm/examples/tool_chat_template_toolace.jinja new file mode 100644 index 000000000..a9b3b7189 --- /dev/null +++ b/vllm/examples/tool_chat_template_toolace.jinja @@ -0,0 +1,65 @@ +{{- bos_token }} + +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language." %} +{%- endif %} + +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You are an expert in composing functions. You are given a question and a set of possible functions. Based on the question, you will need to make one or more function/tool calls to achieve the purpose.\n" }} + {{- "If none of the function can be used, point it out. If the given question lacks the parameters required by the function, also point it out.\n" }} + {{- "You should only return the function call in tools call sections.\n\n" }} + {{- "If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n" }} + {{- "You SHOULD NOT include any other text in the response.\n" }} + {{- "Here is a list of functions in JSON format that you can invoke.\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "\n" }} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} + +{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} diff --git a/vllm/format.sh b/vllm/format.sh index be6ee0ce4..0b196de9d 100755 --- a/vllm/format.sh +++ b/vllm/format.sh @@ -41,21 +41,24 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) ISORT_VERSION=$(isort --vn) CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') +SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') # # params: tool name, tool version, required version tool_version_check() { - if [[ $2 != $3 ]]; then - echo "❓❓Wrong $1 version installed: $3 is required, not $2." + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2." exit 1 fi } -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" +tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" YAPF_FLAGS=( '--recursive' @@ -294,6 +297,14 @@ echo 'vLLM actionlint:' tools/actionlint.sh -color echo 'vLLM actionlint: Done' +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + +echo 'excalidraw png check:' +tools/png-lint.sh +echo 'excalidraw png check: Done' + if ! git diff --quiet &>/dev/null; then echo echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" @@ -304,3 +315,7 @@ if ! git diff --quiet &>/dev/null; then else echo "✨🎉 Format check passed! Congratulations! 🎉✨" fi + +echo 'vLLM sphinx-lint:' +tools/sphinx-lint.sh +echo 'vLLM sphinx-lint: Done' diff --git a/vllm/pyproject.toml b/vllm/pyproject.toml index e78f5652f..253b706a7 100644 --- a/vllm/pyproject.toml +++ b/vllm/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.5.0", + "torch == 2.5.1", "wheel", "jinja2", ] @@ -34,7 +34,7 @@ select = [ # Pyflakes "F", # pyupgrade - # "UP", + "UP", # flake8-bugbear "B", # flake8-simplify @@ -55,14 +55,12 @@ ignore = [ ] [tool.mypy] -python_version = "3.8" - ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", -# move the directory here and remove it from format.sh and mypy.yaml +# move the directory here and remove it from tools/mypy.sh files = [ "vllm/*.py", "vllm/adapter_commons", @@ -70,7 +68,7 @@ files = [ "vllm/entrypoints", "vllm/core", "vllm/inputs", - "vllm/logging", + "vllm/logging_utils", "vllm/multimodal", "vllm/platforms", "vllm/transformers_utils", @@ -95,6 +93,10 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ "skip_global_cleanup", - "core_model: run this model test in each PR instead of just daily", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", ] diff --git a/vllm/python_only_dev.py b/vllm/python_only_dev.py index 4ab203bb6..f70b49840 100644 --- a/vllm/python_only_dev.py +++ b/vllm/python_only_dev.py @@ -1,91 +1,14 @@ -# enable python only development -# copy compiled files to the current directory directly +msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation). -import argparse -import os -import shutil -import subprocess -import sys -import warnings +TL;DR: -parser = argparse.ArgumentParser( - description="Development mode for python-only code") -parser.add_argument('-q', - '--quit-dev', - action='store_true', - help='Set the flag to quit development mode') -args = parser.parse_args() +VLLM_USE_PRECOMPILED=1 pip install -e . -# cannot directly `import vllm` , because it will try to -# import from the current directory -output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"], - capture_output=True) +or -assert output.returncode == 0, "vllm is not installed" +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +pip install -e . +""" # noqa -text = output.stdout.decode("utf-8") - -package_path = None -for line in text.split("\n"): - if line.startswith("Location: "): - package_path = line.split(": ")[1] - break - -assert package_path is not None, "could not find package path" - -cwd = os.getcwd() - -assert cwd != package_path, "should not import from the current directory" - -files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", - "vllm/vllm_flash_attn/flash_attn_interface.py", - "vllm/vllm_flash_attn/__init__.py", - # "vllm/_version.py", # not available in nightly wheels yet -] - -# Try to create _version.py to avoid version related warning -# Refer to https://github.com/vllm-project/vllm/pull/8771 -try: - from setuptools_scm import get_version - get_version(write_to="vllm/_version.py") -except ImportError: - warnings.warn( - "To avoid warnings related to vllm._version, " - "you should install setuptools-scm by `pip install setuptools-scm`", - stacklevel=2) - -if not args.quit_dev: - for file in files_to_copy: - src = os.path.join(package_path, file) - dst = file - print(f"Copying {src} to {dst}") - shutil.copyfile(src, dst) - - pre_built_vllm_path = os.path.join(package_path, "vllm") - tmp_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - os.rename(pre_built_vllm_path, tmp_path) - - print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") - os.symlink(current_vllm_path, pre_built_vllm_path) -else: - vllm_symlink_path = os.path.join(package_path, "vllm") - vllm_backup_path = os.path.join(package_path, "vllm_pre_built") - current_vllm_path = os.path.join(cwd, "vllm") - - print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}") - assert os.path.islink( - vllm_symlink_path - ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link" - assert current_vllm_path == os.readlink( - vllm_symlink_path - ), "current directory is not the source code of package" - os.unlink(vllm_symlink_path) - - print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}") - os.rename(vllm_backup_path, vllm_symlink_path) +print(msg) diff --git a/vllm/requirements-build.txt b/vllm/requirements-build.txt index ea2b688bb..fec01caaf 100644 --- a/vllm/requirements-build.txt +++ b/vllm/requirements-build.txt @@ -1,9 +1,9 @@ -# Should be mirrored in pyproject.toml -cmake>=3.26 -ninja -packaging -setuptools>=61 -setuptools-scm>=8 -torch==2.5.0 -wheel -jinja2 +# Should be mirrored in pyproject.toml +cmake>=3.26 +ninja +packaging +setuptools>=61 +setuptools-scm>=8 +torch==2.5.1 +wheel +jinja2 diff --git a/vllm/requirements-common.txt b/vllm/requirements-common.txt index d72cc4476..72fb020a8 100644 --- a/vllm/requirements-common.txt +++ b/vllm/requirements-common.txt @@ -10,25 +10,26 @@ protobuf # Required by LlamaTokenizer. fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp -openai >= 1.40.0 # Ensure modern openai package (ensure types module present) +openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support) uvicorn[standard] pydantic >= 2.9 # Required for fastapi >= 0.113.0 pillow # Required for image processing prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.6 +lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 +xgrammar >= 0.1.5; platform_machine == "x86_64" typing_extensions >= 4.10 -filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 partial-json-parser # used for parsing partial JSON outputs pyzmq msgspec gguf == 0.10.0 importlib_metadata -mistral_common[opencv] >= 1.4.4 +mistral_common[opencv] >= 1.5.0 pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors diff --git a/vllm/requirements-cpu.txt b/vllm/requirements-cpu.txt index 27ca8ca5d..db8ad9d3a 100644 --- a/vllm/requirements-cpu.txt +++ b/vllm/requirements-cpu.txt @@ -1,6 +1,7 @@ # Common dependencies -r requirements-common.txt -# Dependencies for x86_64 CPUs -torch == 2.4.0+cpu; platform_machine != "ppc64le" -torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch +# Dependencies for CPUs +torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" +torch==2.5.1; platform_machine == "aarch64" +torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch \ No newline at end of file diff --git a/vllm/requirements-cuda.txt b/vllm/requirements-cuda.txt index 92fa303d6..058ab7c1e 100644 --- a/vllm/requirements-cuda.txt +++ b/vllm/requirements-cuda.txt @@ -3,8 +3,8 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 -nvidia-ml-py # for pynvml package -torch == 2.5.0 +nvidia-ml-py >= 12.560.30 # for pynvml package +torch == 2.5.1 # These must be updated alongside torch -torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0 +torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/vllm/requirements-hpu.txt b/vllm/requirements-hpu.txt new file mode 100644 index 000000000..4674efb81 --- /dev/null +++ b/vllm/requirements-hpu.txt @@ -0,0 +1,11 @@ +# Common dependencies +-r requirements-common.txt + +# Dependencies for HPU code +ray +triton +pandas +tabulate +setuptools>=61 +setuptools-scm>=8 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6 diff --git a/vllm/requirements-lint.txt b/vllm/requirements-lint.txt index 07f738873..711bb50a0 100644 --- a/vllm/requirements-lint.txt +++ b/vllm/requirements-lint.txt @@ -1,11 +1,12 @@ # formatting yapf==0.32.0 toml==0.10.2 -tomli==2.0.1 +tomli==2.0.2 ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 +sphinx-lint==1.0.0 # type checking mypy==1.11.1 diff --git a/vllm/requirements-openvino.txt b/vllm/requirements-openvino.txt index 7ad0d1e7f..95e591475 100644 --- a/vllm/requirements-openvino.txt +++ b/vllm/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.5.0 # should be aligned with "common" vLLM torch version +torch == 2.5.1 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/vllm/requirements-test.in b/vllm/requirements-test.in new file mode 100644 index 000000000..44972866d --- /dev/null +++ b/vllm/requirements-test.in @@ -0,0 +1,34 @@ +# testing +pytest +tensorizer>=2.9.0 +pytest-forked +pytest-asyncio +pytest-rerunfailures +pytest-shard + +# testing utils +awscli +decord # required for video tests +einops # required for MPT, qwen-vl and Mamba +httpx +librosa # required for audio tests +peft +ray[adag]==2.35 +sentence-transformers # required for embedding tests +soundfile # required for audio tests +timm # required for internvl test +torch==2.5.1 +transformers_stream_generator # required for qwen-vl test +matplotlib # required for qwen-vl test +mistral_common[opencv] >= 1.5.0 # required for pixtral test +datamodel_code_generator # required for minicpm3 test +lm-eval[api]==0.4.4 # required for model evaluation test + +# TODO: Add this after fully implementing llava(mantis) +# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test + +# quantization +bitsandbytes>=0.44.0 +buildkite-test-collector==0.1.9 + +numpy < 2.0.0 diff --git a/vllm/requirements-test.txt b/vllm/requirements-test.txt index 9787fa2a4..19369254d 100644 --- a/vllm/requirements-test.txt +++ b/vllm/requirements-test.txt @@ -1,34 +1,599 @@ -# testing -pytest -tensorizer>=2.9.0 -pytest-forked -pytest-asyncio -pytest-rerunfailures -pytest-shard +# +# This file is autogenerated by pip-compile with Python 3.9 +# by the following command: +# +# pip-compile requirements-test.in +# +absl-py==2.1.0 + # via rouge-score +accelerate==1.0.1 + # via + # lm-eval + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # datasets + # fsspec + # lm-eval +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via httpx +argcomplete==3.5.1 + # via datamodel-code-generator +async-timeout==4.0.3 + # via + # aiohttp + # redis +attrs==24.2.0 + # via + # aiohttp + # jsonlines + # jsonschema + # referencing +audioread==3.0.1 + # via librosa +awscli==1.35.23 + # via -r requirements-test.in +bitsandbytes==0.44.1 + # via -r requirements-test.in +black==24.10.0 + # via datamodel-code-generator +boto3==1.35.57 + # via tensorizer +botocore==1.35.57 + # via + # awscli + # boto3 + # s3transfer +buildkite-test-collector==0.1.9 + # via -r requirements-test.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via soundfile +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # black + # nltk + # ray +colorama==0.4.6 + # via + # awscli + # sacrebleu + # tqdm-multiprocess +contourpy==1.3.0 + # via matplotlib +cupy-cuda12x==13.3.0 + # via ray +cycler==0.12.1 + # via matplotlib +datamodel-code-generator==0.26.3 + # via -r requirements-test.in +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +datasets==3.0.2 + # via + # evaluate + # lm-eval +decorator==5.1.1 + # via librosa +decord==0.6.0 + # via -r requirements-test.in +dill==0.3.8 + # via + # datasets + # evaluate + # lm-eval + # multiprocess +dnspython==2.7.0 + # via email-validator +docutils==0.16 + # via awscli +einops==0.8.0 + # via -r requirements-test.in +email-validator==2.2.0 + # via pydantic +evaluate==0.4.3 + # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest +fastrlock==0.8.2 + # via cupy-cuda12x +filelock==3.16.1 + # via + # datasets + # huggingface-hub + # ray + # torch + # transformers + # triton +fonttools==4.54.1 + # via matplotlib +frozenlist==1.5.0 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.9.0 + # via + # datasets + # evaluate + # huggingface-hub + # torch +genson==1.3.0 + # via datamodel-code-generator +h11==0.14.0 + # via httpcore +hiredis==3.0.0 + # via tensorizer +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via -r requirements-test.in +huggingface-hub==0.26.2 + # via + # accelerate + # datasets + # evaluate + # peft + # sentence-transformers + # timm + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # email-validator + # httpx + # requests + # yarl +importlib-resources==6.4.5 + # via matplotlib +inflect==5.6.2 + # via datamodel-code-generator +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via datamodel-code-generator +jinja2==3.1.4 + # via + # datamodel-code-generator + # torch +jmespath==1.0.1 + # via + # boto3 + # botocore +joblib==1.4.2 + # via + # librosa + # nltk + # scikit-learn +jsonlines==4.0.0 + # via lm-eval +jsonschema==4.23.0 + # via + # mistral-common + # ray +jsonschema-specifications==2024.10.1 + # via jsonschema +kiwisolver==1.4.7 + # via matplotlib +lazy-loader==0.4 + # via librosa +libnacl==2.1.0 + # via tensorizer +librosa==0.10.2.post1 + # via -r requirements-test.in +llvmlite==0.43.0 + # via numba +lm-eval[api]==0.4.4 + # via -r requirements-test.in +lxml==5.3.0 + # via sacrebleu +markupsafe==3.0.2 + # via jinja2 +matplotlib==3.9.2 + # via -r requirements-test.in +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +mistral-common[opencv]==1.5.1 + # via + # -r requirements-test.in + # mistral-common +more-itertools==10.5.0 + # via lm-eval +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via + # librosa + # ray +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 + # via black +networkx==3.2.1 + # via torch +nltk==3.9.1 + # via rouge-score +numba==0.60.0 + # via librosa +numexpr==2.10.1 + # via lm-eval +numpy==1.26.4 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # contourpy + # cupy-cuda12x + # datasets + # decord + # evaluate + # librosa + # matplotlib + # mistral-common + # numba + # numexpr + # opencv-python-headless + # pandas + # peft + # rouge-score + # sacrebleu + # scikit-learn + # scipy + # soxr + # tensorizer + # torchvision + # transformers +nvidia-cublas-cu12==12.4.5.8 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.4.127 + # via torch +nvidia-cuda-nvrtc-cu12==12.4.127 + # via torch +nvidia-cuda-runtime-cu12==12.4.127 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.2.1.3 + # via torch +nvidia-curand-cu12==10.3.5.147 + # via torch +nvidia-cusolver-cu12==11.6.1.9 + # via torch +nvidia-cusparse-cu12==12.3.1.170 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-nccl-cu12==2.21.5 + # via torch +nvidia-nvjitlink-cu12==12.4.127 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 + # torch +nvidia-nvtx-cu12==12.4.127 + # via torch +opencv-python-headless==4.10.0.84 + # via mistral-common +packaging==24.1 + # via + # accelerate + # black + # datamodel-code-generator + # datasets + # evaluate + # huggingface-hub + # lazy-loader + # matplotlib + # peft + # pooch + # pytest + # pytest-rerunfailures + # ray + # transformers + # typepy +pandas==2.2.3 + # via + # datasets + # evaluate +pathspec==0.12.1 + # via black +pathvalidate==3.2.1 + # via pytablewriter +peft==0.13.2 + # via + # -r requirements-test.in + # lm-eval +pillow==10.4.0 + # via + # matplotlib + # mistral-common + # sentence-transformers + # torchvision +platformdirs==4.3.6 + # via + # black + # pooch +pluggy==1.5.0 + # via pytest +pooch==1.8.2 + # via librosa +portalocker==2.10.1 + # via sacrebleu +propcache==0.2.0 + # via yarl +protobuf==5.28.3 + # via + # ray + # tensorizer +psutil==6.1.0 + # via + # accelerate + # peft + # tensorizer +py==1.11.0 + # via pytest-forked +pyarrow==18.0.0 + # via datasets +pyasn1==0.6.1 + # via rsa +pybind11==2.13.6 + # via lm-eval +pycparser==2.22 + # via cffi +pydantic[email]==2.9.2 + # via + # datamodel-code-generator + # mistral-common +pydantic-core==2.23.4 + # via pydantic +pyparsing==3.2.0 + # via matplotlib +pytablewriter==1.2.0 + # via lm-eval +pytest==8.3.3 + # via + # -r requirements-test.in + # buildkite-test-collector + # pytest-asyncio + # pytest-forked + # pytest-rerunfailures + # pytest-shard +pytest-asyncio==0.24.0 + # via -r requirements-test.in +pytest-forked==1.6.0 + # via -r requirements-test.in +pytest-rerunfailures==14.0 + # via -r requirements-test.in +pytest-shard==0.1.2 + # via -r requirements-test.in +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas + # typepy +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via + # accelerate + # awscli + # datamodel-code-generator + # datasets + # huggingface-hub + # peft + # ray + # timm + # transformers +ray[adag]==2.35.0 + # via -r requirements-test.in +redis==5.2.0 + # via tensorizer +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications +regex==2024.9.11 + # via + # nltk + # sacrebleu + # tiktoken + # transformers +requests==2.32.3 + # via + # buildkite-test-collector + # datasets + # evaluate + # huggingface-hub + # lm-eval + # mistral-common + # pooch + # ray + # tiktoken + # transformers +rouge-score==0.1.2 + # via lm-eval +rpds-py==0.20.1 + # via + # jsonschema + # referencing +rsa==4.7.2 + # via awscli +s3transfer==0.10.3 + # via + # awscli + # boto3 +sacrebleu==2.4.3 + # via lm-eval +safetensors==0.4.5 + # via + # accelerate + # peft + # timm + # transformers +scikit-learn==1.5.2 + # via + # librosa + # lm-eval + # sentence-transformers +scipy==1.13.1 + # via + # librosa + # scikit-learn + # sentence-transformers +sentence-transformers==3.2.1 + # via -r requirements-test.in +sentencepiece==0.2.0 + # via mistral-common +six==1.16.0 + # via + # python-dateutil + # rouge-score +sniffio==1.3.1 + # via + # anyio + # httpx +soundfile==0.12.1 + # via + # -r requirements-test.in + # librosa +soxr==0.5.0.post1 + # via librosa +sqlitedict==2.1.0 + # via lm-eval +sympy==1.13.1 + # via torch +tabledata==1.3.3 + # via pytablewriter +tabulate==0.9.0 + # via sacrebleu +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via lm-eval +tensorizer==2.9.0 + # via -r requirements-test.in +threadpoolctl==3.5.0 + # via scikit-learn +tiktoken==0.7.0 + # via + # lm-eval + # mistral-common +timm==1.0.11 + # via -r requirements-test.in +tokenizers==0.20.3 + # via transformers +toml==0.10.2 + # via datamodel-code-generator +tomli==2.0.2 + # via + # black + # pytest +torch==2.5.1 + # via + # -r requirements-test.in + # accelerate + # bitsandbytes + # lm-eval + # peft + # sentence-transformers + # tensorizer + # timm + # torchvision +torchvision==0.20.1 + # via timm +tqdm==4.66.6 + # via + # datasets + # evaluate + # huggingface-hub + # lm-eval + # nltk + # peft + # sentence-transformers + # tqdm-multiprocess + # transformers +tqdm-multiprocess==0.0.11 + # via lm-eval +transformers==4.46.3 + # via + # lm-eval + # peft + # sentence-transformers + # transformers-stream-generator +transformers-stream-generator==0.0.5 + # via -r requirements-test.in +triton==3.1.0 + # via torch +typepy[datetime]==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # anyio + # black + # huggingface-hub + # librosa + # mistral-common + # multidict + # pydantic + # pydantic-core + # torch +tzdata==2024.2 + # via pandas +urllib3==1.26.20 + # via + # botocore + # requests +word2number==1.1 + # via lm-eval +xxhash==3.5.0 + # via + # datasets + # evaluate +yarl==1.17.1 + # via aiohttp +zipp==3.20.2 + # via importlib-resources +zstandard==0.23.0 + # via lm-eval -# testing utils -awscli -einops # required for MPT, qwen-vl and Mamba -httpx -librosa # required for audio tests -opencv-python # required for video tests -peft -requests -ray[adag]==2.35 -sentence-transformers # required for embedding -soundfile # required for audio test -timm # required for internvl test -transformers_stream_generator # required for qwen-vl test -matplotlib # required for qwen-vl test -datamodel_code_generator # required for minicpm3 test -lm-eval[api]==0.4.4 # required for model evaluation test - -# TODO: Add this after fully implementing llava(mantis) -# git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test - -# Benchmarking -aiohttp - -# quantization -bitsandbytes>=0.44.0 -buildkite-test-collector==0.1.8 +# The following packages are considered to be unsafe in a requirements file: +# setuptools diff --git a/vllm/requirements-tpu.txt b/vllm/requirements-tpu.txt index 4c606cf0a..b8f0b1546 100644 --- a/vllm/requirements-tpu.txt +++ b/vllm/requirements-tpu.txt @@ -2,6 +2,22 @@ -r requirements-common.txt # Dependencies for TPU -# Currently, the TPU backend uses a nightly version of PyTorch XLA. -# You can install the dependencies in Dockerfile.tpu. +cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 ray[default] + +# Install torch_xla +--pre +--extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-releases/index.html +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html +torch==2.6.0.dev20241126+cpu +torchvision==0.20.0.dev20241126+cpu +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +jaxlib==0.4.36.dev20241122 +jax==0.4.36.dev20241122 diff --git a/vllm/requirements-xpu.txt b/vllm/requirements-xpu.txt index ce83a178c..e41295792 100644 --- a/vllm/requirements-xpu.txt +++ b/vllm/requirements-xpu.txt @@ -8,9 +8,9 @@ packaging setuptools-scm>=8 wheel jinja2 -# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -torch == 2.3.1+cxx11.abi -intel-extension-for-pytorch == 2.3.110+xpu -oneccl_bind_pt == 2.3.100+xpu -triton-xpu == 3.0.0b2 +torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl + +triton-xpu == 3.0.0b1 diff --git a/vllm/setup.py b/vllm/setup.py index 4af809bb4..ec10d6fab 100644 --- a/vllm/setup.py +++ b/vllm/setup.py @@ -1,5 +1,4 @@ import importlib.util -import io import logging import os import re @@ -56,12 +55,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def remove_prefix(text, prefix): - if text.startswith(prefix): - return text[len(prefix):] - return text - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: @@ -198,8 +191,10 @@ def build_extensions(self) -> None: os.makedirs(self.build_temp) targets = [] - target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."), - "vllm_flash_attn.") + + def target_name(s: str) -> str: + return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.") + # Build all the extensions for ext in self.extensions: self.configure(ext) @@ -254,6 +249,92 @@ def run(self): self.copy_file(file, dst_file) +class repackage_wheel(build_ext): + """Extracts libraries and other files from an existing wheel.""" + default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + + def run(self) -> None: + wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", + self.default_wheel) + + assert _is_cuda( + ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds" + + import zipfile + + if os.path.isfile(wheel_location): + wheel_path = wheel_location + print(f"Using existing wheel={wheel_path}") + else: + # Download the wheel from a given URL, assume + # the filename is the last part of the URL + wheel_filename = wheel_location.split("/")[-1] + + import tempfile + + # create a temporary directory to store the wheel + temp_dir = tempfile.mkdtemp(prefix="vllm-wheels") + wheel_path = os.path.join(temp_dir, wheel_filename) + + print(f"Downloading wheel from {wheel_location} to {wheel_path}") + + from urllib.request import urlretrieve + + try: + urlretrieve(wheel_location, filename=wheel_path) + except Exception as e: + from setuptools.errors import SetupError + + raise SetupError( + f"Failed to get vLLM wheel from {wheel_location}") from e + + with zipfile.ZipFile(wheel_path) as wheel: + files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/flash_attn_interface.py", + "vllm/vllm_flash_attn/__init__.py", + # "vllm/_version.py", # not available in nightly wheels yet + ] + file_members = filter(lambda x: x.filename in files_to_copy, + wheel.filelist) + + for file in file_members: + print(f"Extracting and including {file.filename} " + "from existing wheel") + package_name = os.path.dirname(file.filename).replace("/", ".") + file_name = os.path.basename(file.filename) + + if package_name not in package_data: + package_data[package_name] = [] + + wheel.extract(file) + if file_name.endswith(".py"): + # python files shouldn't be added to package_data + continue + + package_data[package_name].append(file_name) + + +def _is_hpu() -> bool: + is_hpu_available = True + try: + subprocess.run(["hl-smi"], capture_output=True, check=True) + except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): + if not os.path.exists('/dev/accel/accel0') and not os.path.exists( + '/dev/accel/accel_controlD0'): + # last resort... + try: + output = subprocess.check_output( + 'lsmod | grep habanalabs | wc -l', shell=True) + is_hpu_available = int(output) > 0 + except (ValueError, FileNotFoundError, PermissionError, + subprocess.CalledProcessError): + is_hpu_available = False + return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" + + def _no_device() -> bool: return VLLM_TARGET_DEVICE == "empty" @@ -261,7 +342,7 @@ def _no_device() -> bool: def _is_cuda() -> bool: has_cuda = torch.version.cuda is not None return (VLLM_TARGET_DEVICE == "cuda" and has_cuda - and not (_is_neuron() or _is_tpu())) + and not (_is_neuron() or _is_tpu() or _is_hpu())) def _is_hip() -> bool: @@ -327,7 +408,7 @@ def get_neuronxcc_version(): "__init__.py") # Check if the command was executed successfully - with open(version_file, "rt") as fp: + with open(version_file) as fp: content = fp.read() # Extract the version using a regular expression @@ -357,6 +438,22 @@ def get_path(*filepath) -> str: return os.path.join(ROOT_DIR, *filepath) +def get_gaudi_sw_version(): + """ + Returns the driver version. + """ + # Enable console printing for `hl-smi` check + output = subprocess.run("hl-smi", + shell=True, + text=True, + capture_output=True, + env={"ENABLE_CONSOLE": "true"}) + if output.returncode == 0 and output.stdout: + return output.stdout.split("\n")[2].replace( + " ", "").split(":")[1][:-1].split("-")[0] + return "0.0.0" # when hl-smi is not available + + def get_vllm_version() -> str: version = get_version( write_to="vllm/vllm/_version.py", # TODO: move this to pyproject.toml @@ -369,12 +466,15 @@ def get_vllm_version() -> str: if envs.VLLM_TARGET_DEVICE == "empty": version += f"{sep}empty" elif _is_cuda(): - cuda_version = str(get_nvcc_cuda_version()) - if cuda_version != MAIN_CUDA_VERSION: - cuda_version_str = cuda_version.replace(".", "")[:3] - # skip this for source tarball, required for pypi - if "sdist" not in sys.argv: - version += f"{sep}cu{cuda_version_str}" + if envs.VLLM_USE_PRECOMPILED: + version += ".precompiled" + else: + cuda_version = str(get_nvcc_cuda_version()) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + # skip this for source tarball, required for pypi + if "sdist" not in sys.argv: + version += f"{sep}cu{cuda_version_str}" elif _is_hip(): # Get the HIP version hipcc_version = get_hipcc_rocm_version() @@ -387,6 +487,12 @@ def get_vllm_version() -> str: if neuron_version != MAIN_CUDA_VERSION: neuron_version_str = neuron_version.replace(".", "")[:3] version += f"{sep}neuron{neuron_version_str}" + elif _is_hpu(): + # Get the Intel Gaudi Software Suite version + gaudi_sw_version = str(get_gaudi_sw_version()) + if gaudi_sw_version != MAIN_CUDA_VERSION: + gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3] + version += f"{sep}gaudi{gaudi_sw_version}" elif _is_openvino(): version += f"{sep}openvino" elif _is_tpu(): @@ -405,7 +511,8 @@ def read_readme() -> str: """Read the README file if present.""" p = get_path("README.md") if os.path.isfile(p): - return io.open(get_path("README.md"), "r", encoding="utf-8").read() + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() else: return "" @@ -444,6 +551,8 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-rocm.txt") elif _is_neuron(): requirements = _read_requirements("requirements-neuron.txt") + elif _is_hpu(): + requirements = _read_requirements("requirements-hpu.txt") elif _is_openvino(): requirements = _read_requirements("requirements-openvino.txt") elif _is_tpu(): @@ -454,7 +563,7 @@ def _read_requirements(filename: str) -> List[str]: requirements = _read_requirements("requirements-xpu.txt") else: raise ValueError( - "Unsupported platform, please use CUDA, ROCm, Neuron, " + "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " "OpenVINO, or CPU.") return requirements @@ -477,13 +586,18 @@ def _read_requirements(filename: str) -> List[str]: package_data = { "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"] } -if envs.VLLM_USE_PRECOMPILED: - ext_modules = [] - package_data["vllm"].append("*.so") if _no_device(): ext_modules = [] +if not ext_modules: + cmdclass = {} +else: + cmdclass = { + "build_ext": + repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext + } + setup( name="vllm", version=get_vllm_version(), @@ -499,7 +613,6 @@ def _read_requirements(filename: str) -> List[str]: "Documentation": "https://vllm.readthedocs.io/en/latest/", }, classifiers=[ - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -513,14 +626,15 @@ def _read_requirements(filename: str) -> List[str]: ], packages=find_packages(exclude=("benchmarks", "csrc", "docs", "examples", "tests*")), - python_requires=">=3.8", + python_requires=">=3.9", install_requires=get_requirements(), ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], - "audio": ["librosa", "soundfile"] # Required for audio processing + "audio": ["librosa", "soundfile"], # Required for audio processing + "video": ["decord"] # Required for video processing }, - cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, + cmdclass=cmdclass, package_data=package_data, entry_points={ "console_scripts": [ diff --git a/vllm/tests/basic_correctness/test_basic_correctness.py b/vllm/tests/basic_correctness/test_basic_correctness.py index 79647589d..fcba253d1 100644 --- a/vllm/tests/basic_correctness/test_basic_correctness.py +++ b/vllm/tests/basic_correctness/test_basic_correctness.py @@ -14,11 +14,12 @@ from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +from ..conftest import VllmRunner from ..models.utils import check_outputs_equal from ..utils import multi_gpu_test MODELS = [ - "facebook/opt-125m", + "google/gemma-2-2b-it", "meta-llama/Llama-3.2-1B", ] @@ -42,8 +43,6 @@ def test_vllm_gc_ed(): @pytest.mark.parametrize("enforce_eager", [False, True]) def test_models( hf_runner, - vllm_runner, - example_prompts, model: str, backend: str, dtype: str, @@ -54,15 +53,27 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") + if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + pytest.skip( + "XFORMERS does not support gemma2 with full context length.") + os.environ["VLLM_ATTENTION_BACKEND"] = backend + # 5042 tokens for gemma2 + # gemma2 has alternating sliding window size of 4096 + # we need a prompt with more than 4096 tokens to test the sliding window + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] + with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, - dtype=dtype, - enforce_eager=enforce_eager, - gpu_memory_utilization=0.7) as vllm_model: + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=enforce_eager, + gpu_memory_utilization=0.7) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) check_outputs_equal( @@ -156,3 +167,29 @@ def test_model_with_failure(vllm_runner) -> None: ModelInputForGPUWithSamplingMetadata) finally: os.remove(filename) + + +def test_failure_with_async_out_proc(vllm_runner) -> None: + + filename = None + try: + with vllm_runner("facebook/opt-125m", + dtype="half", + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model,\ + patch("vllm.model_executor.models.opt.OPTForCausalLM.forward", + side_effect=ValueError()): + model_config = vllm_model.model.llm_engine.model_config + assert model_config.use_async_output_proc + with pytest.raises(ValueError) as exc_info: + vllm_model.generate_greedy('how to make pizza?', 250) + matches = re.search(r"input dumped to (.+).pkl", + str(exc_info.value)) + assert matches is not None + + filename = f"{matches.group(1)}.pkl" + finally: + # Clean up + if filename is not None: + os.remove(filename) + pass diff --git a/vllm/tests/basic_correctness/test_chunked_prefill.py b/vllm/tests/basic_correctness/test_chunked_prefill.py index 51aec8c87..469d18a4d 100644 --- a/vllm/tests/basic_correctness/test_chunked_prefill.py +++ b/vllm/tests/basic_correctness/test_chunked_prefill.py @@ -11,6 +11,9 @@ import pytest +from tests.kernels.utils import override_backend_env_variable +from vllm.platforms import current_platform + from ..models.utils import check_logprobs_close, check_outputs_equal from ..utils import multi_gpu_test @@ -28,6 +31,7 @@ # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models( hf_runner, vllm_runner, @@ -38,11 +42,15 @@ def test_models( chunked_prefill_token_size: int, enforce_eager: bool, tensor_parallel_size: int, + attention_backend: str, + monkeypatch, ) -> None: """ Checks exact match decode between huggingface model and vllm runner with chunked prefill. """ + override_backend_env_variable(monkeypatch, attention_backend) + max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size @@ -71,13 +79,18 @@ def test_models( @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) def test_models_distributed( hf_runner, vllm_runner, example_prompts, model: str, distributed_executor_backend: str, + attention_backend: str, + monkeypatch, ) -> None: + override_backend_env_variable(monkeypatch, attention_backend) + if (model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray"): # test ray adag @@ -194,12 +207,14 @@ def test_models_with_fp8_kv_cache( # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. @pytest.mark.parametrize("tensor_parallel_size", [1]) +@pytest.mark.parametrize("dtype", ["half"]) def test_with_prefix_caching( vllm_runner, max_tokens: int, enforce_eager: bool, chunk_size: int, tensor_parallel_size: int, + dtype: str, ) -> None: """ Checks exact match decode with and without prefix caching @@ -221,7 +236,7 @@ def test_with_prefix_caching( for enable in (True, False): with vllm_runner( model, - dtype="half", + dtype=dtype, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=True, enable_prefix_caching=enable, @@ -248,3 +263,61 @@ def test_with_prefix_caching( name_0="w/o prefix caching", name_1="with prefix caching", ) + + +@pytest.mark.parametrize("model", ["facebook/opt-125m"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("attention_backend", ["TORCH_SDPA"]) +@pytest.mark.cpu_model +@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") +def test_models_cpu( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + max_tokens: int, + chunked_prefill_token_size: int, + enforce_eager: bool, + attention_backend: str, + monkeypatch, +) -> None: + test_models( + hf_runner, + vllm_runner, + example_prompts, + model, + dtype, + max_tokens, + chunked_prefill_token_size, + enforce_eager, + 1, + attention_backend, + monkeypatch, + ) + + +@pytest.mark.parametrize("max_tokens", [16]) +@pytest.mark.parametrize("enforce_eager", [False]) +@pytest.mark.parametrize("chunk_size", [30, 32]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.cpu_model +@pytest.mark.skipif(not current_platform.is_cpu(), reason="CPU only") +def test_with_prefix_caching_cpu( + vllm_runner, + max_tokens: int, + enforce_eager: bool, + chunk_size: int, + dtype: str, +) -> None: + test_with_prefix_caching( + vllm_runner, + max_tokens, + enforce_eager, + chunk_size, + 1, + dtype, + ) diff --git a/vllm/tests/compile/backend.py b/vllm/tests/compile/backend.py new file mode 100644 index 000000000..8fa10e5bd --- /dev/null +++ b/vllm/tests/compile/backend.py @@ -0,0 +1,37 @@ +from copy import deepcopy +from typing import Callable, Union + +from torch import fx + +from vllm.compilation.inductor_pass import InductorPass + + +class TestBackend: + """ + This class provides a simple Inductor backend that can be used for testing. + It takes a list of custom passes and runs them after Inductor's passes. + It also saves the graph before and after the custom passes for inspection. + """ + + def __init__(self, *passes: Union[InductorPass, Callable[[fx.Graph], + None]]): + self.custom_passes = list(passes) + from torch._inductor import config + self.current_config = config.shallow_copy_dict() + self.current_config['force_disable_caches'] = True + self.current_config['post_grad_custom_post_pass'] = self.post_pass + + def __call__(self, graph: fx.GraphModule, example_inputs): + from torch._inductor.compile_fx import compile_fx + return compile_fx(graph, + example_inputs, + config_patches=self.current_config) + + def post_pass(self, graph: fx.Graph): + self.graph_pre_pass = deepcopy(graph) + for pass_ in self.custom_passes: + pass_(graph) + + self.graph_post_pass = deepcopy(graph) + # assign by reference, will reflect the final state of the graph + self.final_graph = graph diff --git a/vllm/tests/compile/piecewise/__init__.py b/vllm/tests/compile/piecewise/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm/tests/compile/piecewise/test_simple.py b/vllm/tests/compile/piecewise/test_simple.py new file mode 100644 index 000000000..aa1152481 --- /dev/null +++ b/vllm/tests/compile/piecewise/test_simple.py @@ -0,0 +1,109 @@ +""" +Test the piecewise compilation with a simple model so that we +can exactly calculate the expected output and side effects. +""" + +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) +from vllm.utils import direct_register_custom_op + +global_counter = 0 + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + global global_counter + global_counter += 1 + print(f"{global_counter=}") + out.copy_(q) + out[0] += 1 + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@support_torch_compile +class SillyModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Overall effect: + x += 1 + x[0] += 2 + global_counter += 2 + """ + x = x + 1 + x = x + 2 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x - 2 + x = x - 1 + out = torch.empty_like(x) + torch.ops.silly.attention(x, x, x, out) + x = out + x = x + 1 + return x + + +def test_simple_piecewise_compile(): + + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_copy_inputs=True, + cudagraph_capture_sizes=[1, 2], + )) + with set_current_vllm_config(vllm_config): + model = SillyModel(vllm_config=vllm_config, prefix='') + + inputs = torch.randn(100).cuda() + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=5, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=3, # 1 + num_layers + num_inductor_compilations=3, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + + model(inputs) + + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) + + input = torch.zeros(2).cuda() + global global_counter + global_counter = 0 + output = model(input) + assert global_counter == 2 + assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) diff --git a/vllm/tests/compile/piecewise/test_toy_llama.py b/vllm/tests/compile/piecewise/test_toy_llama.py new file mode 100644 index 000000000..07c10a3a1 --- /dev/null +++ b/vllm/tests/compile/piecewise/test_toy_llama.py @@ -0,0 +1,436 @@ +""" +Test the piecewise compilation with a simple model, comparing the output +with and without the piecewise compilation. + +This is a tractable model, the weights and computation are specially designed +if the config `tractable_init` is set to True. Otherwise, the weights are +initialized randomly with a fixed seed. +""" +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import nn +from torch.library import Library + +from vllm.compilation.counter import compilation_counter +from vllm.compilation.decorators import support_torch_compile +from vllm.config import (CompilationConfig, CompilationLevel, VllmConfig, + set_current_vllm_config) +from vllm.utils import direct_register_custom_op + +# create a library to hold the custom op +silly_lib = Library("silly", "FRAGMENT") # noqa + + +def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + out.copy_(q) + out += k + out += v + + +def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, + out: torch.Tensor) -> None: + return + + +direct_register_custom_op( + op_name="attention", + op_func=silly_attention, + mutates_args=["out"], + fake_impl=silly_attention_fake, + target_lib=silly_lib, +) + + +@dataclass +class LlamaConfig: + hidden_size: int = 128 + mlp_size: int = 256 + vocab_size: int = 128 + num_layers: int = 2 + init_value: float = 1.0 + tractable_init: bool = False + random_seed: int = 0 + + def __post_init__(self): + assert self.mlp_size >= self.hidden_size + + +class LlamaMLP(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.gate_up_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.mlp_size * 2, + bias=False, + ) + self.down_projection = nn.Linear( + in_features=config.mlp_size, + out_features=config.hidden_size, + bias=False, + ) + + if config.tractable_init: + nn.init.eye_(self.gate_up_projection.weight.data[:config.mlp_size]) + nn.init.eye_(self.gate_up_projection.weight.data[config.mlp_size:]) + nn.init.eye_(self.down_projection.weight.data) + else: + nn.init.xavier_normal_(self.gate_up_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.down_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + + def forward(self, x): + # for tractable_init and positive input, this is + # essentially an elementwise-square + x = self.gate_up_projection(x) + x = x[:, :x.size(1) // 2] * torch.nn.functional.relu( + x[:, x.size(1) // 2:]) + x = self.down_projection(x) + return x + + +class LlamaAttention(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.qkv_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size * 3, + bias=False, + ) + + self.output_projection = nn.Linear( + in_features=config.hidden_size, + out_features=config.hidden_size, + bias=False, + ) + + if config.tractable_init: + nn.init.eye_(self.qkv_projection.weight.data[:config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[config.hidden_size:2 * + config.hidden_size]) + nn.init.eye_(self.qkv_projection.weight.data[2 * + config.hidden_size:]) + nn.init.eye_(self.output_projection.weight.data) + else: + nn.init.xavier_normal_(self.qkv_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + nn.init.xavier_normal_(self.output_projection.weight.data, + generator=torch.Generator().manual_seed( + config.random_seed), + gain=0.001) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + # for tractable_init, this is: + # output = (hidden_states * 3 + positions * 2) + qkv = self.qkv_projection(hidden_states) + hidden_size = qkv.size(-1) // 3 + q, k, v = qkv.split([hidden_size, hidden_size, hidden_size], dim=-1) + + q = q + positions.unsqueeze(1) + k = k + positions.unsqueeze(1) + + attn_output = torch.empty_like(q) + torch.ops.silly.attention(q, k, v, attn_output) + + output = self.output_projection(attn_output) + return output + + +class LlamaDecoderLayer(nn.Module): + + def __init__(self, config: LlamaConfig) -> None: + super().__init__() + self.self_attention = LlamaAttention(config) + self.mlp = LlamaMLP(config) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + For tractable computation: + - if residual is None, the outputs are: + - residual = (hidden_states + 1) * 3 + positions * 2 + hidden_states = hidden_states * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + - if residual is not None, the outputs are: + - residual = (hidden_states + residual + 1) * 3 + positions * 2 + hidden_states + residual = (hidden_states + residual) * 4 + positions * 2 + 3 + - hidden_states = (residual + 1) ** 2 + """ # noqa + if residual is None: + residual = hidden_states + hidden_states = hidden_states + 1 + else: + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states + 1 + + hidden_states = self.self_attention(positions=positions, + hidden_states=hidden_states) + + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = hidden_states + 1 + hidden_states = self.mlp(hidden_states) + + return hidden_states, residual + + +@support_torch_compile +class LlamaModel(nn.Module): + + def __init__(self, + *, + vllm_config: VllmConfig, + config: LlamaConfig, + prefix: str = '', + **kwargs) -> None: + super().__init__() + self.embedding_tokens = nn.Embedding( + num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + ) + self.layers = nn.ModuleList( + [LlamaDecoderLayer(config) for _ in range(config.num_layers)]) + + # this is the initial value of the hidden states + self.embedding_tokens.weight.data.fill_(config.init_value) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + ) -> torch.Tensor: + hidden_states = self.embedding_tokens(input_ids) + residual = None + for layer in self.layers: + hidden_states, residual = layer(positions, hidden_states, residual) + return hidden_states + + +def tractable_computation(input_ids: torch.Tensor, + positions: torch.Tensor, + config: LlamaConfig, + init_value: float = 1.0) -> torch.Tensor: + hidden_states = torch.ones(input_ids.size(0), + config.hidden_size, + device=input_ids.device, + dtype=input_ids.dtype) * init_value + + # first layer + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + # following layers + for _ in range(config.num_layers - 1): + hidden_states = hidden_states + residual + residual = hidden_states * 4 + positions.unsqueeze(1) * 2 + 3 + hidden_states = (residual + 1)**2 + + return hidden_states + + +@torch.inference_mode +def run_model(llama_config, + use_compile: bool, + split_attn: bool = False) -> torch.Tensor: + + if use_compile: + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + cudagraph_capture_sizes=[1, 2], + ) + if split_attn: + compilation_config.splitting_ops = ["silly.attention"] + else: + compilation_config = CompilationConfig( + level=CompilationLevel.NO_COMPILATION, ) + + vllm_config = VllmConfig(compilation_config=compilation_config) + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda() + + B = 16 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda() + + model(input_ids, positions) + model(input_ids[:2], positions[:2]) + model(input_ids[:1], positions[:1]) + + input_ids[:2].zero_() + output = model(input_ids[:2], positions[:2]) + + output = output.cpu() + + if llama_config.tractable_init: + expected_output = tractable_computation(input_ids[:2], positions[:2], + llama_config).cpu() + + assert torch.allclose(output, expected_output) + else: + return output.cpu() + + +def test_toy_llama(): + # compare output with and without piecewise compilation + + llama_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=12) + + tractable_config = LlamaConfig(hidden_size=128, + mlp_size=256, + vocab_size=128, + num_layers=2, + tractable_init=True) + + outputs = [] + with compilation_counter.expect( + num_graphs_seen=0, + num_piecewise_graphs_seen=0, + num_piecewise_capturable_graphs_seen=0, + num_inductor_compilations=0, + num_cudagraph_caputured=0, + ): + outputs.append(run_model(llama_config, use_compile=False)) + run_model(tractable_config, use_compile=False) + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=1, + num_piecewise_capturable_graphs_seen=1, + num_inductor_compilations=1, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured= + 2, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append(run_model(llama_config, use_compile=True)) + run_model(tractable_config, use_compile=True) + + with compilation_counter.expect( + num_graphs_seen=1, # one graph for the model + num_piecewise_graphs_seen=2 * llama_config.num_layers + + 1, # 2 * num_layers + 1 + num_piecewise_capturable_graphs_seen=1 + + llama_config.num_layers, # 1 + num_layers + num_inductor_compilations=1 + + llama_config.num_layers, # num_piecewise_capturable_graphs_seen + num_cudagraph_caputured=2 * + (1 + llama_config.num_layers + ), # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen + ): + outputs.append( + run_model(llama_config, use_compile=True, split_attn=True)) + run_model(tractable_config, use_compile=True, split_attn=True) + + for i in range(1, len(outputs)): + assert torch.allclose(outputs[0], outputs[i]) + + +@torch.inference_mode +def benchmark(): + from triton.testing import do_bench + + # similar to llama 3.1-8B + llama_config = LlamaConfig(hidden_size=4096, + mlp_size=14336, + vocab_size=128 * 1024, + num_layers=32) + + # a tiny model to measure the overhead + # of piecewise cudagraph + llama_config = LlamaConfig(hidden_size=40, + mlp_size=80, + vocab_size=128, + num_layers=2) + + cudagraph_sizes = [1, 2, 4] + [i * 8 for i in range(1, 33)] + + eager_time = {} + full_cudagraph_time = {} + piecewise_cudagraph_time = {} + + pool = torch.cuda.graph_pool_handle() + + for piecewise in [False, True]: + if piecewise: + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + use_cudagraph=True, + splitting_ops=["silly.attention"], + cudagraph_capture_sizes=cudagraph_sizes, + ) + else: + compilation_config = CompilationConfig( + level=CompilationLevel.PIECEWISE, + cudagraph_capture_sizes=cudagraph_sizes, + ) + + vllm_config = VllmConfig(compilation_config=compilation_config) + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda().to(torch.bfloat16) + + B = 256 # max batch size + input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() + positions = torch.arange(B).cuda().to(torch.bfloat16) + + graphs = {} + + model(input_ids, positions) + for b in cudagraph_sizes[::-1]: + if not piecewise: + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, pool=pool): + output = model(input_ids[:b], positions[:b]) + graphs[b] = (graph, output) + else: + output = model(input_ids[:b], positions[:b]) + graphs[b] = (model, output) + for b in cudagraph_sizes: + if piecewise: + # noqa is for `Function definition does not bind loop variable` + # it will be problematic if we save the created lambda function + # and use it later, because it will look up the name `b` in the + # enclosing scope, and the value of `b` will always be 256. + # it is fine here, because we only use the lambda function once. + runtime = do_bench(lambda: graphs[b][0] # noqa + (input_ids[:b], positions[:b])) # noqa + piecewise_cudagraph_time[b] = runtime + else: + runtime = do_bench(lambda: graphs[b][0].replay()) # noqa + eager_runtime = do_bench( + lambda: model(input_ids[:b], positions[:b])) # noqa + full_cudagraph_time[b] = runtime + eager_time[b] = eager_runtime + + # print in tabular format + print("batch size\teager mode\tfull cudagraph\tpiecewise cudagraph") + for b in cudagraph_sizes: + print(f"{b}\t{eager_time[b]:.3f}\t{full_cudagraph_time[b]:.3f}" + f"\t{piecewise_cudagraph_time[b]:.3f}") + + +if __name__ == "__main__": + benchmark() diff --git a/vllm/tests/compile/test_basic_correctness.py b/vllm/tests/compile/test_basic_correctness.py index 6aa27b24b..99781c55b 100644 --- a/vllm/tests/compile/test_basic_correctness.py +++ b/vllm/tests/compile/test_basic_correctness.py @@ -1,47 +1,141 @@ +import dataclasses from typing import Dict, List, Optional import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings +@dataclasses.dataclass +class TestSetting: + model: str + model_args: List[str] + pp_size: int + tp_size: int + attn_backend: str + method: str + fullgraph: bool + + +# representative settings for testing +test_settings = [ + # basic llama model + TestSetting( + model="meta-llama/Llama-3.2-1B", + model_args=[], + pp_size=2, + tp_size=2, + attn_backend="FLASHINFER", + method="generate", + fullgraph=True, + ), + # llama model with quantization + TestSetting( + model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + model_args=["--quantization", "gptq"], + pp_size=1, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # MoE model + TestSetting( + model="ibm/PowerMoE-3b", + model_args=[], + pp_size=1, + tp_size=2, + attn_backend="FLASH_ATTN", + method="generate", + fullgraph=True, + ), + # embedding model + TestSetting( + model="BAAI/bge-multilingual-gemma2", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="FLASHINFER", + method="encode", + fullgraph=True, + ), + # encoder-based embedding model (BERT) + TestSetting( + model="BAAI/bge-base-en-v1.5", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="XFORMERS", + method="encode", + fullgraph=True, + ), + # vision language model + TestSetting( + model="microsoft/Phi-3.5-vision-instruct", + model_args=["--trust-remote-code", "--max-model-len", "2048"], + pp_size=2, + tp_size=1, + attn_backend="FLASH_ATTN", + method="generate_with_image", + fullgraph=False, + ), +] + + # we cannot afford testing the full Catesian product # of all models and all levels -@pytest.mark.parametrize( - "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", - [ - ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), - ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", - ["--quantization", "compressed-tensors" - ], 1, 1, "FLASH_ATTN", "generate", True), - ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True), - # TODO: add multi-modality test for llava - ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) - ]) -def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend, - method, fullgraph): +@pytest.mark.parametrize("test_setting", test_settings) +def test_compile_correctness(test_setting: TestSetting): # this test is run under multiple suits, with different GPUs. # make sure we only run the test with correct CUDA devices. # don't use "<", as it will duplicate the tests. + model = test_setting.model + model_args = test_setting.model_args + pp_size = test_setting.pp_size + tp_size = test_setting.tp_size + attn_backend = test_setting.attn_backend + method = test_setting.method + fullgraph = test_setting.fullgraph if cuda_device_count_stateless() != pp_size * tp_size: pytest.skip("Not correct CUDA devices for the test.") import os os.environ["VLLM_ATTENTION_BACKEND"] = attn_backend - if not fullgraph: - os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" - all_args = [["--enforce-eager"] + model_args + ["--max_model_len", "1024"] - + ["-pp", str(pp_size)] + ["-tp", str(tp_size)]] * 3 - # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case - # inductor will change the output, so we cannot compare them. - all_envs: List[Optional[Dict[str, str]]] = [{ - "VLLM_TORCH_COMPILE_LEVEL": - str(level) - } for level in [ - CompilationLevel.NO_COMPILATION, - CompilationLevel.DYNAMO_AS_IS, - CompilationLevel.DYNAMO_ONCE, - ]] - compare_all_settings(model, all_args, all_envs, method=method) + final_args = ["--enforce-eager"] + model_args + ["-pp", str(pp_size)] + \ + ["-tp", str(tp_size)] + + all_args: List[List[str]] = [] + all_envs: List[Optional[Dict[str, str]]] = [] + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.PIECEWISE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + + # inductor will change the output, so we only compare if the output + # is close, not exactly the same. + compare_all_settings( + model, + all_args, + all_envs, + method=method if method != "generate" else "generate_close") + all_envs.clear() + all_args.clear() + + for level in [ + CompilationLevel.NO_COMPILATION, + CompilationLevel.DYNAMO_AS_IS, + CompilationLevel.DYNAMO_ONCE, + ]: + all_args.append(final_args + [f"-O{level}"]) + all_envs.append({}) + if level != CompilationLevel.DYNAMO_ONCE and not fullgraph: + # "DYNAMO_ONCE" will always use fullgraph + all_envs[-1][ + "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore + + compare_all_settings(model, all_args * 3, all_envs, method=method) diff --git a/vllm/tests/compile/test_full_graph.py b/vllm/tests/compile/test_full_graph.py index f28f9145b..4dfdfe21a 100644 --- a/vllm/tests/compile/test_full_graph.py +++ b/vllm/tests/compile/test_full_graph.py @@ -1,6 +1,6 @@ import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from ..utils import fork_new_process_for_each_test from .utils import TEST_MODELS, check_full_graph_support @@ -9,7 +9,7 @@ @pytest.mark.parametrize("model_info", TEST_MODELS) @pytest.mark.parametrize( "optimization_level", - [CompilationLevel.DYNAMO_ONCE, CompilationLevel.INDUCTOR]) + [CompilationLevel.DYNAMO_ONCE, CompilationLevel.PIECEWISE]) @fork_new_process_for_each_test def test_full_graph(model_info, optimization_level): model = model_info[0] diff --git a/vllm/tests/compile/test_functionalization.py b/vllm/tests/compile/test_functionalization.py new file mode 100644 index 000000000..503618907 --- /dev/null +++ b/vllm/tests/compile/test_functionalization.py @@ -0,0 +1,95 @@ +import pytest +import torch + +import vllm.envs as envs +from vllm import LLM, SamplingParams +from vllm.compilation.fix_functionalization import FixFunctionalizationPass +from vllm.compilation.fusion import (FusionPass, find_auto_fn, + find_auto_fn_maybe) +from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.compilation.vllm_inductor_pass import is_func +from vllm.config import CompilationConfig + +from .backend import TestBackend + +OPS_IN_MODEL = [ + torch.ops._C.rotary_embedding.default, + torch.ops._C.fused_add_rms_norm.default, + torch.ops._C.silu_and_mul.default, +] + +RMS_OP = torch.ops._C.rms_norm.default + +RMS_QUANT_OPS = { + "static_fp8": [ + torch.ops._C.rms_norm_static_fp8_quant.default, + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + ], +} + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +@pytest.mark.parametrize("model", + ["nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"]) +@pytest.mark.parametrize("do_fusion", [True, False]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fix_functionalization(model: str, do_fusion: bool): + torch.set_default_device("cuda") + + config = CompilationConfig.PassConfig(enable_fusion=do_fusion, + enable_reshape=True) + reshape_pass = RedundantReshapesPass(config) + fusion_pass = FusionPass.instance(config) + + passes = [reshape_pass, fusion_pass] if do_fusion else [reshape_pass] + func_pass = FixFunctionalizationPass(config) + backend_func = TestBackend(*passes, func_pass) + backend_no_func = TestBackend(*passes) + + # instantiate a full engine and manually compile the model 2x + # (with and without FixFunctionalizationPass) + llm = LLM(model=model, enforce_eager=True) + model_runner = llm.llm_engine.model_executor.driver_worker.model_runner + orig_model = model_runner.model + # TODO mark inputs dynamic? (currently torch.compile is triggered 4x) + # Can only do that by using the decorator but then we'd have to instantiate + # 2 LLM instances. + + sampling_params = SamplingParams(temperature=0.0, top_p=1.0) + model_runner.model = torch.compile(orig_model, + fullgraph=True, + backend=backend_func) + gen_func = llm.generate(prompts, sampling_params) + + model_runner.model = torch.compile(orig_model, + fullgraph=True, + backend=backend_no_func) + gen_no_func = llm.generate(prompts, sampling_params) + + for output_func, output_no_func in zip(gen_func, gen_no_func): + assert output_func.outputs[0].text == output_no_func.outputs[0].text + + # OPS_IN_MODEL always appear. RMS_OP is fused away if we run fusion, + # and replaced by fused quantized ops in RMS_QUANT_OPS. + ops = OPS_IN_MODEL + (RMS_QUANT_OPS["static_fp8"] + if do_fusion else [RMS_OP]) + + for op in ops: + find_auto_fn(backend_no_func.graph_post_pass.nodes, op) + assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes, + op) is None # noqa: E501 + + # make sure the ops were all de-functionalized + found = dict() + for node in backend_func.graph_post_pass.nodes: + for op in ops: + if is_func(node, op): + found[op] = True + assert all(found[op] for op in ops) diff --git a/vllm/tests/compile/test_fusion.py b/vllm/tests/compile/test_fusion.py new file mode 100644 index 000000000..f92ec8d0d --- /dev/null +++ b/vllm/tests/compile/test_fusion.py @@ -0,0 +1,91 @@ +import pytest +import torch +from compressed_tensors.quantization import FP8_DTYPE + +import vllm.envs as envs +from vllm.compilation.fusion import (FusionPass, find_auto_fn, + find_auto_fn_maybe) +from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.config import CompilationConfig +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear) + +from .backend import TestBackend + + +class TestModel(torch.nn.Module): + + def __init__(self, hidden_size: int, eps: float, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)] + self.w = [ + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + for _ in range(2) + ] + + def forward(self, x): + resid = torch.relu(x) + y = self.norm[0](x) + + x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1]) + # make sure resid is used for replacement to work + y2, resid = self.norm[1](x2, resid) + + x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3]) + y3, resid = self.norm[2](x3, resid) # use resid here + return y3 + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) +@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) +@pytest.mark.parametrize("eps", [1e-5, 1e-6]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): + torch.set_default_device("cuda") + torch.set_default_dtype(torch.float16) + + if eps != 1e-5: + pytest.skip("Only test eps=1e-5 for now") + + # Reshape pass is needed for the fusion pass to work + config = CompilationConfig.PassConfig(enable_fusion=True, + enable_reshape=True) + reshape_pass = RedundantReshapesPass(config) + fusion_pass = FusionPass.instance(config) + + backend = TestBackend(reshape_pass, fusion_pass) + model = TestModel(hidden_size, eps) + + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + # Check that it gives the same answer + torch.testing.assert_close(result, result2, atol=1e-3, rtol=1e-3) + + # Check substitution worked + pre_nodes = backend.graph_pre_pass.nodes + post_nodes = backend.graph_post_pass.nodes + + rms_quant = torch.ops._C.rms_norm_static_fp8_quant.default + add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + fp8_quant = torch.ops._C.static_scaled_fp8_quant.default + + # In pre-nodes, fp8 quant should be present and fused kernels should not + assert find_auto_fn_maybe(pre_nodes, rms_quant) is None + assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None + find_auto_fn(pre_nodes, fp8_quant) + + # In post-nodes, fused kernels should be present and fp8 quant should not + find_auto_fn(post_nodes, rms_quant) + find_auto_fn(post_nodes, add_rms_quant) + assert find_auto_fn_maybe(post_nodes, fp8_quant) is None diff --git a/vllm/tests/compile/test_pass_manager.py b/vllm/tests/compile/test_pass_manager.py new file mode 100644 index 000000000..03e753509 --- /dev/null +++ b/vllm/tests/compile/test_pass_manager.py @@ -0,0 +1,35 @@ +import pickle + +import pytest +import torch +from torch._inductor.codecache import BypassFxGraphCache + +from vllm.compilation.config import CompilationConfig +from vllm.compilation.inductor_pass import (CallableInductorPass, + as_inductor_pass) +from vllm.compilation.pass_manager import PostGradPassManager + + +def simple_callable(graph: torch.fx.Graph): + pass + + +@as_inductor_pass(files=(__file__, )) +def callable_decorated(graph: torch.fx.Graph): + pass + + +@pytest.mark.parametrize( + "works, callable", + [(False, simple_callable), (True, callable_decorated), + (True, CallableInductorPass(simple_callable, "simple_callable"))]) +def test_pass_manager(works: bool, callable): + config = CompilationConfig().pass_config + pass_manager = PostGradPassManager([callable]) + pass_manager.configure(config) # Adds default passes + + if works: + pickle.dumps(pass_manager) + else: + with pytest.raises(BypassFxGraphCache): + pickle.dumps(pass_manager) diff --git a/vllm/tests/compile/test_wrapper.py b/vllm/tests/compile/test_wrapper.py index 3668c1fab..74f66baaa 100644 --- a/vllm/tests/compile/test_wrapper.py +++ b/vllm/tests/compile/test_wrapper.py @@ -3,6 +3,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import CompilationLevel class MyMod(torch.nn.Module): @@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): def __init__(self, model): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__(compiled_callable) + super().__init__(compiled_callable, + compilation_level=CompilationLevel.DYNAMO_ONCE) def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): # this is the function to be compiled diff --git a/vllm/tests/compile/utils.py b/vllm/tests/compile/utils.py index 64fc08e80..7c92d165d 100644 --- a/vllm/tests/compile/utils.py +++ b/vllm/tests/compile/utils.py @@ -4,7 +4,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.platforms import current_platform TEST_MODELS = [ @@ -23,13 +23,12 @@ ("meta-llama/Meta-Llama-3-8B", {}), ] -# TODO: enable in pytorch 2.5 -if False and is_quant_method_supported("aqlm"): # noqa: SIM223 +if is_quant_method_supported("aqlm"): TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { "quantization": "aqlm" })) -# TODO: enable in pytorch 2.5 +# TODO: figure out why this fails. if False and is_quant_method_supported("gguf"): # noqa: SIM223 TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { "quantization": "gguf" @@ -66,16 +65,15 @@ def check_full_graph_support(model, optimization_level, tp_size=1): # make sure these models can be captured in full graph mode - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - # Inductor doesn't support fp8 and the base meta llama uses too - # much memory. - quantization = model_kwargs.get("quantization") - if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B") - and optimization_level >= CompilationLevel.INDUCTOR): + # The base meta llama uses too much memory. + if (model == "meta-llama/Meta-Llama-3-8B" + and optimization_level >= CompilationLevel.PIECEWISE): return + print(f"MODEL={model}") + prompts = [ "Hello, my name is", "The president of the United States is", @@ -87,6 +85,7 @@ def check_full_graph_support(model, enforce_eager=True, tensor_parallel_size=tp_size, disable_custom_all_reduce=True, + compilation_config=optimization_level, **model_kwargs) outputs = llm.generate(prompts, sampling_params) diff --git a/vllm/tests/conftest.py b/vllm/tests/conftest.py index 2fce2d772..d6be8f5b0 100644 --- a/vllm/tests/conftest.py +++ b/vllm/tests/conftest.py @@ -1,11 +1,11 @@ import json import os -import sys import tempfile from collections import UserList from enum import Enum from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, TypedDict, TypeVar, Union) +from unittest.mock import patch import numpy as np import pytest @@ -52,7 +52,7 @@ def _read_prompts(filename: str) -> List[str]: - with open(filename, "r") as f: + with open(filename) as f: prompts = f.readlines() return prompts @@ -62,14 +62,8 @@ class _ImageAssetPrompts(TypedDict): cherry_blossom: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _ImageAssetsBase(UserList): - pass -else: - - class _ImageAssetsBase(UserList[ImageAsset]): - pass +class _ImageAssetsBase(UserList[ImageAsset]): + pass class _ImageAssets(_ImageAssetsBase): @@ -94,14 +88,8 @@ class _VideoAssetPrompts(TypedDict): sample_demo_1: str -if sys.version_info < (3, 9): - # UserList cannot be subscripted - class _VideoAssetsBase(UserList): - pass -else: - - class _VideoAssetsBase(UserList[VideoAsset]): - pass +class _VideoAssetsBase(UserList[VideoAsset]): + pass class _VideoAssets(_VideoAssetsBase): @@ -121,6 +109,23 @@ def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: """Singleton instance of :class:`_VideoAssets`.""" +@pytest.fixture(params=[True, False]) +def run_with_both_engines(request): + # Automatically runs tests twice, once with V1 and once without + use_v1 = request.param + # Tests decorated with `@skip_v1` are only run without v1 + skip_v1 = request.node.get_closest_marker("skip_v1") + + if use_v1: + if skip_v1: + pytest.skip("Skipping test on vllm V1") + with patch('vllm.envs.VLLM_USE_V1', True): + yield + else: + with patch('vllm.envs.VLLM_USE_V1', False): + yield + + @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test @@ -238,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" @@ -255,12 +263,11 @@ def __init__( dtype: str = "half", *, model_kwargs: Optional[Dict[str, Any]] = None, - is_embedding_model: bool = False, is_sentence_transformer: bool = False, + is_cross_encoder: bool = False, skip_tokenizer_init: bool = False, auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM, - postprocess_inputs: Callable[[BatchEncoding], - BatchEncoding] = identity, + postprocess_inputs: Callable[..., BatchEncoding] = identity, ) -> None: torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -275,6 +282,14 @@ def __init__( device="cpu", trust_remote_code=True, ).to(dtype=torch_dtype)) + elif is_cross_encoder: + # Lazy init required for AMD CI + from sentence_transformers import CrossEncoder + self.model = CrossEncoder(model_name, + device="cpu", + trust_remote_code=True) + self.model.model = self.wrap_device(self.model.model)\ + .to(dtype=torch_dtype) else: model_kwargs = model_kwargs if model_kwargs is not None else {} self.model = self.wrap_device( @@ -303,6 +318,7 @@ def __init__( if skip_tokenizer_init: self.tokenizer = self.processor.tokenizer + self.dtype = dtype self.postprocess_inputs = postprocess_inputs def get_inputs( @@ -337,7 +353,7 @@ def get_inputs( processor_kwargs["sampling_rate"] = sr inputs = self.processor(**processor_kwargs) - inputs = self.postprocess_inputs(inputs) + inputs = self.postprocess_inputs(inputs, dtype=self.dtype) all_inputs.append(inputs) @@ -617,6 +633,9 @@ def generate_encoder_decoder_greedy_logprobs_limit( def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]: return self.model.encode(prompts) + def predict(self, prompts: List[List[str]]) -> torch.Tensor: + return self.model.predict(prompts, convert_to_tensor=True) + def __enter__(self): return self @@ -637,6 +656,7 @@ def __init__( model_name: str, task: TaskOption = "auto", tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", # Use smaller max model length, otherwise bigger model cannot run due # to kv cache size limit. max_model_len: int = 1024, @@ -653,6 +673,7 @@ def __init__( model=model_name, task=task, tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, trust_remote_code=True, dtype=dtype, swap_space=swap_space, @@ -823,6 +844,7 @@ def generate_greedy_logprobs( audios: Optional[PromptAudioInput] = None, videos: Optional[PromptVideoInput] = None, stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, ) -> Union[List[TokensTextLogprobs], List[TokensTextLogprobsPromptLogprobs]]: greedy_logprobs_params = SamplingParams( @@ -830,7 +852,8 @@ def generate_greedy_logprobs( max_tokens=max_tokens, logprobs=num_logprobs, prompt_logprobs=num_prompt_logprobs, - stop_token_ids=stop_token_ids) + stop_token_ids=stop_token_ids, + stop=stop) return self.generate_w_logprobs(prompts, greedy_logprobs_params, @@ -890,6 +913,14 @@ def encode( req_outputs = self.model.encode(inputs) return [req_output.outputs.embedding for req_output in req_outputs] + def score( + self, + text_1: Union[str, List[str]], + text_2: Union[str, List[str]], + ) -> List[List[float]]: + req_outputs = self.model.score(text_1, text_2) + return [req_output.outputs.embedding for req_output in req_outputs] + def __enter__(self): return self @@ -958,7 +989,7 @@ def dummy_opt_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyOPTForCausalLM"] with open(json_path, "w") as f: @@ -977,7 +1008,7 @@ def dummy_llava_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyLlava"] with open(json_path, "w") as f: @@ -996,9 +1027,28 @@ def dummy_gemma2_embedding_path(): "*.msgpack" ]) assert os.path.exists(json_path) - with open(json_path, "r") as f: + with open(json_path) as f: config = json.load(f) config["architectures"] = ["MyGemma2Embedding"] with open(json_path, "w") as f: json.dump(config, f) return _dummy_gemma2_embedding_path + + +# Add the flag `--optional` to allow run tests +# that are marked with @pytest.mark.optional +def pytest_addoption(parser): + parser.addoption("--optional", + action="store_true", + default=False, + help="run optional test") + + +def pytest_collection_modifyitems(config, items): + if config.getoption("--optional"): + # --optional given in cli: do not skip optional tests + return + skip_optional = pytest.mark.skip(reason="need --optional option to run") + for item in items: + if "optional" in item.keywords: + item.add_marker(skip_optional) diff --git a/vllm/tests/core/block/e2e/test_correctness_sliding_window.py b/vllm/tests/core/block/e2e/test_correctness_sliding_window.py index 9320a9ef6..415d0bd82 100644 --- a/vllm/tests/core/block/e2e/test_correctness_sliding_window.py +++ b/vllm/tests/core/block/e2e/test_correctness_sliding_window.py @@ -3,6 +3,7 @@ import pytest +from tests.kernels.utils import override_backend_env_variable from vllm import LLM, SamplingParams from .conftest import get_text_from_llm_generator @@ -28,8 +29,9 @@ @pytest.mark.parametrize("test_llm_kwargs", [{}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, - batch_size, seed): + batch_size, seed, backend, monkeypatch): """ The test does a bunch of assignments "x1 = 10\nx2 = 33\n..." and then asks for value of one of them (which is outside the sliding window). @@ -38,6 +40,8 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, Additionally, we compare the results of the v1 and v2 managers. """ + override_backend_env_variable(monkeypatch, backend) + sampling_params = SamplingParams( max_tokens=1024, ignore_eos=True, @@ -84,7 +88,9 @@ def test_sliding_window_retrival(baseline_llm_generator, test_llm_generator, @pytest.mark.parametrize("test_llm_kwargs", [{"enable_chunked_prefill": True}]) @pytest.mark.parametrize("batch_size", [5]) @pytest.mark.parametrize("seed", [1]) -def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) +def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed, + backend, monkeypatch): """ This is similar to test_sliding_window_retrival, however, it doesn't compare against the v1 block manager since v1 doesn't support @@ -93,6 +99,8 @@ def test_sliding_window_chunked_prefill(test_llm_generator, batch_size, seed): The results with and without chunked prefill are not the same due to numerical instabilities. """ + override_backend_env_variable(monkeypatch, backend) + sampling_params = SamplingParams( max_tokens=10, ignore_eos=True, diff --git a/vllm/tests/core/block/test_prefix_caching_block.py b/vllm/tests/core/block/test_prefix_caching_block.py index 1a6e17ef7..bbeb4b3a5 100644 --- a/vllm/tests/core/block/test_prefix_caching_block.py +++ b/vllm/tests/core/block/test_prefix_caching_block.py @@ -5,9 +5,14 @@ import pytest +from tests.core.utils import create_dummy_sequence +from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.block.interfaces import Block, BlockAllocator -from vllm.core.block.prefix_caching_block import (PrefixCachingBlock, +from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker, + PrefixCachingBlock, PrefixCachingBlockAllocator) +from vllm.sequence import Logprob +from vllm.utils import Device class TestPrefixCachingBlock: @@ -99,13 +104,11 @@ def test_blocks_have_correct_hash_in_chain(block_size: int, token_ids = [random.randint(0, 50_000) for _ in range(num_tokens)] - first_chain, second_chain = [ - TestPrefixCachingBlock.create_chain( - block_size=block_size, - token_ids=token_ids, - num_empty_trailing_blocks=num_empty_trailing_blocks) - for _ in range(2) - ] + first_chain, second_chain = (TestPrefixCachingBlock.create_chain( + block_size=block_size, + token_ids=token_ids, + num_empty_trailing_blocks=num_empty_trailing_blocks) + for _ in range(2)) for first_chain_block, second_chain_block in zip( first_chain, second_chain): @@ -728,18 +731,71 @@ def test_touch_block(): token_ids=common_token_ids, allocator=allocator, ) - block_ids = [block.block_id for block in blocks] + block_hashes = [block.content_hash for block in blocks] # The allocated blocks should be marked as touched # but not computed. - computed_block_ids = allocator.get_computed_block_ids( - [], block_ids, skip_last_block_id=False) + computed_block_ids = allocator.find_cached_blocks_prefix( + block_hashes) assert len(computed_block_ids) == 0 allocator.mark_blocks_as_computed([]) - computed_block_ids = allocator.get_computed_block_ids( - [], block_ids, skip_last_block_id=False) + computed_block_ids = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes) assert len(computed_block_ids) == common_blocks + @staticmethod + def test_find_cached_blocks_prefix(): + """ + This test verifies the behavior of find_cached_blocks_prefix. + """ + block_size = 4 + num_blocks = 8 + total_test_blocks = 12 + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + + token_ids = list(range(total_test_blocks * block_size)) + block_tokens_seq1 = token_ids[:num_blocks * block_size] + blocks_seq1 = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=block_tokens_seq1, + allocator=allocator, + ) + block_hashes_seq1 = [block.content_hash for block in blocks_seq1] + allocator.mark_blocks_as_computed([]) + + # All blocks should be cached. + cached_blocks_seq1 = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks_seq1) == num_blocks + + # Free the first sequence. + for block in blocks_seq1: + allocator.free(block) + + # All blocks should be still be cached if not required to be allocated. + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks) == num_blocks + + block_tokens_seq2 = token_ids[num_blocks * block_size:] + blocks_seq2 = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=block_tokens_seq2, + allocator=allocator, + ) + block_hashes_seq2 = [block.content_hash for block in blocks_seq2] + allocator.mark_blocks_as_computed([]) + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq2) + assert len(cached_blocks) == len(blocks_seq2) + + # Half of the blocks from seq1 should still be cached. + num_evicted_blocks = len(blocks_seq2) + cached_blocks = allocator.find_cached_blocks_prefix( + block_hashes=block_hashes_seq1) + assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + @staticmethod def create_immutable_chain( block_size: int, @@ -764,3 +820,114 @@ def create_immutable_chain( blocks.append(prev_block) return blocks + + +class TestComputedBlocksTracker: + + @staticmethod + def _get_mock_allocator(): + return MagicMock(spec=PrefixCachingBlockAllocator) + + @staticmethod + def test_get_num_cached_tokens(): + """ + Test it correctly computes the number of cached tokens for a given + sequence: + + - The cache token count is derived from the number of cached blocks. + - The cache token count is updated when the allocator is updated. + - When a sequence is removed, the cache token count should be updated + accordingly. + + # TODO(rickyx): This behaviour for prefill sequence is a hack until + we fix the computed blocks tracking. + - The cache token count for prefill sequence doesn't change while + the sequence is in continuous prefill (chunked prefill). + """ + block_size = 4 + mock_allocator = TestComputedBlocksTracker._get_mock_allocator() + tracker = ComputedBlocksTracker( + allocator=mock_allocator, + block_size=block_size, + enable_caching=True, + ) + + # Not yet allocated. + tokens = [0, 1, 2, 3, 4, 5] + seq1 = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + mock_allocator.find_cached_blocks_prefix.return_value = [] + assert tracker.get_num_cached_tokens(seq1) == 0 + + mock_allocator.find_cached_blocks_prefix.return_value = [ + None + ] # 1 block cached. + # Result is cached for prefill sequence. + assert tracker.get_num_cached_tokens(seq1) == 0 + + # Mark the sequence as non-prefill. + seq1.data.update_num_computed_tokens(len(tokens)) # 6 tokens computed. + assert not seq1.is_prefill() + + # Recomputes for decoding sequence. + assert tracker.get_num_cached_tokens(seq1) == 4 + + # Append new tokens to the sequence. + num_new_tokens = 3 + for i in range(num_new_tokens): + seq1.append_token_id(i, {i: Logprob(logprob=0.0)}) + + assert tracker.get_num_cached_tokens(seq1) == 4 + + # Update the allocator. + mock_allocator.find_cached_blocks_prefix.return_value = [ + None + ] * 2 # 2 blocks cached. + assert tracker.get_num_cached_tokens(seq1) == 8 + + # Remove the sequence. + tracker.remove_seq(seq1.seq_id) + + # Re-create the sequence with the same request id to simulate recompute. + seq1 = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + mock_allocator.find_cached_blocks_prefix.return_value = [ + ] # no cached block + assert tracker.get_num_cached_tokens(seq1) == 0 + + @staticmethod + def test_correct_block_hash(): + """ + Test that the block hash is correctly computed for a sequence (should + match the underlying block allocator's block hash). So the number of + cached tokens is correctly retrieved. + """ + block_size = 4 + allocator = CpuGpuBlockAllocator.create( + allocator_type="prefix_caching", + num_gpu_blocks=16, + num_cpu_blocks=16, + block_size=block_size, + ) + gpu_allocator = allocator._allocators[Device.GPU] + + tracker = ComputedBlocksTracker( + allocator=allocator, + block_size=block_size, + enable_caching=True, + ) + + tokens = list(range(block_size * 4)) # 4 blocks. + seq = create_dummy_sequence(request_id=0, + token_ids=tokens, + block_size=block_size) + _ = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=tokens, + allocator=gpu_allocator, + ) + allocator.mark_blocks_as_computed([]) + + assert tracker.get_num_cached_tokens(seq) == len(tokens) diff --git a/vllm/tests/core/test_chunked_prefill_scheduler.py b/vllm/tests/core/test_chunked_prefill_scheduler.py index acd82065a..eaaf004df 100644 --- a/vllm/tests/core/test_chunked_prefill_scheduler.py +++ b/vllm/tests/core/test_chunked_prefill_scheduler.py @@ -413,6 +413,45 @@ def cannot_append_second_group2(seq_group, num_lookahead_slots): assert out.num_batched_tokens == max_num_batched_tokens +@pytest.mark.parametrize("num_scheduler_steps", [1, 5]) +def test_chunked_prefill_spec_prefill(num_scheduler_steps): + """Verify that the num_lookahead_slots is set appropriately for an all""" + """prefill batch depending on whether multi-step scheduling is enabled""" + """or not""" + block_size = 4 + max_seqs = 30 + max_model_len = 200 + max_num_batched_tokens = 30 + num_lookahead_slots = 4 + scheduler_config = SchedulerConfig( + "generate", + max_num_batched_tokens, + max_seqs, + max_model_len, + enable_chunked_prefill=True, + num_lookahead_slots=num_lookahead_slots, + num_scheduler_steps=num_scheduler_steps, + ) + cache_config = CacheConfig(block_size, 1.0, 1, "auto") + cache_config.num_cpu_blocks = 16 + cache_config.num_gpu_blocks = 16 + scheduler = Scheduler(scheduler_config, cache_config, None) + + _, seq_group = create_dummy_prompt("1", + prompt_length=30, + block_size=block_size) + scheduler.add_seq_group(seq_group) + _, out = schedule_and_update_computed_tokens(scheduler) + # The request is chunked. + # prefill scheduled now. + assert len(out.scheduled_seq_groups) == 1 + assert out.num_prefill_groups == 1 + assert out.num_batched_tokens == max_num_batched_tokens + print(out.num_lookahead_slots) + assert out.num_lookahead_slots == (0 if (num_scheduler_steps == 1) else + num_lookahead_slots) + + def test_chunked_prefill_max_seqs(): block_size = 4 max_seqs = 2 diff --git a/vllm/tests/core/test_scheduler.py b/vllm/tests/core/test_scheduler.py index 5ff32be61..8f6de84e5 100644 --- a/vllm/tests/core/test_scheduler.py +++ b/vllm/tests/core/test_scheduler.py @@ -12,9 +12,9 @@ from vllm.lora.request import LoRARequest from vllm.sequence import SequenceGroup -from .utils import (append_new_token, append_new_token_seq_group, - create_dummy_prompt, get_sequence_groups, - schedule_and_update_computed_tokens) +from .utils import (append_new_token, append_new_token_seq, + append_new_token_seq_group, create_dummy_prompt, + get_sequence_groups, schedule_and_update_computed_tokens) def test_scheduler_add_seq_group(): @@ -305,6 +305,8 @@ def initialize_scheduler( block_size=4, num_cpu_blocks=8, num_gpu_blocks=8, + enable_prefix_caching=False, + enable_chunked_prefill=False, ): block_size = block_size scheduler_config = SchedulerConfig( @@ -312,8 +314,15 @@ def initialize_scheduler( max_num_batched_tokens=max_token_budget, max_num_seqs=max_num_seqs, max_model_len=max_model_len, + enable_chunked_prefill=enable_chunked_prefill, + ) + cache_config = CacheConfig( + block_size, + 1.0, + 1, + "auto", + enable_prefix_caching=enable_prefix_caching, ) - cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_gpu_blocks = num_gpu_blocks scheduler = Scheduler(scheduler_config, cache_config, lora_config) @@ -800,3 +809,165 @@ def test_scheduling_budget(): assert budget.num_curr_seqs == 0 budget.subtract_num_seqs(seq_group.request_id, 2) assert budget.num_curr_seqs == 0 + + +@pytest.mark.parametrize("enable_prefix_caching", [True, False]) +def test_prefix_caching_aware_prefills(enable_prefix_caching): + """ + Test the below scenario: + + For 3 sequences, seqA, seqB, seqC, share the first block as prefix. + + The test verifies the below scenarios: + 1. SeqA is first scheduled. + 2. SeqB and SeqC can be prefilled together in a single schedule round + even though there are not enough token budgets to prefill both without + considering prefix caching. + """ + + block_size = 4 + max_num_batched_tokens = 12 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_token_budget=max_num_batched_tokens, + max_num_seqs=max_seq_group, + max_model_len=max_num_batched_tokens, + enable_prefix_caching=enable_prefix_caching, + ) + + seqA_tokens = list(range(8)) + num_shared_tokens = 4 + seqB_tokens = seqA_tokens[:num_shared_tokens] + list(range( + 12, 16)) # Shared prefix first 4. + seqC_tokens = seqA_tokens[:num_shared_tokens] + list(range( + 16, 20)) # Shared prefix first 4. + + seqA, seqA_group = create_dummy_prompt("0", + prompt_tokens=seqA_tokens, + block_size=block_size) + seqB, seqB_group = create_dummy_prompt("1", + prompt_tokens=seqB_tokens, + block_size=block_size) + seqC, seqC_group = create_dummy_prompt("2", + prompt_tokens=seqC_tokens, + block_size=block_size) + + # Schedule seqA prefill. + scheduler.add_seq_group(seqA_group) + metas, out, _ = scheduler.schedule() + assert (len(out.scheduled_seq_groups) == 1 + and out.scheduled_seq_groups[0].seq_group == seqA_group) + assert out.scheduled_seq_groups[0].token_chunk_size == len(seqA_tokens) + + # Schedule seqA decode. + append_new_token_seq_group(len(seqA_tokens), seqA_group, 999) + metas, out, _ = scheduler.schedule() + + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 1 + + # Schedule seqB and seqC prefills should work with prefix caching. + scheduler.add_seq_group(seqB_group) + scheduler.add_seq_group(seqC_group) + metas, out, _ = scheduler.schedule() + + if enable_prefix_caching: + assert len(out.scheduled_seq_groups) == 2 + assert set([ + out.scheduled_seq_groups[0].seq_group, + out.scheduled_seq_groups[1].seq_group, + ]) == set([seqB_group, seqC_group]) + assert len(metas) == 2 + for meta in metas: + assert meta.token_chunk_size == 8 + assert (len(meta.computed_block_nums) == num_shared_tokens // + block_size) # 1 Block for the 8 tokens. + else: + assert len(out.scheduled_seq_groups) == 1 + assert len(metas) == 1 + assert metas[0].token_chunk_size == 8 + assert len(metas[0].computed_block_nums) == 0 # No blocks computed. + + +def test_no_multiple_partial_prefills_with_chunked_prefill_and_prefix_caching( +): + """ + This test verifies that we don't schedule new prefills if there's already + a continuous prefill in progress even though the new prefills with shared + prefix can fit in the token budget: + + - SeqA is being chunked prefill. + - SeqB with the same prompt shouldn't be scheduled for prefill even though + there's enough token budget to prefill the cached tokens. + - Neither should seqC be scheduled. + + - When seqA is in decoding phase, seqB and seqC can be scheduled. + - Entire seqB should be prefilled since it's a full prefix cache hit. + - SeqC would be partially prefilled with the prefix shared, and the + remaining unique tokens would be prefilled (rounded down to be + block-size aligned). + """ + + block_size = 2 + max_num_batched_tokens = 4 + max_seq_group = 3 + scheduler = initialize_scheduler( + block_size=block_size, + num_cpu_blocks=16, + num_gpu_blocks=16, + max_token_budget=max_num_batched_tokens, + max_num_seqs=max_seq_group, + max_model_len=100, + enable_prefix_caching=True, + enable_chunked_prefill=True, + ) + + seqA_tokens = list(range(8)) + seqB_tokens = seqA_tokens + seqC_shared_prefix_len = 4 + seqC_tokens = seqA_tokens[:seqC_shared_prefix_len] + list(range(12, 20)) + + seqA, seqA_group = create_dummy_prompt("0", + prompt_tokens=seqA_tokens, + block_size=block_size) + seqB, seqB_group = create_dummy_prompt("1", + prompt_tokens=seqB_tokens, + block_size=block_size) + + # Chunked prefill seqA. + scheduler.add_seq_group(seqA_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 4 + + # seqB should not be scheduled with ongoing prefills. + scheduler.add_seq_group(seqB_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 1 + assert out.scheduled_seq_groups[0].seq_group == seqA_group + assert out.scheduled_seq_groups[0].token_chunk_size == 4 + + # both seqB and seqC can now be scheduled with seqA is over. + # seqA is in decoding phase. + append_new_token_seq(seqA, 999) + seqC, seqC_group = create_dummy_prompt("2", + prompt_tokens=seqC_tokens, + block_size=block_size) + scheduler.add_seq_group(seqC_group) + metas, out = schedule_and_update_computed_tokens(scheduler) + assert len(out.scheduled_seq_groups) == 3 + + metas = {meta.request_id: meta for meta in metas} + assert metas[seqA_group.request_id].token_chunk_size == 1 # Decode + assert (metas[seqB_group.request_id].token_chunk_size == 8 + ) # Fully cached prefill + assert ( + metas[seqC_group.request_id].token_chunk_size == 6 + ), "A partial prefix of C (4 tokens) should be prefilled, with the " + "remaining tokens fit into 3 token budget (4-1 from the seqA). It will " + "then be rounded down to 2 tokens on block size, thus 6 tokens in total." diff --git a/vllm/tests/core/utils.py b/vllm/tests/core/utils.py index a95a573db..277368b57 100644 --- a/vllm/tests/core/utils.py +++ b/vllm/tests/core/utils.py @@ -1,16 +1,20 @@ import time -from typing import List, Optional +from collections import defaultdict +from typing import Any, Dict, List, Optional from typing import Sequence as GenericSequence from typing import Tuple from vllm import SamplingParams +from vllm.core.scheduler import Scheduler, SchedulerOutputs +from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest -from vllm.sequence import Logprob, Sequence, SequenceGroup +from vllm.sequence import (Logprob, Sequence, SequenceGroup, + SequenceGroupMetadata) def create_dummy_prompt( request_id: str, - prompt_length: int, + prompt_length: int = -1, block_size: Optional[int] = None, lora_request: Optional[LoRARequest] = None, best_of: int = 1, @@ -25,12 +29,10 @@ def create_dummy_prompt( # Create dummy prompt sequence with tokens 0...block_size-1 # and prompt "0 ... block_size". prompt_tokens = list(range(prompt_length)) + prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), - inputs={ - "prompt": prompt_str, - "prompt_token_ids": prompt_tokens, - }, + inputs=token_inputs(prompt_tokens, prompt=prompt_str), block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], @@ -44,6 +46,15 @@ def create_dummy_prompt( return prompt, seq_group +def create_dummy_sequence(request_id: int, token_ids: List[int], + block_size: int) -> Sequence: + return Sequence( + seq_id=request_id, + inputs=token_inputs(token_ids), + block_size=block_size, + ) + + def create_dummy_prompt_encoder_decoder( request_id: str, decoder_prompt_length: int, @@ -63,23 +74,21 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - inputs = { - "prompt": decoder_prompt_str, - "prompt_token_ids": decoder_prompt_tokens, - "encoder_prompt": encoder_prompt_str, - "encoder_prompt_token_ids": encoder_prompt_tokens, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), } decoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=True) + inputs=inputs["decoder"], + block_size=block_size) encoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=False) + inputs=inputs["encoder"], + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams(best_of=best_of), @@ -108,7 +117,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={"prompt_token_ids": prompt_token_ids}, + inputs=token_inputs(prompt_token_ids), block_size=16, ) @@ -143,21 +152,19 @@ def create_seq_group_encoder_decoder( prompt_token_ids = [0] * seq_prompt_len - inputs = { - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "encoder_prompt": "", - "encoder_prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), } seqs = [] for seq_id_offset, output_len in enumerate(seq_output_lens): # Construct decoder input sequences - seq = Sequence(seq_id=seq_id_start + seq_id_offset, - inputs=inputs, - block_size=16, - from_decoder_prompt=True) + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) for i in range(output_len): seq.append_token_id( @@ -167,10 +174,11 @@ def create_seq_group_encoder_decoder( seqs.append(seq) # Encoder input sequence - encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs, - block_size=16, - from_decoder_prompt=False) + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) return SequenceGroup(request_id=request_id, seqs=seqs, @@ -199,12 +207,40 @@ def append_new_token(out, token_id: int): def schedule_and_update_computed_tokens(scheduler): metas, out, _ = scheduler.schedule() - for s, meta in zip(out.scheduled_seq_groups, metas): - s.seq_group.update_num_computed_tokens(meta.token_chunk_size) + for s in out.scheduled_seq_groups: + s.seq_group.update_num_computed_tokens(s.token_chunk_size) return metas, out +def append_new_token_seq(seq: Sequence, token_id: int): + seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + def append_new_token_seq_group(token_chunk_size, seq_group, token_id: int): seq_group.update_num_computed_tokens(token_chunk_size) for seq in seq_group.get_seqs(): seq.append_token_id(token_id, {token_id: Logprob(token_id)}) + + +class SchedulerProxy: + """ + A proxy class to forward calls to the scheduler. + """ + + def __init__(self, scheduler: Scheduler): + self.scheduler_ = scheduler + self.call_history: Dict[str, List[Any]] = defaultdict(list) + + def __getattr__(self, name: str) -> Any: + + def wrapper(*args, **kwargs): + result = getattr(self.scheduler_, name)(*args, **kwargs) + self.call_history[name].append((args, kwargs, result)) + return result + + return wrapper + + def last_schedule_ret( + self, ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, Any]: + _, _, ret = self.call_history["schedule"][-1] + return ret diff --git a/vllm/tests/distributed/test_ca_buffer_sharing.py b/vllm/tests/distributed/test_ca_buffer_sharing.py new file mode 100644 index 000000000..fc4043cd3 --- /dev/null +++ b/vllm/tests/distributed/test_ca_buffer_sharing.py @@ -0,0 +1,59 @@ +# can only run on machines with p2p access across GPUs +# can only run with torchrun: +# torchrun --nproc_per_node=2 tests/distributed/test_ca_buffer_sharing.py + +import ctypes + +import torch +import torch.distributed as dist + +from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary +from vllm.distributed.device_communicators.custom_all_reduce import ( # noqa + CustomAllreduce) + +# create a cpu process group for communicating metadata (ipc handle) +dist.init_process_group(backend="gloo") +rank = local_rank = dist.get_rank() +world_size = dist.get_world_size() + +# every process sets its own device (differently) +lib = CudaRTLibrary() +lib.cudaSetDevice(rank) + +buffer_size_in_bytes = 1024 +byte_value = 2 # the value we write to the buffer for verification + +pointers = CustomAllreduce.create_shared_buffer(buffer_size_in_bytes) + +print(f"Rank {rank} has pointers {pointers}") + +dist.barrier() +torch.cuda.synchronize() + +if rank == 0: + # the first rank tries to write to all buffers + for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemset(pointer, byte_value, buffer_size_in_bytes) + +dist.barrier() +torch.cuda.synchronize() + +host_data = (ctypes.c_char * buffer_size_in_bytes)() + +# all ranks read from all buffers, and check if the data is correct +for p in pointers: + pointer = ctypes.c_void_p(p) + lib.cudaMemcpy(host_data, pointer, buffer_size_in_bytes) + for i in range(buffer_size_in_bytes): + assert ord(host_data[i]) == byte_value, ( + f"Rank {rank} failed" + f" to verify buffer {p}. Expected {byte_value}, " + f"got {ord(host_data[i])}") + +print(f"Rank {rank} verified all buffers") + +dist.barrier() +torch.cuda.synchronize() + +CustomAllreduce.free_shared_buffer(pointers) diff --git a/vllm/tests/distributed/test_custom_all_reduce.py b/vllm/tests/distributed/test_custom_all_reduce.py index 95435e753..86ca1948e 100644 --- a/vllm/tests/distributed/test_custom_all_reduce.py +++ b/vllm/tests/distributed/test_custom_all_reduce.py @@ -95,13 +95,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) diff --git a/vllm/tests/distributed/test_pipeline_parallel.py b/vllm/tests/distributed/test_pipeline_parallel.py index ed6360f9d..386877e0e 100644 --- a/vllm/tests/distributed/test_pipeline_parallel.py +++ b/vllm/tests/distributed/test_pipeline_parallel.py @@ -32,6 +32,8 @@ class PPTestOptions(NamedTuple): multi_node_only: bool trust_remote_code: bool tokenizer_mode: Optional[str] + load_format: Optional[str] = None + hf_overrides: Optional[str] = None @dataclass @@ -50,6 +52,8 @@ def detailed( task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -78,7 +82,9 @@ def detailed( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) @staticmethod @@ -90,6 +96,8 @@ def fast( multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -102,7 +110,9 @@ def fast( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) def iter_params(self, model_name: str): @@ -156,17 +166,17 @@ def iter_params(self, model_name: str): "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), - "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "shanearora/OLMo-7B-1124-hf": PPTestSettings.fast(), + "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), + "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "adept/persimmon-8b-chat": PPTestSettings.fast(), + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), - "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), + "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), "stabilityai/stablelm-3b-4e1t": PPTestSettings.fast(), "bigcode/starcoder2-3b": PPTestSettings.fast(), @@ -214,9 +224,9 @@ def iter_params(self, model_name: str): # NOTE: You can update this on your local machine to run specific tests TEST_MODELS = [ # [LANGUAGE GENERATION] + "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", - "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -238,7 +248,8 @@ def _compare_tp( method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup - multi_node_only, trust_remote_code, tokenizer_mode = test_options + multi_node_only, trust_remote_code, tokenizer_mode, \ + load_format, hf_overrides = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -267,6 +278,10 @@ def _compare_tp( common_args.append("--trust-remote-code") if tokenizer_mode: common_args.extend(["--tokenizer-mode", tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", hf_overrides]) if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 and chunked_prefill): diff --git a/vllm/tests/distributed/test_pynccl.py b/vllm/tests/distributed/test_pynccl.py index e0e424439..3e9b0e10a 100644 --- a/vllm/tests/distributed/test_pynccl.py +++ b/vllm/tests/distributed/test_pynccl.py @@ -60,9 +60,9 @@ def worker_fn(): tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) with pynccl_comm.change_state(enable=True): - pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == pynccl_comm.world_size + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -84,14 +84,14 @@ def multiple_allreduce_worker_fn(): with pynccl_comm.change_state(enable=True): # two groups can communicate independently if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.all_reduce(tensor) - pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 4 + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() else: - pynccl_comm.all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 2 + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -112,12 +112,12 @@ def multiple_allreduce_with_vllm_worker_fn(): if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 4 + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() else: tensor = tensor_model_parallel_all_reduce(tensor) - result = tensor.mean().cpu().item() - assert result == 2 + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -140,14 +140,82 @@ def worker_fn_with_cudagraph(): with torch.cuda.graph( graph, stream=pynccl_comm.stream), pynccl_comm.change_state( enable=True): - # operation during the graph capture is recorded but not executed - # see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#creating-a-graph-using-stream-capture # noqa - pynccl_comm.all_reduce(a) - pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**0 + a_out = pynccl_comm.all_reduce(a) + torch.cuda.synchronize() graph.replay() - pynccl_comm.stream.synchronize() - assert a.mean().cpu().item() == pynccl_comm.world_size**1 + torch.cuda.synchronize() + assert torch.all(a_out == pynccl_comm.world_size).cpu().item() + + +@worker_fn_wrapper +def all_gather_worker_fn(): + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + + rank = pynccl_comm.rank + world_size = pynccl_comm.world_size + device = f'cuda:{pynccl_comm.rank}' + + num_elems = 1000 + tensor = torch.arange(num_elems, dtype=torch.float32, + device=device) + rank * num_elems + result = torch.zeros(num_elems * world_size, + dtype=torch.float32, + device=device) + + expected = torch.cat([ + torch.arange(num_elems, dtype=torch.float32) + r * num_elems + for r in range(world_size) + ]).to(device) + + with pynccl_comm.change_state(enable=True): + pynccl_comm.all_gather(result, tensor) + torch.cuda.synchronize() + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_all_gather(): + distributed_run(all_gather_worker_fn, 2) + + +@worker_fn_wrapper +def reduce_scatter_worker_fn(): + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + + rank = pynccl_comm.rank + world_size = pynccl_comm.world_size + device = f'cuda:{pynccl_comm.rank}' + + num_elems = 1000 + tensor = torch.arange(num_elems, dtype=torch.float32, + device=device) + rank * num_elems + assert (num_elems % world_size == 0) + result = torch.zeros(num_elems // world_size, + dtype=torch.float32, + device=device) + + # Calculate expected result for this rank's chunk + scattered_size = num_elems // world_size + all_tensors = [ + torch.arange(num_elems, dtype=torch.float32) + r * num_elems + for r in range(world_size) + ] + expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] + for tensor in all_tensors).to(device) + + with pynccl_comm.change_state(enable=True): + pynccl_comm.reduce_scatter(result, tensor) + torch.cuda.synchronize() + torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) + + +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason="Need at least 2 GPUs to run the test.") +def test_pynccl_reduce_scatter(): + distributed_run(reduce_scatter_worker_fn, 2) @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -175,8 +243,8 @@ def send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - result = tensor.mean().cpu().item() - assert result == 1 + torch.cuda.synchronize() + assert torch.all(tensor == 1).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 2, @@ -214,11 +282,11 @@ def multiple_send_recv_worker_fn(): pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) - result = tensor.mean().cpu().item() + torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: - assert result == 1 + assert torch.all(tensor == 1).cpu().item() else: - assert result == 2 + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -227,6 +295,38 @@ def test_pynccl_multiple_send_recv(): distributed_run(multiple_send_recv_worker_fn, 4) +@pytest.mark.skipif(torch.cuda.device_count() < 4, + reason="Need at least 4 GPUs to run the test.") +def test_pynccl_broadcast(): + distributed_run(broadcast_worker_fn, 4) + + +@worker_fn_wrapper +def broadcast_worker_fn(): + # Test broadcast for every root rank. + # Essentially this is an all-gather operation. + pynccl_comm = PyNcclCommunicator(get_world_group().cpu_group, + device=get_world_group().device) + recv_tensors = [ + torch.empty(16, + 1024, + 1024, + dtype=torch.float32, + device=pynccl_comm.device) + for i in range(pynccl_comm.world_size) + ] + recv_tensors[pynccl_comm.rank] = torch.ones( + 16, 1024, 1024, dtype=torch.float32, + device=pynccl_comm.device) * pynccl_comm.rank + + for i in range(pynccl_comm.world_size): + pynccl_comm.broadcast(recv_tensors[i], src=i) + # the broadcast op might be launched in a different stream + # need to synchronize to make sure the tensor is ready + torch.cuda.synchronize() + assert torch.all(recv_tensors[i] == i).cpu().item() + + def test_ncclGetUniqueId(): lib = NCCLLibrary() unique_id = lib.ncclGetUniqueId() diff --git a/vllm/tests/distributed/test_utils.py b/vllm/tests/distributed/test_utils.py index a51a9909f..5fb1ae7b2 100644 --- a/vllm/tests/distributed/test_utils.py +++ b/vllm/tests/distributed/test_utils.py @@ -1,9 +1,17 @@ +import socket + +import pytest import ray +import torch import vllm.envs as envs -from vllm.utils import (cuda_device_count_stateless, +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.utils import StatelessProcessGroup +from vllm.utils import (cuda_device_count_stateless, get_open_port, update_environment_variables) +from ..utils import multi_gpu_test + @ray.remote class _CUDADeviceCountStatelessTestActor: @@ -24,10 +32,110 @@ def test_cuda_device_count_stateless(): CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() - assert sorted(ray.get( - actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] + assert len( + sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(","))) == 2 assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 ray.get(actor.set_cuda_visible_devices.remote("")) assert ray.get(actor.get_count.remote()) == 0 + + +def cpu_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + if rank <= 2: + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) + data = torch.tensor([rank]) + data = pg1.broadcast_obj(data, src=2) + assert data.item() == 2 + if rank <= 2: + data = torch.tensor([rank + 1]) + data = pg2.broadcast_obj(data, src=2) + assert data.item() == 3 + pg2.barrier() + pg1.barrier() + + +def gpu_worker(rank, WORLD_SIZE, port1, port2): + torch.cuda.set_device(rank) + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + pynccl1 = PyNcclCommunicator(pg1, device=rank) + if rank <= 2: + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) + pynccl2 = PyNcclCommunicator(pg2, device=rank) + data = torch.tensor([rank]).cuda() + pynccl1.all_reduce(data) + pg1.barrier() + torch.cuda.synchronize() + if rank <= 2: + pynccl2.all_reduce(data) + pg2.barrier() + torch.cuda.synchronize() + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +def broadcast_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + if rank == 2: + pg1.broadcast_obj("secret", src=2) + else: + obj = pg1.broadcast_obj(None, src=2) + assert obj == "secret" + pg1.barrier() + + +def allgather_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, + rank=rank, + world_size=WORLD_SIZE) + data = pg1.all_gather_obj(rank) + assert data == list(range(WORLD_SIZE)) + pg1.barrier() + + +@pytest.mark.skip(reason="This test is flaky and prone to hang.") +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize( + "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) +def test_stateless_process_group(worker): + port1 = get_open_port() + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port1)) + port2 = get_open_port() + WORLD_SIZE = 4 + from multiprocessing import get_context + ctx = get_context("fork") + processes = [] + for i in range(WORLD_SIZE): + rank = i + processes.append( + ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))) + for p in processes: + p.start() + for p in processes: + p.join() + for p in processes: + assert not p.exitcode + print("All processes finished.") diff --git a/vllm/tests/encoder_decoder/test_e2e_correctness.py b/vllm/tests/encoder_decoder/test_e2e_correctness.py index bef0c515b..fa5d6a69a 100644 --- a/vllm/tests/encoder_decoder/test_e2e_correctness.py +++ b/vllm/tests/encoder_decoder/test_e2e_correctness.py @@ -7,12 +7,18 @@ import pytest from transformers import AutoModelForSeq2SeqLM +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs from ..conftest import DecoderPromptType from ..models.utils import check_logprobs_close +LIST_ENC_DEC_SUPPORTED_BACKENDS = [ + _Backend.XFORMERS, _Backend.FLASH_ATTN, None +] + def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -28,8 +34,16 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) -@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("decoder_prompt_type", list(DecoderPromptType)) @@ -48,6 +62,7 @@ def test_encoder_decoder_e2e( num_logprobs: int, decoder_prompt_type: DecoderPromptType, enforce_eager: bool, + attn_backend: _Backend, ) -> None: ''' End-to-End (E2E) test for the encoder-decoder framework. @@ -56,43 +71,49 @@ def test_encoder_decoder_e2e( implementations to ensure that both implementations produce consistent and correct results. ''' - test_case_prompts = example_encoder_decoder_prompts[decoder_prompt_type] - - # Configuration settings for HF baseline - hf_kwargs = { - "top_k": None, - "num_beams": 1, - "repetition_penalty": 1.0, - "top_p": 1.0, - "length_penalty": 1.0, - "early_stopping": False, - "no_repeat_ngram_size": None, - "min_length": 0 - } - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForSeq2SeqLM) as hf_model: - hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit( - test_case_prompts, - max_tokens, - num_logprobs, - **hf_kwargs, - )) - with vllm_runner(model, dtype=dtype, - enforce_eager=enforce_eager) as vllm_model: - vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( - test_case_prompts, max_tokens, num_logprobs) - - hf_skip_tokens = (1 - if decoder_prompt_type == DecoderPromptType.NONE else 0) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, decoder_prompt_type) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - num_outputs_0_skip_tokens=hf_skip_tokens, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + test_case_prompts = example_encoder_decoder_prompts[ + decoder_prompt_type] + + # Configuration settings for HF baseline + hf_kwargs = { + "top_k": None, + "num_beams": 1, + "repetition_penalty": 1.0, + "top_p": 1.0, + "length_penalty": 1.0, + "early_stopping": False, + "no_repeat_ngram_size": None, + "min_length": 0 + } + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForSeq2SeqLM) as hf_model: + hf_outputs = ( + hf_model.generate_encoder_decoder_greedy_logprobs_limit( + test_case_prompts, + max_tokens, + num_logprobs, + **hf_kwargs, + )) + with vllm_runner(model, dtype=dtype, + enforce_eager=enforce_eager) as vllm_model: + vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs( + test_case_prompts, max_tokens, num_logprobs) + + hf_skip_tokens = (1 if decoder_prompt_type == DecoderPromptType.NONE + else 0) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=[ + vllm_to_hf_output(vllm_output, decoder_prompt_type) + for vllm_output in vllm_outputs + ], + name_0="hf", + name_1="vllm", + num_outputs_0_skip_tokens=hf_skip_tokens, + ) diff --git a/vllm/tests/engine/output_processor/test_stop_checker.py b/vllm/tests/engine/output_processor/test_stop_checker.py index 0d84443c5..cc14e8cbf 100644 --- a/vllm/tests/engine/output_processor/test_stop_checker.py +++ b/vllm/tests/engine/output_processor/test_stop_checker.py @@ -4,6 +4,7 @@ from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus @@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={"prompt_token_ids": []}, + inputs=token_inputs([]), block_size=16, eos_token_id=eos_token_id, ) diff --git a/vllm/tests/engine/test_arg_utils.py b/vllm/tests/engine/test_arg_utils.py index f7dc167fe..de78d41ad 100644 --- a/vllm/tests/engine/test_arg_utils.py +++ b/vllm/tests/engine/test_arg_utils.py @@ -2,6 +2,7 @@ import pytest +from vllm.config import PoolerConfig from vllm.engine.arg_utils import EngineArgs, nullable_kvs from vllm.utils import FlexibleArgumentParser @@ -30,6 +31,64 @@ def test_limit_mm_per_prompt_parser(arg, expected): assert args.limit_mm_per_prompt == expected +def test_compilation_config(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + + # default value + args = parser.parse_args([]) + assert args.compilation_config is None + + # set to O3 + args = parser.parse_args(["-O3"]) + assert args.compilation_config.level == 3 + + # set to O 3 (space) + args = parser.parse_args(["-O", "3"]) + assert args.compilation_config.level == 3 + + # set to O 3 (equals) + args = parser.parse_args(["-O=3"]) + assert args.compilation_config.level == 3 + + # set to json + args = parser.parse_args(["--compilation-config", '{"level": 3}']) + assert args.compilation_config.level == 3 + + # set to json + args = parser.parse_args(['--compilation-config={"level": 3}']) + assert args.compilation_config.level == 3 + + +def test_prefix_cache_default(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([]) + + engine_args = EngineArgs.from_cli_args(args=args) + assert (not engine_args.enable_prefix_caching + ), "prefix caching defaults to off." + + # with flag to turn it on. + args = parser.parse_args(["--enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.enable_prefix_caching + + # with disable flag to turn it off. + args = parser.parse_args(["--no-enable-prefix-caching"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert not engine_args.enable_prefix_caching + + +def test_valid_pooling_config(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args([ + '--override-pooler-config', + '{"pooling_type": "MEAN"}', + ]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.override_pooler_config == PoolerConfig( + pooling_type="MEAN", ) + + @pytest.mark.parametrize( ("arg"), [ diff --git a/vllm/tests/engine/test_short_mm_context.py b/vllm/tests/engine/test_short_mm_context.py new file mode 100644 index 000000000..a6ba7a131 --- /dev/null +++ b/vllm/tests/engine/test_short_mm_context.py @@ -0,0 +1,29 @@ +import pytest + +from ..conftest import IMAGE_ASSETS + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "USER: \nWhat's the content of the image?\nASSISTANT:", + "cherry_blossom": + "USER: \nWhat is the season?\nASSISTANT:", +}) + +models = ["llava-hf/llava-1.5-7b-hf"] + + +@pytest.mark.parametrize("model", models) +def test_context_length_too_short(vllm_runner, image_assets, model): + images = [asset.pil_image for asset in image_assets] + + with pytest.raises(ValueError, match="too long to fit into the model"): + vllm_model = vllm_runner( + model, + max_model_len=128, # LLaVA has a feature size of 576 + enforce_eager=True, + ) + + with vllm_model: + vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], + max_tokens=1, + images=[images[0]]) diff --git a/vllm/tests/entrypoints/conftest.py b/vllm/tests/entrypoints/conftest.py index e7ef5637c..0f7d15e1d 100644 --- a/vllm/tests/entrypoints/conftest.py +++ b/vllm/tests/entrypoints/conftest.py @@ -69,6 +69,37 @@ def sample_json_schema(): } +@pytest.fixture +def sample_complex_json_schema(): + return { + "type": "object", + "properties": { + "score": { + "type": "integer", + "minimum": 0, + "maximum": 100 # Numeric range + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$" # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$" + }, + "tags": { + "type": "array", + "items": { + "type": "string", + "pattern": + "^[a-z]{1,10}$" # Combining length and pattern restrictions + } + } + }, + "required": ["score", "grade", "email", "tags"] + } + + @pytest.fixture def sample_guided_choice(): return [ diff --git a/vllm/tests/entrypoints/llm/test_accuracy.py b/vllm/tests/entrypoints/llm/test_accuracy.py new file mode 100644 index 000000000..6bf7190a6 --- /dev/null +++ b/vllm/tests/entrypoints/llm/test_accuracy.py @@ -0,0 +1,56 @@ +""" +This file test accuracy of the vLLM server via LMEval. +It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real work usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. +""" + +import lm_eval +import pytest + +from vllm.platforms import current_platform + +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +EXPECTED_VALUE = 0.58 + + +def run_test(): + """Run the end to end accuracy test.""" + + model_args = f"pretrained={MODEL_NAME},max_model_len=2048" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 is currently only supported on CUDA.") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test() + + +def test_lm_eval_accuracy_v0_engine(monkeypatch): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test() diff --git a/vllm/tests/entrypoints/llm/test_encode.py b/vllm/tests/entrypoints/llm/test_encode.py index 4c9f796e5..411638092 100644 --- a/vllm/tests/entrypoints/llm/test_encode.py +++ b/vllm/tests/entrypoints/llm/test_encode.py @@ -3,7 +3,7 @@ import pytest -from vllm import LLM, EmbeddingRequestOutput, PoolingParams +from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm.distributed import cleanup_dist_env_and_memory MODEL_NAME = "intfloat/e5-mistral-7b-instruct" @@ -43,8 +43,8 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: List[EmbeddingRequestOutput], - o2: List[EmbeddingRequestOutput]): +def assert_outputs_equal(o1: List[PoolingRequestOutput], + o2: List[PoolingRequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] diff --git a/vllm/tests/entrypoints/llm/test_guided_generate.py b/vllm/tests/entrypoints/llm/test_guided_generate.py index 67c79415f..de6257cfc 100644 --- a/vllm/tests/entrypoints/llm/test_guided_generate.py +++ b/vllm/tests/entrypoints/llm/test_guided_generate.py @@ -76,6 +76,34 @@ def test_guided_json_completion(sample_json_schema, llm): jsonschema.validate(instance=output_json, schema=sample_json_schema) +@pytest.mark.skip_global_cleanup +def test_guided_complex_json_completion(sample_complex_json_schema, llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=1000, + guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema)) + outputs = llm.generate(prompts=[ + f"Give an example JSON for an assignment grade " + f"that fits this schema: {sample_complex_json_schema}" + ] * 2, + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + prompt = output.prompt + + generated_text = output.outputs[0].text + assert generated_text is not None + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + output_json = json.loads(generated_text) + jsonschema.validate(instance=output_json, + schema=sample_complex_json_schema) + + @pytest.mark.skip_global_cleanup def test_guided_choice_completion(sample_guided_choice, llm): sampling_params = SamplingParams( @@ -159,3 +187,30 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm): sampling_params=sampling_params, use_tqdm=True, guided_options_request=dict(guided_regex=sample_regex)) + + +@pytest.mark.skip_global_cleanup +def test_guided_json_object(llm): + sampling_params = SamplingParams( + temperature=1.0, + max_tokens=100, + guided_decoding=GuidedDecodingParams(json_object=True)) + + outputs = llm.generate( + prompts=("Generate a JSON object describing a person with name " + "and age for John Smith who is 31 years old."), + sampling_params=sampling_params, + use_tqdm=True) + + assert outputs is not None + for output in outputs: + assert output is not None + assert isinstance(output, RequestOutput) + + generated_text = output.outputs[0].text + print(generated_text) + assert generated_text is not None + + # Parse to verify it is valid JSON + parsed_json = json.loads(generated_text) + assert isinstance(parsed_json, dict) diff --git a/vllm/tests/entrypoints/llm/test_lazy_outlines.py b/vllm/tests/entrypoints/llm/test_lazy_outlines.py index cbfb0cc32..2c53676c5 100644 --- a/vllm/tests/entrypoints/llm/test_lazy_outlines.py +++ b/vllm/tests/entrypoints/llm/test_lazy_outlines.py @@ -1,12 +1,13 @@ import sys +from contextlib import nullcontext + +from vllm_test_utils import BlameResult, blame from vllm import LLM, SamplingParams from vllm.distributed import cleanup_dist_env_and_memory -def test_lazy_outlines(sample_regex): - """If users don't use guided decoding, outlines should not be imported. - """ +def run_normal(): prompts = [ "Hello, my name is", "The president of the United States is", @@ -25,13 +26,12 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - # make sure outlines is not imported - assert 'outlines' not in sys.modules - # Destroy the LLM object and free up the GPU memory. del llm cleanup_dist_env_and_memory() + +def run_lmfe(sample_regex): # Create an LLM with guided decoding enabled. llm = LLM(model="facebook/opt-125m", enforce_eager=True, @@ -51,5 +51,26 @@ def test_lazy_outlines(sample_regex): generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + +def test_lazy_outlines(sample_regex): + """If users don't use guided decoding, outlines should not be imported. + """ # make sure outlines is not imported - assert 'outlines' not in sys.modules + module_name = "outlines" + # In CI, we only check finally if the module is imported. + # If it is indeed imported, we can rerun the test with `use_blame=True`, + # which will trace every function call to find the first import location, + # and help find the root cause. + # We don't run it in CI by default because it is slow. + use_blame = False + context = blame( + lambda: module_name in sys.modules) if use_blame else nullcontext() + with context as result: + run_normal() + run_lmfe(sample_regex) + if use_blame: + assert isinstance(result, BlameResult) + print(f"the first import location is:\n{result.trace_stack}") + assert module_name not in sys.modules, ( + f"Module {module_name} is imported. To see the first" + f" import location, run the test with `use_blame=True`.") diff --git a/vllm/tests/entrypoints/llm/test_prompt_validation.py b/vllm/tests/entrypoints/llm/test_prompt_validation.py index 565dfa013..ee7010a23 100644 --- a/vllm/tests/entrypoints/llm/test_prompt_validation.py +++ b/vllm/tests/entrypoints/llm/test_prompt_validation.py @@ -3,7 +3,22 @@ from vllm import LLM +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def test_empty_prompt(): - llm = LLM(model="gpt2") + llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) + + +@pytest.mark.skip_v1 +def test_out_of_vocab_token(): + llm = LLM(model="gpt2", enforce_eager=True) + with pytest.raises(ValueError, match='out of vocabulary'): + llm.generate({"prompt_token_ids": [999999]}) diff --git a/vllm/tests/entrypoints/openai/test_accuracy.py b/vllm/tests/entrypoints/openai/test_accuracy.py index 63beaaba2..b1d4461d1 100644 --- a/vllm/tests/entrypoints/openai/test_accuracy.py +++ b/vllm/tests/entrypoints/openai/test_accuracy.py @@ -10,6 +10,8 @@ import lm_eval import pytest +from vllm.platforms import current_platform + from ...utils import RemoteOpenAIServer MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" @@ -18,22 +20,33 @@ FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUE = 0.58 -DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"] +DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"] MORE_ARGS_LIST = [ + [], # Default ["--enable-chunked-prefill"], # Chunked ["--num-scheduler-steps", "8"], # MS ["--num-scheduler-steps", "8", "--multi-step-stream-outputs"] # MS+Stream ] +MAX_WAIT_SECONDS = None +if current_platform.is_tpu(): + MORE_ARGS_LIST = [ + [], # Default + # ["--num-scheduler-steps", "8"], # Multi-step << currently fails + ] + MAX_WAIT_SECONDS = 600 + + +def run_test(more_args): + """Run the end to end accuracy test.""" -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy(more_args): args = list(DEFAULT_ARGS) args.extend(more_args) - print(f"Running with: {args}") - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer( + MODEL_NAME, args, + max_wait_seconds=MAX_WAIT_SECONDS) as remote_server: url = f"{remote_server.url_for('v1')}/completions" model_args = ( @@ -51,3 +64,22 @@ def test_lm_eval_accuracy(more_args): assert (measured_value - RTOL < EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 currently only supported on CUDA") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test([]) + + +@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) +def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test(more_args) diff --git a/vllm/tests/entrypoints/openai/test_async_tokenization.py b/vllm/tests/entrypoints/openai/test_async_tokenization.py new file mode 100644 index 000000000..fcce8b46c --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_async_tokenization.py @@ -0,0 +1,137 @@ +import asyncio +import contextlib +import random +import time +from typing import Callable + +import openai +import pytest +import pytest_asyncio +import requests + +from tests.utils import RemoteOpenAIServer + +MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct" + + +@pytest.fixture(scope="module") +def server(): # noqa: F811 + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "8192", + "--enforce-eager", + "--max-num-seqs", + "128", + "--load-format", + "dummy", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ids=["completion", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 10_000) + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 10_000) + }] + }), + ], +) +async def test_with_and_without_truncate( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + num_requests = 10 + truncate_prompt_tokens = ([1000] * (num_requests // 2) + [None] * + (num_requests - num_requests // 2)) + random.shuffle(truncate_prompt_tokens) + + bodies = [{ + **body, "extra_body": { + 'truncate_prompt_tokens': t + } + } for t in truncate_prompt_tokens] + + async def get_status_code(**kwargs): + try: + await create_func(**kwargs) + return 200 + except openai.APIStatusError as e: + return e.status_code + + responses = await asyncio.gather(*[get_status_code(**b) for b in bodies]) + assert 500 not in responses + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ids=["single completion", "multiple completions", "chat"], + argnames=["create_func_gen", "content_body"], + argvalues=[ + (lambda x: x.completions.create, { + "prompt": " ".join(['A'] * 300_000) + }), + (lambda x: x.completions.create, { + "prompt": [" ".join(['A'] * 300_000)] * 2 + }), + (lambda x: x.chat.completions.create, { + "messages": [{ + "role": "user", + "content": " ".join(['A'] * 300_000) + }] + }), + ], +) +async def test_healthcheck_response_time( + server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + create_func_gen: Callable, + content_body: dict, +): + num_requests = 50 + + create_func = create_func_gen(client) + body = {"model": MODEL_NAME, **content_body, "max_tokens": 10} + + def get_response_time(url): + start_time = time.monotonic() + res = requests.get(url) + end_time = time.monotonic() + assert res.status_code == 200 + return end_time - start_time + + no_load_response_time = get_response_time(server.url_for("health")) + tasks = [ + asyncio.create_task(create_func(**body)) for _ in range(num_requests) + ] + await asyncio.sleep(1) # give the tasks a chance to start running + load_response_time = get_response_time(server.url_for("health")) + + with contextlib.suppress(openai.APIStatusError): + await asyncio.gather(*tasks) + + assert load_response_time < 100 * no_load_response_time + assert load_response_time < 0.1 diff --git a/vllm/tests/entrypoints/openai/test_audio.py b/vllm/tests/entrypoints/openai/test_audio.py index df8a14028..a74109e2f 100644 --- a/vllm/tests/entrypoints/openai/test_audio.py +++ b/vllm/tests/entrypoints/openai/test_audio.py @@ -68,11 +68,12 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -91,7 +92,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -123,11 +124,12 @@ async def test_single_chat_session_audio_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -146,7 +148,7 @@ async def test_single_chat_session_audio_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -178,7 +180,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -188,7 +190,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -242,7 +244,7 @@ async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) diff --git a/vllm/tests/entrypoints/openai/test_basic.py b/vllm/tests/entrypoints/openai/test_basic.py index d3aea533b..4616f363c 100644 --- a/vllm/tests/entrypoints/openai/test_basic.py +++ b/vllm/tests/entrypoints/openai/test_basic.py @@ -1,7 +1,6 @@ from http import HTTPStatus from typing import List -import openai import pytest import pytest_asyncio import requests @@ -83,10 +82,8 @@ async def client(server): indirect=True, ) @pytest.mark.asyncio -async def test_show_version(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/version") +async def test_show_version(server: RemoteOpenAIServer): + response = requests.get(server.url_for("version")) response.raise_for_status() assert response.json() == {"version": VLLM_VERSION} @@ -102,9 +99,7 @@ async def test_show_version(client: openai.AsyncOpenAI): indirect=True, ) @pytest.mark.asyncio -async def test_check_health(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - - response = requests.get(base_url + "/health") +async def test_check_health(server: RemoteOpenAIServer): + response = requests.get(server.url_for("health")) assert response.status_code == HTTPStatus.OK diff --git a/vllm/tests/entrypoints/openai/test_chat.py b/vllm/tests/entrypoints/openai/test_chat.py index d1aebbd70..8d23a2be6 100644 --- a/vllm/tests/entrypoints/openai/test_chat.py +++ b/vllm/tests/entrypoints/openai/test_chat.py @@ -65,11 +65,12 @@ async def test_no_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=False) choice = chat_completion.choices[0] assert choice.logprobs is None @@ -90,12 +91,13 @@ async def test_zero_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=0) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=0) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -117,12 +119,13 @@ async def test_some_logprobs_chat(client: openai.AsyncOpenAI, model_name: str): "content": "what is 1+1?" }] - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=5, - temperature=0.0, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=5, + temperature=0.0, + logprobs=True, + top_logprobs=5) choice = chat_completion.choices[0] assert choice.logprobs is not None @@ -149,7 +152,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises((openai.BadRequestError, openai.APIError)): stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=21, stream=True) @@ -159,16 +162,17 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=30, stream=False) # the server should still work afterwards - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - stream=False) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + stream=False) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -271,11 +275,12 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert chat_completion.id is not None assert len(chat_completion.choices) == 1 @@ -294,7 +299,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 @@ -319,7 +324,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -329,7 +334,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str): stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -369,7 +374,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={"include_usage": False}) @@ -380,7 +385,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, # "continuous_usage_stats": False}} stream = await client.chat.completions.create(model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, stream_options={ @@ -409,7 +414,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": None}) @@ -419,7 +424,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=False, stream_options={"include_usage": True}) @@ -429,7 +434,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(min_tokens=10), temperature=0.0, stream=True, @@ -476,7 +481,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice1 = chat_completion.choices[0].message.content @@ -490,7 +495,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, extra_body=dict(guided_choice=sample_guided_choice, guided_decoding_backend=guided_decoding_backend)) choice2 = chat_completion.choices[0].message.content @@ -517,7 +522,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -535,7 +540,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, extra_body=dict(guided_json=sample_json_schema, guided_decoding_backend=guided_decoding_backend)) message = chat_completion.choices[0].message @@ -563,7 +568,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip1 = chat_completion.choices[0].message.content @@ -575,7 +580,7 @@ async def test_guided_regex_chat(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=20, + max_completion_tokens=20, extra_body=dict(guided_regex=sample_regex, guided_decoding_backend=guided_decoding_backend)) ip2 = chat_completion.choices[0].message.content @@ -623,7 +628,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=10, + max_completion_tokens=10, logprobs=True, top_logprobs=5, extra_body=dict(guided_choice=sample_guided_choice, @@ -660,7 +665,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -694,7 +699,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -750,7 +755,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -765,7 +770,7 @@ async def test_required_tool_use_not_yet_supported( await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -796,7 +801,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, with pytest.raises(openai.BadRequestError): await client.chat.completions.create(model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tool_choice={ "type": "function", "function": { @@ -809,7 +814,7 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, await client.chat.completions.create( model=MODEL_NAME, messages=messages, - max_tokens=1000, + max_completion_tokens=1000, tools=[{ "type": "function", "function": { @@ -824,6 +829,20 @@ async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI, "name": "nondefined_function_name" } }) + with pytest.raises(openai.BadRequestError): + await client.chat.completions.create( + model=MODEL_NAME, + messages=messages, + max_completion_tokens=1000, + tools=[{ + "type": "function", + "function": { + "name": "dummy_function_name", + "description": "This is a dummy function", + "parameters": sample_json_schema + } + }], + tool_choice={}) @pytest.mark.asyncio @@ -894,19 +913,19 @@ async def test_response_format_json_schema(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_extra_fields(client: openai.AsyncOpenAI): - with pytest.raises(BadRequestError) as exc_info: - await client.chat.completions.create( - model=MODEL_NAME, - messages=[{ - "role": "system", - "content": "You are a helpful assistant.", - "extra_field": "0", - }], # type: ignore - temperature=0, - seed=0) - - assert "extra_forbidden" in exc_info.value.message +async def test_extra_fields_allowed(client: openai.AsyncOpenAI): + resp = await client.chat.completions.create( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?", + "extra_field": "0", + }], # type: ignore + temperature=0, + seed=0) + + content = resp.choices[0].message.content + assert content is not None @pytest.mark.asyncio diff --git a/vllm/tests/entrypoints/openai/test_chat_echo.py b/vllm/tests/entrypoints/openai/test_chat_echo.py new file mode 100644 index 000000000..223ac5b41 --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_chat_echo.py @@ -0,0 +1,79 @@ +from typing import NamedTuple + +import openai # use the official client for correctness check +import pytest +import pytest_asyncio + +from ...utils import RemoteOpenAIServer + +# # any model with a chat template should work here +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--enforce-eager", + "--max-model-len", + "4080", + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +class TestCase(NamedTuple): + model_name: str + echo: bool + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "test_case", + [ + TestCase(model_name=MODEL_NAME, echo=True), + TestCase(model_name=MODEL_NAME, echo=False) + ], +) +async def test_chat_session_with_echo_and_continue_final_message( + client: openai.AsyncOpenAI, test_case: TestCase): + saying: str = "Here is a common saying about apple. An apple a day, keeps" + # test echo with continue_final_message parameter + chat_completion = await client.chat.completions.create( + model=test_case.model_name, + messages=[{ + "role": "user", + "content": "tell me a common saying" + }, { + "role": "assistant", + "content": saying + }], + extra_body={ + "echo": test_case.echo, + "continue_final_message": True, + "add_generation_prompt": False + }) + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "stop" + + message = choice.message + if test_case.echo: + assert message.content is not None and saying in message.content + else: + assert message.content is not None and saying not in message.content + assert message.role == "assistant" diff --git a/vllm/tests/entrypoints/openai/test_completion.py b/vllm/tests/entrypoints/openai/test_completion.py index f03bdb045..c81cfdbbe 100644 --- a/vllm/tests/entrypoints/openai/test_completion.py +++ b/vllm/tests/entrypoints/openai/test_completion.py @@ -157,15 +157,15 @@ async def test_added_lora_tokens(client: openai.AsyncOpenAI): @pytest.mark.asyncio async def test_added_lora_tokens_base_model(client: openai.AsyncOpenAI): # test using token IDs - completion = await client.completions.create( - model=MODEL_NAME, - prompt=[0, 0, 32000, 32001, 32002], - echo=True, - max_tokens=5, - temperature=0.0, - ) - # Added tokens should not appear in tokenized prompt - assert "vllm" not in completion.choices[0].text + with pytest.raises(openai.BadRequestError, match="out of vocabulary"): + # Added tokens should be rejected by the base model + await client.completions.create( + model=MODEL_NAME, + prompt=[0, 0, 32000, 32001, 32002], + echo=True, + max_tokens=5, + temperature=0.0, + ) @pytest.mark.asyncio diff --git a/vllm/tests/entrypoints/openai/test_embedding.py b/vllm/tests/entrypoints/openai/test_embedding.py index f119c6c12..9f2b77dde 100644 --- a/vllm/tests/entrypoints/openai/test_embedding.py +++ b/vllm/tests/entrypoints/openai/test_embedding.py @@ -4,14 +4,18 @@ import openai import pytest import pytest_asyncio +import requests + +from vllm.transformers_utils.tokenizer import get_tokenizer from ...utils import RemoteOpenAIServer -EMBEDDING_MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +MODEL_NAME = "intfloat/e5-mistral-7b-instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 @pytest.fixture(scope="module") -def embedding_server(): +def server(): args = [ # use half precision for speed and memory savings in CI environment "--dtype", @@ -19,31 +23,29 @@ def embedding_server(): "--enforce-eager", "--max-model-len", "8192", + "--chat-template", + DUMMY_CHAT_TEMPLATE, ] - with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @pytest_asyncio.fixture -async def embedding_client(embedding_server): - async with embedding_server.get_async_client() as async_client: +async def client(server): + async with server.get_async_client() as async_client: yield async_client @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "The chef prepared a delicious meal.", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -57,7 +59,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, # test using token IDs input_tokens = [1, 1, 1, 1, 1] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -71,18 +73,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, - model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str): # test List[str] input_texts = [ "The cat sat on the mat.", "A feline was resting on a rug.", "Stars twinkle brightly in the night sky." ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, encoding_format="float", @@ -90,11 +88,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, assert embeddings.id is not None assert len(embeddings.data) == 3 assert len(embeddings.data[0].embedding) == 4096 + assert embeddings.usage.completion_tokens == 0 + assert embeddings.usage.prompt_tokens == 32 + assert embeddings.usage.total_tokens == 32 # test List[List[int]] input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], [25, 32, 64, 77]] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, encoding_format="float", @@ -108,22 +109,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_conversation_embedding(server: RemoteOpenAIServer, + client: openai.AsyncOpenAI, + model_name: str): + messages = [{ + "role": "user", + "content": "The cat sat on the mat.", + }, { + "role": "assistant", + "content": "A feline was resting on a rug.", + }, { + "role": "user", + "content": "Stars twinkle brightly in the night sky.", + }] + + chat_response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float", + }) + chat_response.raise_for_status() + chat_embeddings = chat_response.json() + + tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast") + prompt = tokenizer.apply_chat_template( + messages, + chat_template=DUMMY_CHAT_TEMPLATE, + add_generation_prompt=True, + continue_final_message=False, + tokenize=False, + ) + completion_response = await client.embeddings.create( + model=model_name, + input=prompt, + encoding_format="float", + # To be consistent with chat + extra_body={"add_special_tokens": False}, + ) + completion_embeddings = completion_response.model_dump(mode="json") + + assert chat_embeddings.pop("id") is not None + assert completion_embeddings.pop("id") is not None + assert chat_embeddings.pop("created") <= completion_embeddings.pop( + "created") + assert chat_embeddings == completion_embeddings + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_batch_base64_embedding(client: openai.AsyncOpenAI, model_name: str): input_texts = [ "Hello my name is", "The best thing about vLLM is that it supports many different models" ] - responses_float = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="float") + responses_float = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="float") - responses_base64 = await embedding_client.embeddings.create( - input=input_texts, model=model_name, encoding_format="base64") + responses_base64 = await client.embeddings.create(input=input_texts, + model=model_name, + encoding_format="base64") decoded_responses_base64_data = [] for data in responses_base64.data: @@ -137,8 +186,8 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, 1] # Default response is float32 decoded from base64 by OpenAI Client - responses_default = await embedding_client.embeddings.create( - input=input_texts, model=model_name) + responses_default = await client.embeddings.create(input=input_texts, + model=model_name) assert responses_float.data[0].embedding == responses_default.data[ 0].embedding @@ -147,18 +196,15 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] # test single embedding - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 10}) @@ -173,7 +219,7 @@ async def test_single_embedding_truncation( 1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728, 9901, 340, 2229, 385, 340, 315, 28741, 28804, 2 ] - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_tokens, extra_body={"truncate_prompt_tokens": 10}) @@ -187,18 +233,15 @@ async def test_single_embedding_truncation( @pytest.mark.asyncio -@pytest.mark.parametrize( - "model_name", - [EMBEDDING_MODEL_NAME], -) -async def test_single_embedding_truncation_invalid( - embedding_client: openai.AsyncOpenAI, model_name: str): +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI, + model_name: str): input_texts = [ "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?", ] with pytest.raises(openai.BadRequestError): - embeddings = await embedding_client.embeddings.create( + embeddings = await client.embeddings.create( model=model_name, input=input_texts, extra_body={"truncate_prompt_tokens": 8193}) diff --git a/vllm/tests/entrypoints/openai/test_metrics.py b/vllm/tests/entrypoints/openai/test_metrics.py index 6cb74eb78..6523c8b62 100644 --- a/vllm/tests/entrypoints/openai/test_metrics.py +++ b/vllm/tests/entrypoints/openai/test_metrics.py @@ -70,18 +70,21 @@ async def client(server): [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), ("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)], + "vllm:request_params_max_tokens": + [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), + ("_count", _NUM_REQUESTS)], "vllm:prompt_tokens": [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], - "vllm:generation_tokens": - [("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], + "vllm:generation_tokens": [ + ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST) + ], "vllm:request_success": [("_total", _NUM_REQUESTS)], } @pytest.mark.asyncio -async def test_metrics_counts(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_counts(server: RemoteOpenAIServer, + client: openai.AsyncClient): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. await client.completions.create( @@ -89,7 +92,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): prompt=_TOKENIZED_PROMPT, max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) print(response.text) assert response.status_code == HTTPStatus.OK @@ -150,6 +153,9 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): "vllm:request_params_n_sum", "vllm:request_params_n_bucket", "vllm:request_params_n_count", + "vllm:request_params_max_tokens_sum", + "vllm:request_params_max_tokens_bucket", + "vllm:request_params_max_tokens_count", "vllm:num_preemptions_total", "vllm:prompt_tokens_total", "vllm:generation_tokens_total", @@ -170,16 +176,15 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): @pytest.mark.asyncio -async def test_metrics_exist(client: openai.AsyncOpenAI): - base_url = str(client.base_url)[:-3].strip("/") - +async def test_metrics_exist(server: RemoteOpenAIServer, + client: openai.AsyncClient): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", max_tokens=5, temperature=0.0) - response = requests.get(base_url + "/metrics") + response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK for metric in EXPECTED_METRICS: diff --git a/vllm/tests/entrypoints/openai/test_prompt_validation.py b/vllm/tests/entrypoints/openai/test_prompt_validation.py index 0a573a006..1ae64ef49 100644 --- a/vllm/tests/entrypoints/openai/test_prompt_validation.py +++ b/vllm/tests/entrypoints/openai/test_prompt_validation.py @@ -20,3 +20,38 @@ async def test_empty_prompt(): prompt="", max_tokens=5, temperature=0.0) + + +@pytest.mark.asyncio +async def test_out_of_vocab_token_ids(): + model_name = "gpt2" + server_args = ["--enforce-eager"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile('.*out of vocabulary.*')): + await client.completions.create(model=model_name, + prompt=[999999], + max_tokens=5, + temperature=0.0) + + +@pytest.mark.asyncio +async def test_reject_multistep_with_guided_decoding(): + model_name = "gpt2" + server_args = ["--enforce-eager", "--num-scheduler-steps", "8"] + with RemoteOpenAIServer(model_name, server_args) as remote_server: + client = remote_server.get_async_client() + + with pytest.raises(openai.BadRequestError, + match=re.compile( + '.*Guided decoding .* multi-step decoding.*')): + await client.completions.create( + model=model_name, + prompt="Hello", + max_tokens=5, + temperature=0.0, + extra_body={"response_format": { + "type": "json_object" + }}) diff --git a/vllm/tests/entrypoints/openai/test_root_path.py b/vllm/tests/entrypoints/openai/test_root_path.py new file mode 100644 index 000000000..20f796061 --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_root_path.py @@ -0,0 +1,103 @@ +import contextlib +import os +from typing import Any, List, NamedTuple + +import openai # use the official client for correctness check +import pytest + +from ...utils import RemoteOpenAIServer + +# # any model with a chat template should work here +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +API_KEY = "abc-123" +ERROR_API_KEY = "abc" +ROOT_PATH = "llm" + + +@pytest.fixture(scope="module") +def server(): + args = [ + # use half precision for speed and memory savings in CI environment + "--dtype", + "float16", + "--enforce-eager", + "--max-model-len", + "4080", + "--root-path", # use --root-path=/llm for testing + "/" + ROOT_PATH, + "--chat-template", + DUMMY_CHAT_TEMPLATE, + ] + envs = os.environ.copy() + + envs["VLLM_API_KEY"] = API_KEY + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: + yield remote_server + + +class TestCase(NamedTuple): + model_name: str + base_url: List[str] + api_key: str + expected_error: Any + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "test_case", + [ + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 + api_key=ERROR_API_KEY, + expected_error=openai.AuthenticationError), + TestCase( + model_name=MODEL_NAME, + base_url=["v1"], # http://localhost:8000/v1 + api_key=API_KEY, + expected_error=None), + TestCase( + model_name=MODEL_NAME, + base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1 + api_key=API_KEY, + expected_error=None), + ], +) +async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer, + test_case: TestCase): + saying: str = "Here is a common saying about apple. An apple a day, keeps" + ctx = contextlib.nullcontext() + if test_case.expected_error is not None: + ctx = pytest.raises(test_case.expected_error) + with ctx: + client = openai.AsyncOpenAI( + api_key=test_case.api_key, + base_url=server.url_for(*test_case.base_url), + max_retries=0) + chat_completion = await client.chat.completions.create( + model=test_case.model_name, + messages=[{ + "role": "user", + "content": "tell me a common saying" + }, { + "role": "assistant", + "content": saying + }], + extra_body={ + "continue_final_message": True, + "add_generation_prompt": False + }) + + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "stop" + message = choice.message + assert len(message.content) > 0 + assert message.role == "assistant" diff --git a/vllm/tests/entrypoints/openai/test_score.py b/vllm/tests/entrypoints/openai/test_score.py new file mode 100644 index 000000000..7565ff719 --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_score.py @@ -0,0 +1,93 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import ScoreResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-v2-m3" + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--enforce-eager", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, + model_name: str): + text_1 = "What is the capital of France?" + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + assert score.data[0].score[0] <= 0.01 + assert score.data[1].score[0] >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, + model_name: str): + text_1 = [ + "What is the capital of the United States?", + "What is the capital of France?" + ] + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 2 + assert score.data[0].score[0] <= 0.01 + assert score.data[1].score[0] >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, + model_name: str): + text_1 = "What is the capital of France?" + text_2 = "The capital of France is Paris." + + score_response = requests.post(server.url_for("v1/score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + score_response.raise_for_status() + score = ScoreResponse.model_validate(score_response.json()) + + assert score.id is not None + assert score.data is not None + assert len(score.data) == 1 + assert score.data[0].score[0] >= 0.9 diff --git a/vllm/tests/entrypoints/openai/test_serving_chat.py b/vllm/tests/entrypoints/openai/test_serving_chat.py index e969d3377..93660e611 100644 --- a/vllm/tests/entrypoints/openai/test_serving_chat.py +++ b/vllm/tests/entrypoints/openai/test_serving_chat.py @@ -26,7 +26,6 @@ class MockModelConfig: tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" - chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None multimodal_config = MultiModalConfig() @@ -49,6 +48,7 @@ async def _async_serving_chat_init(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) @@ -70,6 +70,7 @@ def test_serving_chat_should_set_correct_max_tokens(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) diff --git a/vllm/tests/entrypoints/openai/test_tokenization.py b/vllm/tests/entrypoints/openai/test_tokenization.py index 859a676a9..b1956a8cb 100644 --- a/vllm/tests/entrypoints/openai/test_tokenization.py +++ b/vllm/tests/entrypoints/openai/test_tokenization.py @@ -1,4 +1,3 @@ -import openai # use the official client for correctness check import pytest import pytest_asyncio import requests @@ -55,9 +54,11 @@ async def client(server): [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_completions(client: openai.AsyncOpenAI, - model_name: str, tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_completions( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -65,7 +66,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, prompt = "vllm1 This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_special_tokens": add_special, "model": model_name, @@ -86,9 +87,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_tokenize_chat( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") @@ -121,7 +124,7 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post(base_url + "/tokenize", + response = requests.post(server.url_for("tokenize"), json={ "add_generation_prompt": add_generation, @@ -146,17 +149,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], indirect=["tokenizer_name"], ) -async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, - tokenizer_name: str): - base_url = str(client.base_url)[:-3].strip("/") +async def test_detokenize( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer_mode="fast") prompt = "This is a test prompt. vllm1" tokens = tokenizer.encode(prompt, add_special_tokens=False) - print(f"CALLING {base_url} FOR {model_name}") - response = requests.post(base_url + "/detokenize", + response = requests.post(server.url_for("detokenize"), json={ "model": model_name, "tokens": tokens diff --git a/vllm/tests/entrypoints/openai/test_video.py b/vllm/tests/entrypoints/openai/test_video.py new file mode 100644 index 000000000..294b25036 --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_video.py @@ -0,0 +1,345 @@ +from typing import Dict, List + +import openai +import pytest +import pytest_asyncio + +from vllm.multimodal.utils import encode_video_base64, fetch_video + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +MAXIMUM_VIDEOS = 4 + +TEST_VIDEO_URLS = [ + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--dtype", + "bfloat16", + "--max-model-len", + "32768", + "--max-num-seqs", + "2", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"video={MAXIMUM_VIDEOS}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_video() -> Dict[str, str]: + return { + video_url: encode_video_base64(fetch_video(video_url)) + for video_url in TEST_VIDEO_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_chat_streaming_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "video_urls", + [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) +async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, + video_urls: List[str]): + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "video_url", + "video_url": { + "url": video_url + } + } for video_url in video_urls), + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + if len(video_urls) > MAXIMUM_VIDEOS: + with pytest.raises(openai.BadRequestError): # test multi-video input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/vllm/tests/entrypoints/openai/test_vision.py b/vllm/tests/entrypoints/openai/test_vision.py index 8311a5cb3..157d873a7 100644 --- a/vllm/tests/entrypoints/openai/test_vision.py +++ b/vllm/tests/entrypoints/openai/test_vision.py @@ -78,11 +78,12 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -101,12 +102,48 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -133,11 +170,12 @@ async def test_single_chat_session_image_base64encoded( }] # test single completion - chat_completion = await client.chat.completions.create(model=model_name, - messages=messages, - max_tokens=10, - logprobs=True, - top_logprobs=5) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) assert len(chat_completion.choices) == 1 choice = chat_completion.choices[0] @@ -156,12 +194,47 @@ async def test_single_chat_session_image_base64encoded( chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, ) message = chat_completion.choices[0].message assert message.content is not None and len(message.content) >= 0 +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, image_url: str, + base64_encoded_image: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": + f"data:image/jpeg;base64,{base64_encoded_image[image_url]}" + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @@ -188,7 +261,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) output = chat_completion.choices[0].message.content @@ -198,7 +271,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, stream = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, stream=True, ) @@ -249,7 +322,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) @@ -266,7 +339,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, chat_completion = await client.chat.completions.create( model=model_name, messages=messages, - max_tokens=10, + max_completion_tokens=10, temperature=0.0, ) message = chat_completion.choices[0].message diff --git a/vllm/tests/entrypoints/openai/test_vision_embedding.py b/vllm/tests/entrypoints/openai/test_vision_embedding.py new file mode 100644 index 000000000..d0c43b47b --- /dev/null +++ b/vllm/tests/entrypoints/openai/test_vision_embedding.py @@ -0,0 +1,99 @@ +from typing import Dict + +import pytest +import pytest_asyncio +import requests + +from vllm.multimodal.utils import encode_image_base64, fetch_image + +from ...utils import VLLM_PATH, RemoteOpenAIServer + +MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" +MAXIMUM_IMAGES = 2 + +vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja" +assert vlm2vec_jinja_path.exists() + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "embedding", + "--dtype", + "bfloat16", + "--max-model-len", + "2048", + "--max-num-seqs", + "5", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"image={MAXIMUM_IMAGES}", + "--chat-template", + str(vlm2vec_jinja_path), + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_image() -> Dict[str, str]: + return { + image_url: encode_image_base64(fetch_image(image_url)) + for image_url in TEST_IMAGE_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, + image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }] + + response = requests.post(server.url_for("v1/embeddings"), + json={ + "model": model_name, + "messages": messages, + "encoding_format": "float" + }) + response.raise_for_status() + + embeddings = response.json() + assert embeddings["id"] is not None + assert len(embeddings["data"]) == 1 + assert len(embeddings["data"][0]["embedding"]) == 3072 + assert embeddings["usage"]["completion_tokens"] == 0 + assert embeddings["usage"]["prompt_tokens"] == 762 + assert embeddings["usage"]["total_tokens"] == 762 diff --git a/vllm/tests/entrypoints/openai/tool_parsers/__init__.py b/vllm/tests/entrypoints/openai/tool_parsers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/vllm/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py new file mode 100644 index 000000000..47b0b6bb8 --- /dev/null +++ b/vllm/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -0,0 +1,160 @@ +from typing import List +from unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, run_tool_extraction_streaming) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 +SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "San Francisco", "metric": "celsius"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ( + "register_user(name='John Doe', " + "age=37, " + "address={'city': 'San Francisco', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])") +MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "John Doe", ' + '"age": 37, ' + '"address": {"city": "San Francisco", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{}', +) +EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')") +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output = "How can I help you today?" + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content == model_output + assert len(tool_calls) == 0 + + +TEST_CASES = [ + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming"), + pytest.param(True, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming"), + pytest.param(False, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming"), + pytest.param(True, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming"), + pytest.param(False, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming"), + pytest.param(True, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming"), + pytest.param(False, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming"), + pytest.param(True, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming"), + pytest.param(False, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming"), + pytest.param(True, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming"), + pytest.param(False, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming"), + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_nonstreaming"), +] + + +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", + TEST_CASES) +def test_tool_call(streaming: bool, model_output: str, + expected_tool_calls: List[FunctionCall]): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content is None + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output_deltas = [ + "[get_weather(city='San", + " Francisco', metric='celsius'), " + f"{PARAMETERLESS_FUNCTION_OUTPUT}, " + f"{EMPTY_LIST_FUNCTION_OUTPUT}]", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + + assert reconstructor.other_content == "" + assert len(reconstructor.tool_calls) == 3 + assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL diff --git a/vllm/tests/entrypoints/openai/tool_parsers/utils.py b/vllm/tests/entrypoints/openai/tool_parsers/utils.py new file mode 100644 index 000000000..f0a2a32c1 --- /dev/null +++ b/vllm/tests/entrypoints/openai/tool_parsers/utils.py @@ -0,0 +1,123 @@ +from typing import Iterable, List, Tuple, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers import ToolParser + + +class StreamingToolReconstructor: + + def __init__(self, assert_one_tool_per_delta: bool = True): + self.tool_calls: List[ToolCall] = [] + self.other_content: str = "" + self._assert_one_tool_per_delta = assert_one_tool_per_delta + + def append_delta(self, delta: DeltaMessage): + if delta.content is not None: + self.other_content += delta.content + else: + assert delta.tool_calls, ( + "Streaming results should have either content or tool calls " + "(or both)") + if self._assert_one_tool_per_delta: + # Note: This isn't strictly required by the API and may not be + # possible to adhere to depending on the token space and number of + # tokens per streamed response from the model, but it is required + # by tool_use tests, so we enforce it here by default also. + assert len(delta.tool_calls) < 2, ( + "Streaming should include only one tool call per update.") + for call_delta in delta.tool_calls: + assert call_delta.type == "function", ( + "Streaming tool calls should only emit function calls. Got " + f"{call_delta.type}") + current_tool_call = self.tool_calls[ + call_delta.index] if call_delta.index < len( + self.tool_calls) else None + if current_tool_call: + assert (not call_delta.function.name), ( + "Streaming tool calls should emit the full function name " + f"exactly once. Got {call_delta.function.name}") + assert (not call_delta.id), ( + "Streaming tool calls must emit function id only once. Got " + f"{call_delta.id}") + assert (call_delta.index == len(self.tool_calls) - 1), ( + f"Incorrect index for tool delta. Got {call_delta.index}, " + f"expected {len(self.tool_calls) - 1}") + current_tool_call.function.arguments += ( + call_delta.function.arguments) + else: + assert call_delta.id is not None, ( + "Streaming tool calls must have an id on first appearance") + assert call_delta.function.name is not None, ( + "Streaming tool calls must have a function name on first " + "appearance") + assert call_delta.index == len(self.tool_calls), ( + f"Incorrect index for tool delta. Got {call_delta.index}, " + f"expected {len(self.tool_calls)}") + self.tool_calls.append( + ToolCall(id=call_delta.id, + function=FunctionCall( + name=call_delta.function.name, + arguments=call_delta.function.arguments + or ""))) + + +def run_tool_extraction( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, + assert_one_tool_per_delta: bool = True, +) -> Tuple[Union[str, None], List[ToolCall]]: + if streaming: + reconstructor = run_tool_extraction_streaming( + tool_parser, + model_output, + request, + assert_one_tool_per_delta=assert_one_tool_per_delta) + return reconstructor.other_content or None, reconstructor.tool_calls + else: + extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, + request) + assert extracted.tools_called == bool(extracted.tool_calls) + return extracted.content, extracted.tool_calls + + +def run_tool_extraction_nonstreaming( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None +) -> ExtractedToolCallInformation: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return tool_parser.extract_tool_calls(model_output, request) + + +def run_tool_extraction_streaming( + tool_parser: ToolParser, + model_deltas: Iterable[str], + request: Union[ChatCompletionRequest, None] = None, + assert_one_tool_per_delta: bool = True, +) -> StreamingToolReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingToolReconstructor( + assert_one_tool_per_delta=assert_one_tool_per_delta) + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + tool_parser.vocab.get(token) + for token in tool_parser.model_tokenizer.tokenize(delta) + if token in tool_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = tool_parser.extract_tool_calls_streaming( + previous_text, current_text, delta, previous_tokens, + current_tokens, token_delta, request) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor diff --git a/vllm/tests/entrypoints/test_chat_utils.py b/vllm/tests/entrypoints/test_chat_utils.py index 5fa466f8f..996e60bfe 100644 --- a/vllm/tests/entrypoints/test_chat_utils.py +++ b/vllm/tests/entrypoints/test_chat_utils.py @@ -6,15 +6,24 @@ from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (parse_chat_messages, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, + parse_chat_messages, + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from ..utils import VLLM_PATH + +EXAMPLES_DIR = VLLM_PATH / "examples" + PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3" +QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" +LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" @pytest.fixture(scope="function") @@ -26,7 +35,6 @@ def phi3v_model_config(): trust_remote_code=True, dtype="bfloat16", seed=0, - chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -94,19 +102,24 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -121,19 +134,24 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -147,24 +165,29 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -181,24 +204,29 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -214,27 +242,31 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" - }] - }], phi3v_model_config, phi3v_tokenizer) - + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -249,26 +281,35 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to the other one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -285,34 +326,39 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "text", - "text": "What about this one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about this one?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [ { @@ -335,7 +381,6 @@ def test_parse_chat_messages_context_text_format( phi3v_model_config, phi3v_tokenizer, ): - phi3v_model_config.chat_template_text_format = "openai" conversation, mm_data = parse_chat_messages( [{ "role": "user", @@ -349,7 +394,11 @@ def test_parse_chat_messages_context_text_format( }, { "role": "user", "content": "What about this one?" - }], phi3v_model_config, phi3v_tokenizer) + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="openai", + ) assert conversation == [ { @@ -389,29 +438,34 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_rejects_too_many_images_across_messages( @@ -427,39 +481,44 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about these two?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_multiple_images_uncommon_input( @@ -467,17 +526,22 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - "What's in these images?", { - "image_url": image_url - }, { - "image_url": image_url - } - ] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -495,16 +559,21 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 1) assert conversation == [{ 'role': @@ -524,26 +593,31 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - { - "image_url": image_url - }, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - { - "image_url": image_url - }, - ] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 2) assert conversation == [{ 'role': @@ -626,6 +700,7 @@ def get_conversation(is_hf: bool): vllm_conversation, model_config, tokenizer_group, + content_format="openai", ) vllm_result = apply_hf_chat_template( @@ -636,3 +711,89 @@ def get_conversation(is_hf: bool): ) assert hf_result == vllm_result + + +# yapf: disable +@pytest.mark.parametrize( + ("model", "expected_format"), + [(PHI3V_MODEL_ID, "string"), + (QWEN2VL_MODEL_ID, "openai"), + (ULTRAVOX_MODEL_ID, "string"), + (MLLAMA_MODEL_ID, "openai"), + (LLAMA_GUARD_MODEL_ID, "openai")], +) +# yapf: enable +def test_resolve_content_format_hf_defined(model, expected_format): + tokenizer_group = TokenizerGroup( + model, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + chat_template = tokenizer.chat_template + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + None, # Test detecting the tokenizer's chat_template + "auto", + tokenizer, + ) + + assert resolved_format == expected_format + + +# yapf: disable +@pytest.mark.parametrize( + ("template_path", "expected_format"), + [("template_alpaca.jinja", "string"), + ("template_baichuan.jinja", "string"), + ("template_blip2.jinja", "string"), + ("template_chatglm.jinja", "string"), + ("template_chatglm2.jinja", "string"), + ("template_chatml.jinja", "string"), + ("template_falcon_180b.jinja", "string"), + ("template_falcon.jinja", "string"), + ("template_inkbot.jinja", "string"), + ("template_llava.jinja", "string"), + ("template_vlm2vec.jinja", "openai"), + ("tool_chat_template_granite_20b_fc.jinja", "string"), + ("tool_chat_template_hermes.jinja", "string"), + ("tool_chat_template_internlm2_tool.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "openai"), + ("tool_chat_template_llama3.2_json.jinja", "openai"), + ("tool_chat_template_mistral_parallel.jinja", "string"), + ("tool_chat_template_mistral.jinja", "string")], +) +# yapf: enable +def test_resolve_content_format_examples(template_path, expected_format): + tokenizer_group = TokenizerGroup( + PHI3V_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + dummy_tokenizer = tokenizer_group.tokenizer + dummy_tokenizer.chat_template = None + + chat_template = load_chat_template(EXAMPLES_DIR / template_path) + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + chat_template, + "auto", + dummy_tokenizer, + ) + + assert resolved_format == expected_format diff --git a/vllm/tests/kernels/test_activation.py b/vllm/tests/kernels/test_activation.py index 0e3d3c3a2..a84501f9c 100644 --- a/vllm/tests/kernels/test_activation.py +++ b/vllm/tests/kernels/test_activation.py @@ -8,13 +8,13 @@ from vllm.model_executor.layers.activation import (FastGELU, FatreluAndMul, GeluAndMul, NewGELU, QuickGELU, SiluAndMul) -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing -D = [512, 4096, 5120, 13824] # Arbitrary values for testing +D = [512, 13824] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) @@ -37,7 +37,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu": @@ -85,7 +85,7 @@ def test_activation( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/vllm/tests/kernels/test_attention.py b/vllm/tests/kernels/test_attention.py index 1604aa4d2..3e3c06681 100644 --- a/vllm/tests/kernels/test_attention.py +++ b/vllm/tests/kernels/test_attention.py @@ -7,7 +7,7 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import get_max_shared_memory_bytes, seed_everything +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol @@ -33,7 +33,7 @@ # FlashAttention forward only supports head dimension at most 128 # https://github.com/ROCmSoftwarePlatform/flash-attention/blob/3d2b6f5d037782cc2c906909a46fb7e2e1b48b25/csrc/flash_attn_rocm/flash_api.cpp#L62 -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [16, 32] USE_ALIBI = [False, True] @@ -144,7 +144,7 @@ def test_paged_attention( or (version == "rocm" and head_size not in (64, 128))): pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -382,7 +382,7 @@ def test_multi_query_kv_attention( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use diff --git a/vllm/tests/kernels/test_attention_selector.py b/vllm/tests/kernels/test_attention_selector.py index 3fe9ca0b0..d37f95d48 100644 --- a/vllm/tests/kernels/test_attention_selector.py +++ b/vllm/tests/kernels/test_attention_selector.py @@ -5,6 +5,7 @@ from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import which_attn_to_use +from vllm.platforms import cpu, cuda, openvino, rocm from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL @@ -19,31 +20,35 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.current_platform.is_cpu", - return_value=True): + with patch("vllm.attention.selector.current_platform", + cpu.CpuPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.current_platform.is_rocm", - return_value=True): + with patch("vllm.attention.selector.current_platform", + rocm.RocmPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": - with patch("vllm.attention.selector.current_platform.is_openvino", - return_value=True): + with patch("vllm.attention.selector.current_platform", + openvino.OpenVinoPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" else: - backend = which_attn_to_use(16, torch.float16, torch.float16, 16, - False) + with patch("vllm.attention.selector.current_platform", + cuda.CudaPlatform()): + backend = which_attn_to_use(16, torch.float16, torch.float16, 16, + False) assert backend.name == name def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # which_attn_to_use override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL) diff --git a/vllm/tests/kernels/test_awq_marlin.py b/vllm/tests/kernels/test_awq_marlin.py index 59917dd2c..238d6426b 100644 --- a/vllm/tests/kernels/test_awq_marlin.py +++ b/vllm/tests/kernels/test_awq_marlin.py @@ -14,13 +14,17 @@ awq_marlin_quantize) from vllm.scalar_type import scalar_types +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] +GROUP_SIZES = [-1, 32, 128] -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", GROUP_SIZES) @pytest.mark.skipif(not (ops.supports_moe_ops and hasattr(torch.ops._moe_C, "marlin_gemm_moe")), reason="Marlin is not supported on this GPU type.") diff --git a/vllm/tests/kernels/test_awq_triton.py b/vllm/tests/kernels/test_awq_triton.py index e95e5bd94..406a0c8dd 100644 --- a/vllm/tests/kernels/test_awq_triton.py +++ b/vllm/tests/kernels/test_awq_triton.py @@ -7,7 +7,7 @@ from vllm.model_executor.layers.quantization.awq_triton import ( AWQ_TRITON_SUPPORTED_GROUP_SIZES, awq_dequantize_triton, awq_gemm_triton) -from vllm.utils import seed_everything +from vllm.platforms import current_platform device = "cuda" @@ -80,7 +80,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - seed_everything(0) + current_platform.seed_everything(0) qweight = torch.randint(0, torch.iinfo(torch.int32).max, @@ -134,7 +134,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - seed_everything(0) + current_platform.seed_everything(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, diff --git a/vllm/tests/kernels/test_blocksparse_attention.py b/vllm/tests/kernels/test_blocksparse_attention.py index b65efb3ab..fad342d1b 100644 --- a/vllm/tests/kernels/test_blocksparse_attention.py +++ b/vllm/tests/kernels/test_blocksparse_attention.py @@ -8,7 +8,7 @@ from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) from vllm.platforms import current_platform -from vllm.utils import get_max_shared_memory_bytes, seed_everything +from vllm.utils import get_max_shared_memory_bytes from .allclose_default import get_default_atol, get_default_rtol @@ -25,10 +25,10 @@ DTYPES = [torch.half, torch.bfloat16] NUM_GEN_SEQS = [3] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing +NUM_HEADS = [(40, 40)] # Arbitrary values for testing HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16, 32] +BLOCK_SIZES = [16] USE_ALIBI = [False, True] KV_CACHE_DTYPE = ["auto", "fp8"] SEEDS = [0] @@ -37,7 +37,7 @@ BLOCKSPARSE_VERT_STRIDES = [8] BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [0, 2, -1] +BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] BLOCKSPARSE_HOMO_HEADS = [True, False] @@ -173,7 +173,7 @@ def test_paged_attention( blocksparse_block_size: int, blocksparse_head_sliding_step: int, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads @@ -384,7 +384,7 @@ def test_varlen_blocksparse_attention_prefill( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. # As the xformers library is already tested with its own tests, we can use diff --git a/vllm/tests/kernels/test_cache.py b/vllm/tests/kernels/test_cache.py index b0e7097fd..40550ed51 100644 --- a/vllm/tests/kernels/test_cache.py +++ b/vllm/tests/kernels/test_cache.py @@ -6,14 +6,14 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform COPYING_DIRECTION = [('cuda', 'cpu'), ('cuda', 'cuda'), ('cpu', 'cuda')] DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [42] # Arbitrary values for testing NUM_LAYERS = [1] # Arbitrary values for testing NUM_HEADS = [8] # Arbitrary values for testing -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 120, 256] BLOCK_SIZES = [8, 16, 32] # Arbitrary values for testing @@ -56,7 +56,7 @@ def test_copy_blocks( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Generate random block mappings where each source block is mapped to two # destination blocks. @@ -132,7 +132,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. num_slots = block_size * num_blocks @@ -224,7 +224,7 @@ def test_reshape_and_cache_flash( device: str, kv_cache_dtype: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) # Create a random slot mapping. @@ -258,19 +258,20 @@ def test_reshape_and_cache_flash( del key_caches del value_caches + k_scale = key.amax().item() / 256 + v_scale = value.amax().item() / 256 + # Clone the KV caches. if kv_cache_dtype == "fp8": cloned_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(cloned_key_cache, key_cache) + ops.convert_fp8(cloned_key_cache, key_cache, k_scale, kv_cache_dtype) cloned_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(cloned_value_cache, value_cache) + ops.convert_fp8(cloned_value_cache, value_cache, v_scale, + kv_cache_dtype) else: cloned_key_cache = key_cache.clone() cloned_value_cache = value_cache.clone() - # Using default kv_scale - k_scale = v_scale = 1.0 - # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache_flash, (key, value, key_cache, value_cache, slot_mapping, kv_cache_dtype, @@ -281,9 +282,15 @@ def test_reshape_and_cache_flash( if kv_cache_dtype == "fp8": result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) - ops.convert_fp8(result_key_cache, key_cache) + ops.convert_fp8(result_key_cache, + key_cache, + k_scale, + kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) - ops.convert_fp8(result_value_cache, value_cache) + ops.convert_fp8(result_value_cache, + value_cache, + v_scale, + kv_dtype=kv_cache_dtype) # Run the reference implementation. block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") @@ -339,7 +346,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - seed_everything(seed) + current_platform.seed_everything(seed) src_device = device if direction[0] == "cuda" else 'cpu' dst_device = device if direction[1] == "cuda" else 'cpu' @@ -408,7 +415,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) low = -224.0 high = 224.0 diff --git a/vllm/tests/kernels/test_causal_conv1d.py b/vllm/tests/kernels/test_causal_conv1d.py index 277d7e497..f9b110182 100644 --- a/vllm/tests/kernels/test_causal_conv1d.py +++ b/vllm/tests/kernels/test_causal_conv1d.py @@ -9,7 +9,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.causal_conv1d import ( causal_conv1d_fn, causal_conv1d_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def causal_conv1d_ref( @@ -70,7 +70,7 @@ def causal_conv1d_update_ref(x, bias: (dim,) cache_seqlens: (batch,), dtype int32. If not None, the conv_state is treated as a circular buffer. - The conv_state will be updated by copying x to the + The conv_state will be updated by copying x to the conv_state starting at the index @cache_seqlens % state_len before performing the convolution. @@ -151,7 +151,7 @@ def causal_conv1d_opcheck_fn(x: torch.Tensor, @pytest.mark.parametrize("has_bias", [True]) @pytest.mark.parametrize("width", [4]) @pytest.mark.parametrize( - 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 2048, 4096]) + 'seqlen', [1, 8, 16, 32, 64, 128, 256, 512, 784, 1024, 1025, 2048, 4096]) @pytest.mark.parametrize('dim', [64]) @pytest.mark.parametrize('batch', [1]) def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, @@ -161,7 +161,7 @@ def test_causal_conv1d(batch, dim, seqlen, width, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) x = torch.randn(batch, dim, seqlen, device=device, dtype=itype).contiguous() @@ -223,7 +223,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x_ref = x.clone() @@ -270,7 +270,7 @@ def test_causal_conv1d_update_with_batch_gather(with_padding, dim, width, rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 3 padding = 5 if with_padding else 0 @@ -343,7 +343,7 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - seed_everything(0) + current_platform.seed_everything(0) seqlens = [] batch_size = 4 if seqlen < 10: @@ -420,7 +420,10 @@ def test_causal_conv1d_varlen(with_padding, dim, seqlen, width, has_bias, unpadded_out = out[:, :out_ref_tensor.shape[-1]] assert torch.allclose(unpadded_out, out_ref_tensor, rtol=rtol, atol=atol) - assert torch.allclose(final_states, final_states_ref, rtol=rtol, atol=atol) + assert torch.allclose(final_states[state_indices], + final_states_ref[state_indices], + rtol=rtol, + atol=atol) causal_conv1d_opcheck_fn(x.squeeze(0), weight, bias, cumsum.cuda(), padded_state_indices, has_initial_states, diff --git a/vllm/tests/kernels/test_cutlass.py b/vllm/tests/kernels/test_cutlass.py index 993e67e82..afe537973 100644 --- a/vllm/tests/kernels/test_cutlass.py +++ b/vllm/tests/kernels/test_cutlass.py @@ -11,6 +11,28 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 496), + (16, 256, 496), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 496), + (64, 16384, 1024), + (100, 8192, 496), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] @@ -116,9 +138,7 @@ def cutlass_int8_gemm_helper(m: int, (out, a, b, scale_a, scale_b, bias)) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 100, 33]) -@pytest.mark.parametrize("n", [2048, 4096, 8192, 16384, 24576, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) @@ -129,9 +149,7 @@ def test_cutlass_fp8_gemm(m: int, n: int, k: int, per_act_token: bool, cutlass_fp8_gemm_helper(m, n, k, per_act_token, per_out_ch, use_bias) -@pytest.mark.parametrize("m", [1, 16, 32, 64, 128, 256, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 8192, 16384, 256, 1024]) -@pytest.mark.parametrize("k", [128, 496, 1024]) +@pytest.mark.parametrize("m,n,k", MNK_FACTORS) @pytest.mark.parametrize("per_act_token", [True, False]) @pytest.mark.parametrize("per_out_ch", [True, False]) @pytest.mark.parametrize("use_bias", [True, False]) diff --git a/vllm/tests/kernels/test_encoder_decoder_attn.py b/vllm/tests/kernels/test_encoder_decoder_attn.py index bc99c5559..d943b048b 100644 --- a/vllm/tests/kernels/test_encoder_decoder_attn.py +++ b/vllm/tests/kernels/test_encoder_decoder_attn.py @@ -16,13 +16,14 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.forward_context import set_forward_context from vllm.platforms import current_platform # List of support backends for encoder/decoder models -LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] - +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] HEAD_SIZES = [64, 256] NUM_HEADS = [1, 16] @@ -145,7 +146,8 @@ class that Attention will automatically select when it is constructed. test_pt.num_heads, test_pt.head_size, test_pt.block_size, - device=CUDA_DEVICE) + device=CUDA_DEVICE, + backend=test_pt.backend_name) return TestResources(scale, attn_backend, attn, kv_cache) @@ -592,6 +594,8 @@ def _run_encoder_attention_test( attn: Attention, encoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run encoder attention. @@ -610,6 +614,8 @@ def _run_encoder_attention_test( (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed {query,key,value} and @@ -619,20 +625,32 @@ def _run_encoder_attention_test( attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata, vllm_config): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), + attn_metadata, + attn_type=attn_type) def _run_decoder_self_attention_test( test_rsrcs: TestResources, decoder_test_params: PhaseTestParameters, attn_metadata: AttentionMetadata, + test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run decoder self-attention test. @@ -650,6 +668,8 @@ def _run_decoder_self_attention_test( query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -660,12 +680,22 @@ def _run_decoder_self_attention_test( kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None - return attn.forward(packed_qkv.query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata, vllm_config): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + packed_qkv.key, + packed_qkv.value, + kv_cache, + attn_metadata, + attn_type=attn_type) def _run_encoder_decoder_cross_attention_test( @@ -673,6 +703,8 @@ def _run_encoder_decoder_cross_attention_test( decoder_test_params: PhaseTestParameters, cross_test_params: Optional[PhaseTestParameters], attn_metadata: AttentionMetadata, + test_pt: TestPoint, + vllm_config: VllmConfig, ) -> torch.Tensor: ''' Run encoder/decoder cross-attention test. @@ -701,6 +733,8 @@ def _run_encoder_decoder_cross_attention_test( (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention + * test_pt: The TestPoint object containing test details like number of + model heads, head size, name of the backend being used etc. Returns: * Attention.forward() applied to packed_{query,key,value}, kv_cache @@ -718,12 +752,37 @@ def _run_encoder_decoder_cross_attention_test( cross_pckd_qkv = cross_test_params.packed_qkvo.packed_qkv key = (None if cross_pckd_qkv is None else cross_pckd_qkv.key) value = (None if cross_pckd_qkv is None else cross_pckd_qkv.value) - return attn.forward(decoder_test_params.packed_qkvo.packed_qkv.query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + with set_forward_context(attn_metadata, vllm_config): + # In the test setup the shape of the query is + # [batch_size, seq_len, num_heads, head_size]. However + # the attention backend expect the shape to be + # [num_tokens, hidden_size]. Hence reshape the query before + # invoking the forward method. + # TODO - Update the way we construct the query so that it + # is shaped as [num_tokens, hidden_size] and we can skip the reshape. + reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( + -1, test_pt.num_heads * test_pt.head_size) + return attn.forward(reshaped_query, + key, + value, + kv_cache, + attn_metadata, + attn_type=attn_type) + + +@pytest.fixture(autouse=True) +def set_reset_environment(attn_backend): + # Set the default torch datatype to bfloat16 to enable + # testing of the Flash Attention backend. Also clear the + # cached value of the backend. + default_dtype = torch.get_default_dtype() + if attn_backend.name == 'FLASH_ATTN': + torch.set_default_dtype(torch.bfloat16) + _cached_get_attn_backend.cache_clear() + yield + # Reset the torch datatype to what it was before the test + # so as not to impact the remaining tests. + torch.set_default_dtype(default_dtype) @pytest.mark.skipif(current_platform.is_rocm(), @@ -773,10 +832,8 @@ def test_encoder_only( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -786,7 +843,9 @@ def test_encoder_only( # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init - test_rsrcs = _make_test_resources(test_pt) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + test_rsrcs = _make_test_resources(test_pt) # Construct encoder attention test params (only used # during prefill) @@ -807,10 +866,15 @@ def test_encoder_only( # PREFILL: encoder attention enc_pckd_act_out: torch.Tensor = (_run_encoder_attention_test( - test_rsrcs.attn, enc_test_params, prephase_attn_metadata)) + test_rsrcs.attn, + enc_test_params, + prephase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config)) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) @pytest.mark.skipif(current_platform.is_rocm(), @@ -892,10 +956,8 @@ def test_e2e_enc_dec_attn( * max_dec_seq_len: max length of decoder input sequences * max_enc_seq_len: max length of encoder input sequences ''' - # Force Attention wrapper backend with global_force_attn_backend_context_manager(attn_backend): - # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test @@ -905,7 +967,9 @@ def test_e2e_enc_dec_attn( # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init - test_rsrcs = _make_test_resources(test_pt) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + test_rsrcs = _make_test_resources(test_pt) # Construct encoder attention test params (only used # during prefill) @@ -955,29 +1019,42 @@ def test_e2e_enc_dec_attn( enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, enc_test_params, - prephase_attn_metadata) + prephase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config) # - Is encoder attention result correct? - assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) + assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out, + attn_backend.name) # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config) # - Is prefill decoder self-attention correct? assert_actual_matches_ideal(prephase_dec_test_params, - prephase_dec_pckd_act_out) + prephase_dec_pckd_act_out, + attn_backend.name) # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, - prephase_attn_metadata) + test_rsrcs, + prephase_dec_test_params, + prephase_cross_test_params, + prephase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? assert_actual_matches_ideal(prephase_cross_test_params, - prephase_cross_pckd_act_out) + prephase_cross_pckd_act_out, + attn_backend.name) # DECODE: build decode-phase attention metadata @@ -993,17 +1070,28 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, decphase_dec_test_params, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + decphase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? assert_actual_matches_ideal(decphase_dec_test_params, - decphase_dec_pckd_act_out) + decphase_dec_pckd_act_out, + attn_backend.name) # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata) + test_rsrcs, + decphase_dec_test_params, + None, + decphase_attn_metadata, + test_pt=test_pt, + vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? assert_actual_matches_ideal(decphase_cross_test_params, - decphase_cross_pckd_act_out) + decphase_cross_pckd_act_out, + attn_backend.name) diff --git a/vllm/tests/kernels/test_flash_attn.py b/vllm/tests/kernels/test_flash_attn.py index 35c29c5bd..1ae78d7b4 100644 --- a/vllm/tests/kernels/test_flash_attn.py +++ b/vllm/tests/kernels/test_flash_attn.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform from vllm.vllm_flash_attn import (flash_attn_varlen_func, flash_attn_with_kvcache) @@ -71,6 +71,7 @@ def ref_paged_attn( return torch.cat(outputs, dim=0) +@pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize("kv_lens", [[1328, 18, 463], [1, 54, 293, 70]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -81,6 +82,7 @@ def ref_paged_attn( @pytest.mark.parametrize("sliding_window", [None, 256]) @torch.inference_mode() def test_flash_attn_with_paged_kv( + use_out: bool, kv_lens: List[int], num_heads: Tuple[int, int], head_size: int, @@ -91,7 +93,7 @@ def test_flash_attn_with_paged_kv( sliding_window: Optional[int], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -116,17 +118,22 @@ def test_flash_attn_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + q = query.unsqueeze(1) + out = torch.empty_like(q) if use_out else None output = flash_attn_with_kvcache( - q=query.unsqueeze(1), + q=q, k_cache=key_cache, v_cache=value_cache, + out=out, softmax_scale=scale, causal=True, block_table=block_tables, cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, - ).squeeze(1) + ) + output = output if not use_out else out + output = output.squeeze(1) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -141,7 +148,10 @@ def test_flash_attn_with_paged_kv( f"{torch.max(torch.abs(output - ref_output))}" -@pytest.mark.parametrize("seq_lens", [[(1, 1328), (5, 18), (129, 463)]]) +@pytest.mark.parametrize("use_out", [True, False]) +@pytest.mark.parametrize("seq_lens", + [[(1, 1328), (5, 18), + (129, 463)], [(1, 523), (1, 37), (1, 2011)]]) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("block_size", BLOCK_SIZES) @@ -151,6 +161,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @torch.inference_mode() def test_varlen_with_paged_kv( + use_out: bool, seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int], head_size: int, @@ -161,7 +172,7 @@ def test_varlen_with_paged_kv( num_blocks: int, ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -197,10 +208,12 @@ def test_varlen_with_paged_kv( (num_seqs, max_num_blocks_per_seq), dtype=torch.int32) + out = torch.empty_like(query) if use_out else None output = flash_attn_varlen_func( q=query, k=key_cache, v=value_cache, + out=out, cu_seqlens_q=cu_query_lens, cu_seqlens_k=cu_kv_lens, max_seqlen_q=max_query_len, @@ -211,6 +224,7 @@ def test_varlen_with_paged_kv( block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, ) + output = output if not use_out else out ref_output = ref_paged_attn( query=query, diff --git a/vllm/tests/kernels/test_flashinfer.py b/vllm/tests/kernels/test_flashinfer.py index 80a388db6..a2c8f7166 100644 --- a/vllm/tests/kernels/test_flashinfer.py +++ b/vllm/tests/kernels/test_flashinfer.py @@ -4,7 +4,7 @@ import pytest import torch -from vllm.utils import seed_everything +from vllm.platforms import current_platform NUM_HEADS = [(16, 16), (32, 8), (64, 8), (6, 1)] HEAD_SIZES = [128, 256] @@ -84,7 +84,7 @@ def test_flashinfer_decode_with_paged_kv( soft_cap: Optional[float], ) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -170,7 +170,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -268,7 +268,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( head_size: int, dtype: torch.dtype, block_size: int, soft_cap: Optional[float]) -> None: torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -381,7 +381,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - seed_everything(0) + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/vllm/tests/kernels/test_fp8_quant.py b/vllm/tests/kernels/test_fp8_quant.py index c18f5f468..ebaaae232 100644 --- a/vllm/tests/kernels/test_fp8_quant.py +++ b/vllm/tests/kernels/test_fp8_quant.py @@ -6,7 +6,7 @@ ref_dynamic_per_tensor_fp8_quant, ref_dynamic_per_token_quant) from tests.kernels.utils import opcheck -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] HIDDEN_SIZES = [1, 2, 3, 4, 16, 67, 768, 2048, 5120, 5137, 8192, @@ -46,7 +46,7 @@ def opcheck_fp8_quant(output, def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 # avoid nans @@ -76,7 +76,7 @@ def test_dynamic_per_token_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -95,7 +95,7 @@ def test_dynamic_per_tensor_fp8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/vllm/tests/kernels/test_gguf.py b/vllm/tests/kernels/test_gguf.py index 1513fc196..893af99ba 100644 --- a/vllm/tests/kernels/test_gguf.py +++ b/vllm/tests/kernels/test_gguf.py @@ -7,7 +7,7 @@ from huggingface_hub import snapshot_download import vllm._custom_ops as ops -from vllm.utils import seed_everything +from vllm.platforms import current_platform GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") @@ -75,7 +75,7 @@ def test_dequantize(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -111,7 +111,7 @@ def test_mmvq(hidden_size: int, dtype: torch.dtype, @torch.inference_mode() def test_mmq(num_tokens: int, hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - seed_everything(0) + current_platform.seed_everything(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") diff --git a/vllm/tests/kernels/test_int8_quant.py b/vllm/tests/kernels/test_int8_quant.py index 41e103e1d..761eb95c4 100644 --- a/vllm/tests/kernels/test_int8_quant.py +++ b/vllm/tests/kernels/test_int8_quant.py @@ -4,14 +4,13 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] -HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192, - 8193] # Arbitrary values for testing +HIDDEN_SIZES = [16, 67, 768, 5137, 8193] # Arbitrary values for testing NUM_TOKENS = [1, 7, 83, 4096] # Arbitrary values for testing SEEDS = [0] -SCALE = [0.1, 0.5, 0.8, 1.2, 2.1] +SCALE = [0.1, 2.1] def opcheck_int8_quant_static(output, input, scale, azp=None): @@ -45,7 +44,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): @torch.inference_mode() def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -68,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int, @torch.inference_mode() def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, @@ -87,10 +86,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, assert torch_out.min() >= int8_traits.min and torch_out.max( ) <= int8_traits.max - ops_out = torch.empty_like(x, dtype=torch.int8) - scales_out = torch.empty_like(scales, dtype=torch.float32) - azp_out = torch.empty_like(azps, dtype=torch.int32) - torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out, azp_out) + ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False) if (not torch.allclose(scales_out, scales)): print(torch.argmax(torch.abs(scales_out - scales))) @@ -112,7 +108,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -120,7 +116,8 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, out1 = (x / scale_arg).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2, _, _ = scaled_int8_quant(x, scale_arg) + out2, scale2, _ = scaled_int8_quant(x, scale_arg) + assert scale2 is scale_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -132,13 +129,13 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("scale", SCALE[2:]) # Reduce test time +@pytest.mark.parametrize("scale", SCALE) @pytest.mark.parametrize("azp", [-255, 54]) @torch.inference_mode() def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float, azp: int) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, @@ -146,11 +143,15 @@ def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, out1 = ((x / scale).round() + azp).clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2 = torch.empty_like(x, dtype=torch.int8) scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda") azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda") - torch.ops._C.static_scaled_int8_quant(out2, x, scale_arg, azp_arg) + out2, scale2, azp2 = scaled_int8_quant(x, + scale_arg, + azp_arg, + symmetric=False) + assert scale2 is scale_arg + assert azp2 is azp_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -185,6 +186,5 @@ def test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None: val_i8 = int8_traits.max if is_max else int8_traits.min expected = torch.full((1, 5), val_i8, dtype=torch.int8, device="cuda") - out = torch.empty_like(expected) - torch.ops._C.static_scaled_int8_quant(out, x, scale, azp) + out, _, _ = scaled_int8_quant(x, scale, azp, symmetric=False) torch.testing.assert_close(expected, out, atol=0, rtol=0) diff --git a/vllm/tests/kernels/test_layernorm.py b/vllm/tests/kernels/test_layernorm.py index 382079d47..727769e07 100644 --- a/vllm/tests/kernels/test_layernorm.py +++ b/vllm/tests/kernels/test_layernorm.py @@ -1,13 +1,14 @@ import pytest import torch +from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.utils import seed_everything +from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, +HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] @@ -31,7 +32,7 @@ def test_rms_norm( seed: int, device: str, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) @@ -59,3 +60,75 @@ def test_rms_norm( else: opcheck(torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)) + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_fused_rms_norm_quant( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + quant_scale: float, + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x *= scale + if add_residual: + residual = torch.randn_like(x) * scale + residual_fused = residual.clone() + else: + residual = residual_fused = None + + out_norm = torch.empty_like(x) + out_quant = torch.empty_like(x, dtype=FP8_DTYPE) + out_quant_fused = torch.empty_like(out_quant) + + quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32) + + if add_residual: + torch.ops._C.fused_add_rms_norm_static_fp8_quant( + out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6) + + # Unfused kernel is in-place so it goes second + # Also use a separate clone of x to avoid modifying the input + x_unfused = x.clone() + torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, + quant_scale_t) + + torch.cuda.synchronize() + torch.testing.assert_close(residual_fused, + residual, + atol=1e-2, + rtol=1e-2) + + opcheck( + torch.ops._C.fused_add_rms_norm_static_fp8_quant, + (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)) + else: + torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight, + quant_scale_t, 1e-6) + + torch.ops._C.rms_norm(out_norm, x, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, + quant_scale_t) + + opcheck(torch.ops._C.rms_norm_static_fp8_quant, + (out_quant_fused, x, weight, quant_scale_t, 1e-6)) + + torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32), + out_quant.to(dtype=torch.float32), + atol=1e-3, + rtol=1e-3) diff --git a/vllm/tests/kernels/test_machete_gemm.py b/vllm/tests/kernels/test_machete_gemm.py deleted file mode 100644 index 59c0a2475..000000000 --- a/vllm/tests/kernels/test_machete_gemm.py +++ /dev/null @@ -1,284 +0,0 @@ -"""Tests for the machete kernel. - -Run `pytest tests/kernels/test_machete_gemm.py`. -""" - -import math -from typing import Optional, Tuple - -import pytest -import torch - -from tests.kernels.utils import opcheck -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - pack_rows, quantize_weights) -from vllm.platforms import current_platform -from vllm.scalar_type import ScalarType, scalar_types - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -MNK_SHAPES = [ - (1, 128, 128), - (1, 512, 1024), - (1, 4096, 4096), - (1, 8192, 28672), - (13, 8192, 4096), - (26, 4096, 8192), - (64, 4096, 4096), - (64, 8192, 28672), - (257, 128, 4096), - (257, 4224, 4160), - (257, 4096, 4096), - (1024, 4096, 8192), - (1024, 8192, 4096), -] - -ACT_TYPES = [torch.float16, torch.bfloat16] -WTYPE_ZEROPOINTS = [ - # GPTQ style - (scalar_types.uint4b8, False), - (scalar_types.uint8b128, False), - # AWQ style - (scalar_types.uint4, True), - (scalar_types.uint8, True), -] - -# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel -# unit tests to a common utility function. Currently the use of -# `is_quant_method_supported` conflates kernels with quantization methods -# an assumption which is breaking down as quantizations methods can have -# have kernels and some kernels support multiple quantization methods. -IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90) - - -def rand_data(shape, dtype=torch.float16): - return 10 * (torch.rand(shape, dtype=dtype, device="cuda") - 0.3) - - -def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): - return zps if zps is None else -1 * s * (zps.to(s.dtype)) - - -def machete_quantize_and_pack(w: torch.Tensor, - wtype: ScalarType, - group_size: int, - zero_points: bool = False): - assert wtype.is_integer(), "TODO: support floating point weights" - - w_ref, w_q, w_s, w_zp = quantize_weights( - w, - wtype, - group_size, - zero_points=zero_points, - # to match how the kernel applies zps - ref_zero_points_after_scales=True) - - w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) - w_q = w_q.t().contiguous().t() # convert to col major - w_q_machete = ops.machete_prepack_B(w_q, wtype) - - opcheck(torch.ops._C.machete_prepack_B, (w_q, wtype.id)) - - return w_ref, w_q_machete, w_s, w_zp - - -def machete_gemm_test_helper(a: torch.Tensor, b: torch.Tensor, - wtype: ScalarType, group_size: int, - zero_points: bool): - w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - b, wtype, group_size, zero_points) - - output_ref = torch.matmul(a, w_ref) - - output = ops.machete_gemm( - a=a, - b_q=w_q_packed, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - ) - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(a.shape[1]), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) -@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x)) -@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS) -@pytest.mark.parametrize("group_size", [128, None]) -def test_machete_all_schedules(shape, atype: torch.dtype, - wtype_zeropoints: Tuple[ScalarType, bool], - group_size: Optional[int]): - m, n, k = shape - wtype, zero_points = wtype_zeropoints - - if group_size is not None and k % group_size != 0: - return - - print(f"MNK = {m} {n} {k}") - - # Normalize group_size - if group_size is None: - group_size = k - assert group_size <= k - - a = rand_data((m, k), atype) - w = rand_data((k, n), atype) - - w_ref, w_q_machete, w_s, w_zp = machete_quantize_and_pack( - w, wtype, group_size, zero_points) - - output_ref = torch.matmul(a, w_ref) - - for schedule in ops.machete_supported_schedules(wtype): - print(f"Testing schedule {schedule}") - output = ops.machete_gemm( - a, - b_q=w_q_machete, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - schedule=schedule, - ) - - opcheck( - torch.ops._C.machete_gemm, - (a, w_q_machete, wtype.id, w_s, maybe_convert_zeropoints( - w_zp, w_s), group_size, None, None, None, schedule)) - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol),\ - f"Schedule failed {schedule}" - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("shape", - MNK_SHAPES, - ids=lambda x: "x".join(str(v) for v in x)) -@pytest.mark.parametrize("atype", ACT_TYPES, ids=lambda x: str(x)) -@pytest.mark.parametrize("wtype_zeropoints", WTYPE_ZEROPOINTS) -@pytest.mark.parametrize("group_size", [128, None]) -def test_machete_heuristic(shape, atype: torch.dtype, - wtype_zeropoints: Tuple[ScalarType, bool], - group_size: Optional[int]): - m, n, k = shape - wtype, zero_points = wtype_zeropoints - - if group_size is not None and k % group_size != 0: - return - - # Normalize group_size - if group_size is None: - group_size = k - assert group_size <= k - - a = rand_data((m, k), atype) - b = rand_data((k, n), atype) - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test working on other devices -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -@pytest.mark.parametrize("device", CUDA_DEVICES) -def test_machete_devices(device: str): - m, n, k = 512, 4096, 4096 - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - print(f"MNK = {m} {n} {k}, device = {device}") - - a = rand_data((m, k), torch.float16).to(device) - b = rand_data((k, n), torch.float16).to(device) - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test working with a subset of A and B -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -def test_machete_subset(): - big_m, big_n, big_k = 1024, 1024, 1024 - m, n, k = 512, 512, 512 - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - whole_a = rand_data((big_m, big_k), torch.float16) - whole_b = rand_data((big_k, big_n), torch.float16) - - a = whole_a[0:m, 0:k] - b = whole_b[0:k, 0:n] - - machete_gemm_test_helper(a, b, wtype, group_size, zero_points) - - -# Test to make sure cuda graphs work -class MacheteLayer(torch.nn.Module): - - def __init__(self, **kwargs): - super().__init__() - self.kwargs = kwargs - - def forward(self, a): - return ops.machete_gemm(**self.kwargs) - - -@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, - reason="Machete is not supported on this GPU type.") -def test_machete_cuda_graph(): - m, n, k = 512, 4096, 4096 - - a = rand_data((m, k), torch.float16) - b = rand_data((k, n), torch.float16) - wtype = scalar_types.uint4b8 - group_size = 128 - zero_points = False - - w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( - b, wtype, group_size, zero_points) - - # Construct a trivial model with a single layer that calls a machete kernel - model = MacheteLayer( - a=a, - b_q=w_q_packed, - b_type=wtype, - b_scales=w_s, - b_zeros=maybe_convert_zeropoints(w_zp, w_s), - b_group_size=group_size, - ) - - output_ref = torch.matmul(a, w_ref) - - # Run the model with a cuda graph - stream = torch.cuda.Stream() - with torch.cuda.stream(stream): - g = torch.cuda.CUDAGraph() - with torch.cuda.graph(g): - output = model(a) - output.zero_() - g.replay() - - # Relax atol as our reduction dim becomes larger (more rounding error) - # Relax atol when we have zeropoints since the way machete applies - # zeropoints (after scales) causes noise around 0 - atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) - torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) diff --git a/vllm/tests/kernels/test_machete_mm.py b/vllm/tests/kernels/test_machete_mm.py new file mode 100644 index 000000000..1c6eb2dd9 --- /dev/null +++ b/vllm/tests/kernels/test_machete_mm.py @@ -0,0 +1,406 @@ +"""Tests for the machete kernel. + +Run `pytest tests/kernels/test_machete_mm.py`. +""" + +import math +from dataclasses import dataclass, fields +from typing import List, Optional, Tuple + +import pytest +import torch + +from tests.kernels.utils import opcheck +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, quantize_weights) +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel +# unit tests to a common utility function. Currently the use of +# `is_quant_method_supported` conflates kernels with quantization methods +# an assumption which is breaking down as quantizations methods can have +# have kernels and some kernels support multiple quantization methods. +IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 + +MNK_SHAPES = [ + (1, 128, 128), + (1, 512, 1024), + (1, 4096, 4096), + (1, 8192, 28672), + (13, 8192, 4096), + (26, 4096, 8192), + (64, 4096, 4096), + (64, 8192, 28672), + (257, 128, 4096), + (257, 4224, 4160), + (257, 4096, 4096), + (1024, 4096, 8192), + (1024, 8192, 4096), +] + +GROUP_SIZES_TO_TEST: List[Optional[int]] = [128, -1] + + +@dataclass +class TypeConfig: + act_type: torch.dtype + weight_type: ScalarType + output_type: Optional[torch.dtype] + group_scale_type: Optional[torch.dtype] + group_zero_type: Optional[torch.dtype] + channel_scale_type: Optional[torch.dtype] + token_scale_type: Optional[torch.dtype] + + +@dataclass +class Tensors: + w_ref: torch.Tensor + a_ref: torch.Tensor + a: torch.Tensor + w_q: torch.Tensor + w_g_s: Optional[torch.Tensor] + w_g_zp: Optional[torch.Tensor] + w_ch_s: Optional[torch.Tensor] + w_tok_s: Optional[torch.Tensor] + + +# (Act Type, Weight Type, Output Type, Scale Type, ZeroPoints, +# Ch Scales Type, Tok Scales Type) +# NOTE: None "Scale Type" means the act type is floating point +# None "Output Type" means the output type is the same as the act type +TestTypeTuple = Tuple[List[torch.dtype], ScalarType, Optional[torch.dtype], + Optional[torch.dtype], bool] +TEST_TYPES = [ + # GPTQ style + *(TypeConfig(act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + for w_type in [scalar_types.uint4b8, scalar_types.uint8b128] + for a_type in [torch.float16, torch.bfloat16]), + # AWQ style + *(TypeConfig(act_type=a_type, + weight_type=w_type, + output_type=None, + group_scale_type=a_type, + group_zero_type=a_type, + channel_scale_type=None, + token_scale_type=None) + for w_type in [scalar_types.uint4, scalar_types.uint8] + for a_type in [torch.float16, torch.bfloat16]), + # QQQ style + *(TypeConfig(act_type=torch.int8, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float) + for group_scale_type in [None, torch.float16]), + *(TypeConfig(act_type=torch.float8_e4m3fn, + weight_type=scalar_types.uint4b8, + output_type=torch.float16, + group_scale_type=group_scale_type, + group_zero_type=None, + channel_scale_type=torch.float, + token_scale_type=torch.float) + for group_scale_type in [None, torch.float16]), +] + +# TODO: in future PR refactor this and `is_quant_method_supported` in the kernel +# unit tests to a common utility function. Currently the use of +# `is_quant_method_supported` conflates kernels with quantization methods +# an assumption which is breaking down as quantizations methods can have +# have kernels and some kernels support multiple quantization methods. +IS_SUPPORTED_BY_GPU = current_platform.has_device_capability(90) + + +def rand_data(shape, dtype=torch.float16, scale=1, offset=0): + if dtype.is_floating_point: + return (scale * torch.rand(shape, device="cuda") - offset).to(dtype) + else: + return torch.randint(-8, 7, shape, dtype=dtype, device="cuda") + + +def maybe_convert_zeropoints(zps: Optional[torch.Tensor], s: torch.Tensor): + return zps if zps is None else -1 * s * (zps.to(s.dtype)) + + +def group_size_valid(shape: Tuple[int, int, int], + group_size: Optional[int]) -> bool: + return group_size is None or group_size == -1 or group_size % shape[2] == 0 + + +def machete_quantize_and_pack(atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: Optional[torch.dtype], + group_size: Optional[int], + zero_points: bool = False): + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, + wtype, + group_size=group_size, + zero_points=zero_points, + # to match how the kernel applies zps + ref_zero_points_after_scales=True) + + w_q = pack_rows(w_q, wtype.size_bits, *w_q.shape) + w_q = w_q.t().contiguous().t() # convert to col major + + w_q_machete = ops.machete_prepack_B(w_q, atype, wtype, stype) + opcheck(torch.ops._C.machete_prepack_B, (w_q, atype, wtype.id, stype)) + + return w_ref, w_q_machete, w_s, w_zp + + +def create_test_tensors(shape: Tuple[int, int, int], + types: TypeConfig, + group_size: Optional[int], + subset_stride_factor: Optional[int] = None) -> Tensors: + m, n, k = shape + factor = subset_stride_factor or 1 + + print("create_test_tensors, shape:", shape, "types:", types, "group_size:", + group_size) + + a = rand_data((m * factor, k * factor), types.act_type, scale=3, offset=2) + w = rand_data((k * factor, n * factor), types.act_type, scale=3, offset=1) + + if factor > 1: + a = a[0:m, 0:k] + w = w[0:k, 0:n] + + if types.group_scale_type is not None: + w = w.to(types.group_scale_type) + if w.dtype.itemsize == 1: + w = w.to(torch.float16) + + w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( + a.dtype, w, types.weight_type, types.group_scale_type, group_size, + types.group_zero_type is not None) + + if not a.dtype.is_floating_point: + aiinfo = torch.iinfo(a.dtype) + w_ref = w_ref.round().clamp(aiinfo.min, aiinfo.max) + + a_ref = a.to(torch.float32) + w_ref = w_ref.to(torch.float32) + + w_ch_s = None if types.channel_scale_type is None else\ + rand_data((n,), types.channel_scale_type) + w_tok_s = None if types.token_scale_type is None else\ + rand_data((m,), types.token_scale_type) + + return Tensors(w_ref=w_ref, + a_ref=a_ref, + a=a, + w_q=w_q_packed, + w_g_s=w_s, + w_g_zp=maybe_convert_zeropoints(w_zp, w_s), + w_ch_s=w_ch_s, + w_tok_s=w_tok_s) + + +# None stype means scales use the same dtype as a +def machete_mm_test_helper(types: TypeConfig, + tensors: Tensors, + group_size: Optional[int] = None, + schedule: Optional[str] = None): + output_ref = torch.matmul(tensors.a_ref, tensors.w_ref) + output_ref_type = output_ref.dtype + + if tensors.w_ch_s is not None: + output_ref = (output_ref.to(tensors.w_ch_s.dtype) * + tensors.w_ch_s.unsqueeze(0)).to(output_ref_type) + if tensors.w_tok_s is not None: + output_ref = (output_ref.to(tensors.w_tok_s.dtype) * + tensors.w_tok_s.unsqueeze(1)).to(output_ref_type) + + output = ops.machete_mm( + a=tensors.a, + b_q=tensors.w_q, + b_type=types.weight_type, + b_group_scales=tensors.w_g_s, + b_group_zeros=tensors.w_g_zp, + b_group_size=group_size, + b_channel_scales=tensors.w_ch_s, + a_token_scales=tensors.w_tok_s, + out_type=types.output_type, + schedule=schedule, + ) + + print(output) + print(output_ref) + + # Relax atol as our reduction dim becomes larger (more rounding error) + # Relax atol when we have zeropoints since the way machete applies + # zeropoints (after scales) causes noise around 0 + atol = 1 if tensors.w_g_zp is not None\ + else min(5e-2 * math.sqrt(tensors.a.shape[1]), 1) + rtol = 1e-1 if tensors.a.element_size() >= 2 else 2e-1 + torch.testing.assert_close(output, + output_ref.to(output.dtype), + rtol=rtol, + atol=atol) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("shape", + MNK_SHAPES, + ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.parametrize("types", TEST_TYPES) +def test_machete_all_schedules(shape, types: TypeConfig): + + group_sizes: List[Optional[int]] = [] + if types.group_scale_type is None: + group_sizes = [None] + else: + group_sizes = GROUP_SIZES_TO_TEST + + for group_size in group_sizes: + if not group_size_valid(shape, group_size): + continue + + tensors = create_test_tensors(shape, types, group_size) + print(f"MNK = {shape}") + for schedule in ops.machete_supported_schedules( + types.act_type, + types.weight_type, + group_scales_type=types.group_scale_type, + group_zeros_type=types.group_scale_type, + out_type=types.output_type): + print(f"Testing schedule {schedule}") + machete_mm_test_helper(types, tensors, group_size, schedule) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("shape", + MNK_SHAPES, + ids=lambda x: "x".join(str(v) for v in x)) +@pytest.mark.parametrize("types", TEST_TYPES) +def test_machete_heuristic(shape, types: TypeConfig): + group_sizes: List[Optional[int]] = [] + if types.group_scale_type is None: + group_sizes = [None] + else: + group_sizes = GROUP_SIZES_TO_TEST + + for group_size in group_sizes: + if not group_size_valid(shape, group_size): + continue + + tensors = create_test_tensors(shape, types, group_size) + machete_mm_test_helper(types, tensors, group_size) + + +# Test working on other devices +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_machete_devices(device: str): + group_size = 128 + + type_config = TypeConfig(act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + + tensors = create_test_tensors((512, 4096, 4096), type_config, group_size) + + for field in fields(Tensors): + tensor = getattr(tensors, field.name) + if isinstance(tensor, torch.Tensor): + setattr(tensors, field.name, tensor.to(device)) + + machete_mm_test_helper(type_config, tensors, group_size) + + +# Test working with a subset of A and B +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +def test_machete_subset(): + group_size = 128 + + type_config = TypeConfig(act_type=torch.float16, + weight_type=scalar_types.uint4b8, + output_type=None, + group_scale_type=torch.float16, + group_zero_type=None, + channel_scale_type=None, + token_scale_type=None) + + tensors = create_test_tensors((512, 4096, 4096), + type_config, + group_size, + subset_stride_factor=2) + machete_mm_test_helper(type_config, tensors, group_size) + + +# Test to make sure cuda graphs work +class MacheteLayer(torch.nn.Module): + + def __init__(self, **kwargs): + super().__init__() + self.kwargs = kwargs + + def forward(self, a): + return ops.machete_mm(a=a, **self.kwargs) + + +@pytest.mark.skipif(not IS_SUPPORTED_BY_GPU, + reason="Machete is not supported on this GPU type.") +def test_machete_cuda_graph(): + m, n, k = 512, 4096, 4096 + + a = rand_data((m, k), torch.float16) + b = rand_data((k, n), torch.float16) + wtype = scalar_types.uint4b8 + stype = torch.float16 + group_size = 128 + zero_points = False + + w_ref, w_q_packed, w_s, w_zp = machete_quantize_and_pack( + a.dtype, b, wtype, stype, group_size, zero_points) + + # Construct a trivial model with a single layer that calls a machete kernel + model = MacheteLayer( + b_q=w_q_packed, + b_type=wtype, + b_group_scales=w_s, + b_group_zeros=maybe_convert_zeropoints(w_zp, w_s), + b_group_size=group_size, + ) + + output_ref = torch.matmul(a, w_ref) + + # Run the model with a cuda graph + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + output = model(a) + output.zero_() + g.replay() + + # Relax atol as our reduction dim becomes larger (more rounding error) + # Relax atol when we have zeropoints since the way machete applies + # zeropoints (after scales) causes noise around 0 + atol = 1 if zero_points else min(5e-2 * math.sqrt(k), 1) + torch.testing.assert_close(output, output_ref, rtol=1e-1, atol=atol) diff --git a/vllm/tests/kernels/test_mamba_ssm.py b/vllm/tests/kernels/test_mamba_ssm.py index e92d40136..19d1158c7 100644 --- a/vllm/tests/kernels/test_mamba_ssm.py +++ b/vllm/tests/kernels/test_mamba_ssm.py @@ -8,7 +8,7 @@ from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.model_executor.layers.mamba.ops.mamba_ssm import ( selective_scan_fn, selective_state_update) -from vllm.utils import seed_everything +from vllm.platforms import current_platform def selective_state_update_ref(state, @@ -235,7 +235,7 @@ def test_selective_scan(is_variable_B, is_variable_C, varBC_groups, has_D, rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 dim = 4 dstate = 8 @@ -358,7 +358,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - seed_everything(0) + current_platform.seed_everything(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) @@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C, for var in (u_ref, delta_ref, B_ref, C_ref, z_ref) ] for i in range(len(seqlens[0])): - u_s, delta_s, B_s, C_s, z_s = [v[i].unsqueeze(0) for v in splits] + u_s, delta_s, B_s, C_s, z_s = (v[i].unsqueeze(0) for v in splits) if padded_state_indices[i] == PAD_SLOT_ID: continue out_ref_s, _ = selective_scan_ref( @@ -555,7 +555,7 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, device = "cuda" rtol, atol = (3e-4, 1e-3) if itype == torch.float32 else (5e-3, 1e-2) if itype == torch.bfloat16: - rtol, atol = 7e-2, 7e-2 + rtol, atol = 1e-1, 1e-1 if torch.version.hip: atol *= 2 # set seed @@ -610,8 +610,8 @@ def test_selective_state_update_with_batch_indices(with_padding, dim, dstate, dt_bias=dt_bias, dt_softplus=True) - print("Output diff max", (out - out_ref[0]).max()) - print("Output diff mean", (out - out_ref[0]).mean()) + print("Output diff max", (out[:batch_size] - out_ref).max()) + print("Output diff mean", (out[:batch_size] - out_ref).mean()) print("Output state diff max", (state[state_indices, :] - state_ref).max()) print("Output state diff mean", (state[state_indices, :] - state_ref).mean()) diff --git a/vllm/tests/kernels/test_marlin_gemm.py b/vllm/tests/kernels/test_marlin_gemm.py index 5cfd4d6da..5e047f4b0 100644 --- a/vllm/tests/kernels/test_marlin_gemm.py +++ b/vllm/tests/kernels/test_marlin_gemm.py @@ -29,17 +29,20 @@ marlin_qqq_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( awq_pack, gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights) +from vllm.scalar_type import scalar_types ACT_ORDER_OPTS = [False, True] K_FULL_OPTS = [False, True] USE_FP32_REDUCE_OPTS = [False, True] MARLIN_K_CHUNKS = [128] -MARLIN_N_CHUNKS = [64, 128, 256] +MARLIN_N_CHUNKS = [64, 256] MARLIN_24_K_CHUNKS = [128] MARLIN_24_N_CHUNKS = [512] +HQQ_SUPPORTED_GROUP_SIZES = [64] + MNK_FACTORS = [ (1, 1, 1), (1, 4, 8), @@ -47,6 +50,8 @@ (13, 17, 67), (26, 37, 13), (67, 13, 11), + (257, 13, 11), + (658, 13, 11), ] DTYPES = [torch.float16, torch.bfloat16] @@ -226,7 +231,7 @@ def test_gptq_marlin_gemm( torch.ops._C.gptq_marlin_gemm, (a_input, marlin_q_w, marlin_s, marlin_zp, g_idx, sort_indices, workspace.scratch, quant_type.id, a_input.shape[0], b_weight.shape[1], - a_input.shape[1], is_k_full, False, use_fp32_reduce), + a_input.shape[1], is_k_full, False, use_fp32_reduce, False), test_utils=DEFAULT_OPCHECK_TEST_UTILS) output = ops.gptq_marlin_gemm( @@ -244,6 +249,7 @@ def test_gptq_marlin_gemm( is_k_full=is_k_full, has_zp=False, use_fp32_reduce=use_fp32_reduce, + is_zp_float=False, ) output_ref = torch.matmul(a_input, w_ref) @@ -441,6 +447,7 @@ def test_awq_marlin_gemm( is_k_full=is_k_full, has_zp=has_zp, use_fp32_reduce=use_fp32_reduce, + is_zp_float=False, ) output_ref = torch.matmul(a_input, w_ref) @@ -451,6 +458,87 @@ def test_awq_marlin_gemm( assert max_diff < 0.04 +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), + reason="Marlin is not supported on this GPU type.") +@pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) +@pytest.mark.parametrize("n_chunk", MARLIN_N_CHUNKS) +@pytest.mark.parametrize("group_size", HQQ_SUPPORTED_GROUP_SIZES) +@pytest.mark.parametrize("mnk_factors", MNK_FACTORS) +@pytest.mark.parametrize("use_fp32_reduce", USE_FP32_REDUCE_OPTS) +def test_hqq_marlin_gemm( + k_chunk, + n_chunk, + group_size, + mnk_factors, + use_fp32_reduce, +): + m_factor, n_factor, k_factor = mnk_factors + + size_m = m_factor + size_k = k_chunk * k_factor + size_n = n_chunk * n_factor + + quant_type = scalar_types.uint4 + + a_input = rand_data((size_m, size_k)) + dev = a_input.device + + b_weight = torch.randint(0, + 10, (size_n, size_k), + dtype=torch.uint8, + device=dev) + scale = rand_data((size_n, size_k // group_size)) + zero = rand_data((size_n, size_k // group_size)) + + gptq_w_q = gptq_pack(b_weight.transpose(1, 0), 4, size_k, size_n) + + sort_indices = torch.empty(0, dtype=torch.int, device=dev) + marlin_w_q = ops.gptq_marlin_repack(gptq_w_q, sort_indices, size_k, size_n, + 4).to(dev) + marlin_s = marlin_permute_scales(scale.transpose(1, 0), size_k, size_n, + group_size).to(dev) + marlin_zp = marlin_permute_scales(zero.transpose(1, 0), size_k, size_n, + group_size).to(dev) + + g_idx = marlin_make_empty_g_idx(dev) + g_idx_sort_indices = marlin_make_empty_g_idx(dev) + + workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N, + GPTQ_MARLIN_MAX_PARALLEL) + + output = ops.gptq_marlin_gemm( + a_input, + marlin_w_q, + marlin_s, + marlin_zp, + g_idx, + g_idx_sort_indices, + workspace.scratch, + quant_type, + a_input.shape[0], + b_weight.shape[0], + a_input.shape[1], + is_k_full=True, + has_zp=True, + use_fp32_reduce=use_fp32_reduce, + is_zp_float=True, + ) + + b_flat = b_weight.reshape(-1, group_size) + zp_flat = zero.reshape(-1, 1) + s_flat = scale.reshape(-1, 1) + dequant = (b_flat - zp_flat) * s_flat + + output_ref = torch.matmul(a_input, + dequant.reshape(b_weight.shape).transpose(1, 0)) + + torch.cuda.synchronize() + + max_diff = compute_max_diff(output, output_ref) + + assert max_diff < 0.04 + + @pytest.mark.skipif(not is_quant_method_supported("qqq"), reason="Marlin is not supported on this GPU type.") @pytest.mark.parametrize("k_chunk", MARLIN_K_CHUNKS) diff --git a/vllm/tests/kernels/test_moe.py b/vllm/tests/kernels/test_moe.py index 70906ab21..8b23b6282 100644 --- a/vllm/tests/kernels/test_moe.py +++ b/vllm/tests/kernels/test_moe.py @@ -19,14 +19,16 @@ from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import seed_everything +NUM_EXPERTS = [8, 64] +TOP_KS = [2, 6] -@pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [2048, 256, 1024]) + +@pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) @pytest.mark.parametrize("k", [128, 511, 1024]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) def test_fused_moe( m: int, @@ -43,7 +45,7 @@ def test_fused_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) - torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @pytest.mark.parametrize("dtype", @@ -94,12 +96,12 @@ def test_mixtral_moe(dtype: torch.dtype): atol=mixtral_moe_tol[dtype]) -@pytest.mark.parametrize("m", [64, 512, 222, 33, 1]) -@pytest.mark.parametrize("n", [128, 2048, 256, 1024]) -@pytest.mark.parametrize("k", [128, 1024, 512]) -@pytest.mark.parametrize("e", [8, 64]) -@pytest.mark.parametrize("topk", [2, 6]) -@pytest.mark.parametrize("group_size", [-1, 32, 64, 128]) +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 2048]) +@pytest.mark.parametrize("k", [128, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("group_size", [-1, 32, 128]) @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) @@ -115,7 +117,7 @@ def test_fused_marlin_moe( num_bits: int, is_k_full: bool, ): - seed_everything(7) + current_platform.seed_everything(7) # Filter act_order if act_order: diff --git a/vllm/tests/kernels/test_pos_encoding.py b/vllm/tests/kernels/test_pos_encoding.py index 94da00915..eee77c22a 100644 --- a/vllm/tests/kernels/test_pos_encoding.py +++ b/vllm/tests/kernels/test_pos_encoding.py @@ -5,16 +5,16 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .allclose_default import get_default_atol, get_default_rtol IS_NEOX_STYLE = [True, False] DTYPES = [torch.half, torch.bfloat16, torch.float] -HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256] +HEAD_SIZES = [64, 80, 112, 120, 256] ROTARY_DIMS = [None, 32] # None means rotary dim == head size -NUM_HEADS = [7, 17] # Arbitrary values for testing -BATCH_SIZES = [1, 5] # Arbitrary values for testing +NUM_HEADS = [17] # Arbitrary values for testing +BATCH_SIZES = [5] # Arbitrary values for testing SEQ_LENS = [11, 8192] # Arbitrary values for testing SEEDS = [0] CUDA_DEVICES = [ @@ -48,7 +48,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -100,7 +100,7 @@ def test_batched_rotary_embedding( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size @@ -160,7 +160,7 @@ def test_batched_rotary_embedding_multi_lora( max_position: int = 8192, base: int = 10000, ) -> None: - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/vllm/tests/kernels/test_prefix_prefill.py b/vllm/tests/kernels/test_prefix_prefill.py index 3181d9256..3fdb7996b 100644 --- a/vllm/tests/kernels/test_prefix_prefill.py +++ b/vllm/tests/kernels/test_prefix_prefill.py @@ -9,7 +9,8 @@ from vllm.attention.backends.xformers import _make_alibi_bias from vllm.attention.ops.prefix_prefill import context_attention_fwd -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, seed_everything +from vllm.platforms import current_platform +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 8, 64] @@ -39,7 +40,14 @@ def test_contexted_kv_attention( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -234,7 +242,14 @@ def test_contexted_kv_attention_alibi( kv_cache_dtype: str, device: str, ) -> None: - seed_everything(0) + + if 'fp8' in kv_cache_dtype and not current_platform.has_device_capability( + 89): + pytest.skip( + 'Triton limitation: fp8e4nv data type is not supported on CUDA' + ' arch < 89') + + current_platform.seed_everything(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -461,3 +476,52 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: print(f"xformers Time: {(end_time - start_time)*1000:.2f} ms") atol = 1e-3 if "fp8" in kv_cache_dtype else 1e-6 torch.testing.assert_close(output, output_ref, atol=atol, rtol=0) + + +# These tests are optional to only run when explicitly invoked +# +# pytest -v -s --optional \ +# tests/kernels/test_prefix_prefill.py::test_contexted_kv_attention_f32 +# +# These tests are useful to test model dtype float32 on Turing devices. +# We skip them to not increase the time when running tests on CI +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOW) +@torch.inference_mode() +def test_contexted_kv_attention_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + sliding_window: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention(num_heads, num_queries_per_kv, head_size, + sliding_window, dtype, kv_cache_dtype, device) + + +@pytest.mark.optional +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_queries_per_kv", NUM_QUERIES_PER_KV) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", [torch.float32]) +@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +@torch.inference_mode() +def test_contexted_kv_attention_alibi_f32( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: str, + device: str, +) -> None: + test_contexted_kv_attention_alibi(num_heads, num_queries_per_kv, head_size, + dtype, kv_cache_dtype, device) diff --git a/vllm/tests/kernels/test_triton_scaled_mm.py b/vllm/tests/kernels/test_triton_scaled_mm.py new file mode 100644 index 000000000..8e96a2f70 --- /dev/null +++ b/vllm/tests/kernels/test_triton_scaled_mm.py @@ -0,0 +1,106 @@ +"""Tests for the triton_scaled_mm kernel + +Run `pytest tests/kernels/test_triton_scaled_mm.py`. +""" +import importlib +from typing import Optional, Type + +import pytest +import torch + +from vllm.platforms import current_platform + +device = "cuda" + + +def scaled_mm_torch(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + out = torch.mm(a.to(torch.float32), b.to(torch.float32)) + out = scale_a * out + out = scale_b.T * out + out = out.to(out_dtype) + if bias is not None: + out = out + bias + + return out + + +def get_8bit_types(): + types = [torch.int8] + supports_fp8 = current_platform.has_device_capability(89) + if current_platform.is_rocm() and supports_fp8: + types.append(torch.float8_e4m3fnuz) + elif current_platform.is_cuda() and supports_fp8: + types.append(torch.float8_e4m3fn) + return types + + +@pytest.mark.parametrize("M", [1, 33, 64, 512]) +@pytest.mark.parametrize("N", [256, 971, 20486]) +@pytest.mark.parametrize("K", [128, 496, 1024]) +@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("in_dtype", get_8bit_types()) +@pytest.mark.parametrize("use_scalar_scale_a", [True, False]) +@pytest.mark.parametrize("use_scalar_scale_b", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, + use_scalar_scale_b, use_bias): + is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t + ).is_floating_point() + + current_platform.seed_everything(0) + + # NOTE: There are cases, where if the matrix is large enough, an output + # like 65504.4 can be produced, and can easily turn into inf when + # multiplied when using float16/bfloat16. This means one function, e.g., + # testing function, and another function, e.g. golden function, can + # produce a non-inf value while the other produces an inf value, and + # will cause assert_close/allclose to fail, even though if overflow + # wouldn't have occurred, the values would have been "close." + # + # So, the values here are kept small enough to avoid this situation. + if is_floating_point_type(in_dtype): + a = (0.25 * torch.rand( + (M, K), dtype=torch.float32, device=device)).to(in_dtype) + b = (0.25 * torch.rand( + (K, N), dtype=torch.float32, device=device)).to(in_dtype) + else: + a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device) + b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device) + + if use_scalar_scale_a: + scale_a = torch.rand((1, 1), device=device) + else: + scale_a = 0.25 * torch.rand((M, 1), device=device) + + if use_scalar_scale_b: + scale_b = torch.rand((1, 1), device=device) + else: + scale_b = 0.25 * torch.rand((N, 1), device=device) + + bias = None + if use_bias: + bias = torch.rand((N, ), device=device, dtype=out_dtype) + + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + + c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + + a_cpu = a.cpu() + b_cpu = b.cpu() + scale_a_cpu = scale_a.cpu() + scale_b_cpu = scale_b.cpu() + bias_cpu = None if bias is None else bias.cpu() + + c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu, + out_dtype, bias_cpu) + + c_check_cpu = c_check.cpu() + torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1) diff --git a/vllm/tests/kernels/utils.py b/vllm/tests/kernels/utils.py index a2d414f63..e7865fb25 100644 --- a/vllm/tests/kernels/utils.py +++ b/vllm/tests/kernels/utils.py @@ -13,8 +13,8 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_XFORMERS_ATTN_VAL, - make_tensor_with_pad) +from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. @@ -525,17 +525,22 @@ def make_backend(backend_name: str) -> AttentionBackend: if backend_name == STR_XFORMERS_ATTN_VAL: # NOTE: xFormers backend cannot be imported for CPU and AMD GPUs. from vllm.attention.backends.xformers import XFormersBackend - return XFormersBackend() + elif backend_name == STR_FLASH_ATTN_VAL: + from vllm.attention.backends.flash_attn import FlashAttentionBackend + return FlashAttentionBackend() + raise AssertionError( f"Unrecognized backend_name {backend_name} for unit test") def _make_metadata_tensors( - seq_lens: Optional[List[int]], context_lens: Optional[List[int]], - encoder_seq_lens: Optional[List[int]], device: Union[torch.device, str] -) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[List[int]], - torch.Tensor, Optional[int]]: + seq_lens: Optional[List[int]], + context_lens: Optional[List[int]], + encoder_seq_lens: Optional[List[int]], + device: Union[torch.device, str], +) -> Tuple[torch.Tensor, torch.Tensor, Any, Any, Optional[torch.Tensor], + torch.Tensor, torch.Tensor, Optional[int]]: ''' Build scalar & tensor values required to build attention metadata structure. @@ -553,6 +558,8 @@ def _make_metadata_tensors( * max_context_len: max(context_lens) * max_seq_len: max(seq_lens) * seq_start_loc: start idx of each sequence + * encoder_seq_lens_tensor: encoder seq_lens list, as tensor + * encoder_seq_start_loc: start idx of each encoder sequence * max_encoder_seq_len: encoder seq_lens list, as tensor ''' seq_lens_tensor = maybe_make_int_tensor(seq_lens, device) @@ -566,8 +573,26 @@ def _make_metadata_tensors( seq_start_loc = None + if seq_lens_tensor is not None: + seq_start_loc = torch.zeros(seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=seq_lens_tensor.device) + torch.cumsum(seq_lens_tensor, + dim=0, + dtype=seq_start_loc.dtype, + out=seq_start_loc[1:]) + + encoder_seq_start_loc = torch.zeros(encoder_seq_lens_tensor.shape[0] + 1, + dtype=torch.int32, + device=encoder_seq_lens_tensor.device) + torch.cumsum(encoder_seq_lens_tensor, + dim=0, + dtype=encoder_seq_start_loc.dtype, + out=encoder_seq_start_loc[1:]) + return (seq_lens_tensor, context_lens_tensor, max_context_len, max_seq_len, - seq_start_loc, encoder_seq_lens_tensor, max_encoder_seq_len) + seq_start_loc, encoder_seq_lens_tensor, encoder_seq_start_loc, + max_encoder_seq_len) def make_kv_cache(num_blocks: int, @@ -575,6 +600,7 @@ def make_kv_cache(num_blocks: int, head_size: int, block_size: int, device: Union[torch.device, str], + backend: str, default_val: float = 0.0) -> torch.Tensor: ''' Create a fake KV cache. @@ -591,10 +617,20 @@ def make_kv_cache(num_blocks: int, Returns: * kv_cache: 2 x num_blocks x (block_size * num_heads * head_size) + * for backend 'XFORMERS' + * kv_cache: 2 x num_blocks x block_size x num_heads x head_size + * for backend 'FLASH_ATTN' ''' - - kv_cache = torch.rand( - (2, num_blocks, block_size * num_heads * head_size)).to(device) + if backend == 'XFORMERS': + kv_cache = torch.rand( + (2, num_blocks, block_size * num_heads * head_size)).to(device) + elif backend == 'FLASH_ATTN': + kv_cache = torch.rand( + (2, num_blocks, block_size, num_heads, head_size)).to(device) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") if default_val is not None: kv_cache[:, :, :] = default_val return kv_cache @@ -858,8 +894,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -869,10 +906,12 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=None if seq_lens is None else max(seq_lens), max_decode_seq_len=0, context_lens_tensor=context_lens_tensor, @@ -881,6 +920,7 @@ def make_test_metadata( num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -903,8 +943,9 @@ def make_test_metadata( context_lens_tensor, _, _, - _, + seq_start_loc, encoder_seq_lens_tensor, + encoder_seq_start_loc, max_encoder_seq_len, ) = _make_metadata_tensors(seq_lens, context_lens, @@ -914,18 +955,22 @@ def make_test_metadata( return attn_backend.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, + multi_modal_placeholder_index_maps=None, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, + seq_start_loc=seq_start_loc, max_prefill_seq_len=0, max_decode_seq_len=max(seq_lens), + max_decode_query_len=1, context_lens_tensor=context_lens_tensor, block_tables=kv_mmap.block_tables, use_cuda_graph=False, num_encoder_tokens=num_encoder_tokens, encoder_seq_lens=encoder_seq_lens, encoder_seq_lens_tensor=encoder_seq_lens_tensor, + encoder_seq_start_loc=encoder_seq_start_loc, max_encoder_seq_len=max_encoder_seq_len, cross_slot_mapping=(None if cross_kv_mmap is None else cross_kv_mmap.slot_mapping), @@ -934,7 +979,8 @@ def make_test_metadata( def assert_actual_matches_ideal(test_params: PhaseTestParameters, - output_under_test: torch.Tensor) -> None: + output_under_test: torch.Tensor, + backend: str) -> None: ''' Assert that observed output matches the ideal output contained in the test parameters data structure. @@ -945,8 +991,22 @@ def assert_actual_matches_ideal(test_params: PhaseTestParameters, * output_under_test: actually observed output value ''' ideal_output = test_params.packed_qkvo.ideal_output - torch.testing.assert_close(ideal_output, - output_under_test.view_as(ideal_output)) + if backend == 'XFORMERS': + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output)) + + elif backend == 'FLASH_ATTN': + # For FlashAttention override the accuracy thresholds to non default + # values since we notice a higher difference between the ideal and + # actual output. + torch.testing.assert_close(ideal_output, + output_under_test.view_as(ideal_output), + atol=0.01, + rtol=0.016) + else: + raise ValueError( + f"Unknown backend value: '{backend}'. Expected 'XFORMERS' or " + f"'FLASH_ATTN'.") # Copied/modified from torch._refs.__init__.py diff --git a/vllm/tests/kv_transfer/disagg_test.py b/vllm/tests/kv_transfer/disagg_test.py new file mode 100644 index 000000000..adc6150ed --- /dev/null +++ b/vllm/tests/kv_transfer/disagg_test.py @@ -0,0 +1,119 @@ +import os +import subprocess +import sys +import time +from subprocess import Popen + +import pytest +import requests +import torch + + +# Fixture to set up environment variables and teardown servers after tests +@pytest.fixture(scope="module", autouse=True) +def setup_servers(): + if torch.cuda.device_count() < 4: + pytest.skip("Skipping test: fewer than 4 GPUs available") + + # Set up environment variables + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", + shell=True).decode().strip() + os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP + + # Start prefill instance + prefill_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8100", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer",'\ + '"kv_rank":0,"kv_parallel_size":2}', + ] + prefill_env = os.environ.copy() + prefill_env["CUDA_VISIBLE_DEVICES"] = "0" + prefill_proc = Popen(prefill_cmd, env=prefill_env) + + # Start decode instance + decode_cmd = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8200", + "--gpu-memory-utilization", + "0.5", + "--max-model-len", + "1000", + "--kv-transfer-config", + '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer",'\ + '"kv_rank":1,"kv_parallel_size":2}', + ] + decode_env = os.environ.copy() + decode_env["CUDA_VISIBLE_DEVICES"] = "1" + decode_proc = Popen(decode_cmd, env=decode_env) + + # Wait for servers to be ready + assert wait_for_server(8100), "Prefill server did not start in time" + assert wait_for_server(8200), "Decode server did not start in time" + + # Yield to the test function and handle teardown after tests + yield + + # Cleanup: kill the processes + prefill_proc.terminate() + decode_proc.terminate() + + # Additional cleanup if needed + prefill_proc.wait() + decode_proc.wait() + + +# Helper function to wait for server +def wait_for_server(port, timeout=240): + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(f"http://localhost:{port}/v1/completions") + if response.status_code in [200, 405]: + return True + except requests.ConnectionError: + time.sleep(1) + return False + + +# Test function to send curl requests and validate responses +@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) +def test_disaggregated_prefilling(prompt): + # Send to prefill + response = requests.post("http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + }) + assert response.status_code == 200 + + # Send to decode + response = requests.post("http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + }) + assert response.status_code == 200 diff --git a/vllm/tests/kv_transfer/module_test.py b/vllm/tests/kv_transfer/module_test.py new file mode 100644 index 000000000..355461919 --- /dev/null +++ b/vllm/tests/kv_transfer/module_test.py @@ -0,0 +1,64 @@ +import subprocess +import sys + +import pytest +import torch + + +def run_python_script(script_name, timeout): + script_name = f'kv_transfer/{script_name}' + try: + # Start both processes asynchronously using Popen + process0 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "0"}, # Set the RANK environment variable for process 0 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + process1 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": + "1"}, # Set the RANK environment variable for process 1 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + # Wait for both processes to complete, with a timeout + process0.wait(timeout=timeout) + process1.wait(timeout=timeout) + + # Check the return status of both processes + if process0.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=0, {process0.returncode}") + if process1.returncode != 0: + pytest.fail( + f"Test {script_name} failed for RANK=1, {process1.returncode}") + + except subprocess.TimeoutExpired: + # If either process times out, terminate both and fail the test + process0.terminate() + process1.terminate() + pytest.fail(f"Test {script_name} timed out") + except Exception as e: + pytest.fail(f"Test {script_name} failed with error: {str(e)}") + + +# Define the test cases using pytest's parametrize +@pytest.mark.parametrize( + "script_name,timeout", + [ + ("test_lookup_buffer.py", + 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout + ]) +def test_run_python_script(script_name, timeout): + # Check the number of GPUs + if torch.cuda.device_count() < 2: + pytest.skip( + f"Skipping test {script_name} because <2 GPUs are available") + + # Run the test if there are at least 2 GPUs + run_python_script(script_name, timeout) diff --git a/vllm/tests/kv_transfer/test_lookup_buffer.py b/vllm/tests/kv_transfer/test_lookup_buffer.py new file mode 100644 index 000000000..96b0e5871 --- /dev/null +++ b/vllm/tests/kv_transfer/test_lookup_buffer.py @@ -0,0 +1,160 @@ +import os +import random + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer import ( + SimpleBuffer) +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + +# TODO: the test depends on a lot of fields in the current implementation. +# We should have standard interface instead direct field access + + +def test_run(my_rank, buffer, device): + + # buffer should be empty in the beginning + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("My rank: %d, device: %s" % (my_rank, device)) + + # insert + tokens = torch.tensor([1, 2, 3]).to(device) + roi = (tokens > 0) + if my_rank == 0: + key = 2.0 * torch.ones([5, 6]).to(device) + value = 3.0 * torch.ones([5, 6]).to(device) + + placeholder = torch.tensor([1]).to(device) + + buffer.insert(tokens, roi, key, value, placeholder) + + torch.distributed.barrier() + + # drop_select + if my_rank == 1: + tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) + assert torch.allclose(tokens, tok) + assert torch.allclose(roi, roi_) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) + torch.distributed.barrier() + + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + print("Test run passed!") + + +def stress_test(my_rank, buf, device): + + torch.distributed.barrier() + torch.manual_seed(100) + + reqs = [ + ( + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in tqdm(range(200)) + ] + + random.seed(my_rank) + random.shuffle(reqs) + + torch.distributed.barrier() + + n = 0 + + # the buffer size can only store 100 reqs + # so the sender will occasionally block to wait for the receiver. + for req in tqdm(reqs): + if my_rank == 0: + buf.insert(*req) + else: + tok, roi, k, v, h = req + tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) + + if tok_ is None: + assert roi_ is None + assert k_ is None + assert v_ is None + assert h_ is None + n += 1 + else: + assert torch.allclose(tok, tok_) + assert torch.allclose(roi, roi_) + assert torch.allclose(k, k_) + assert torch.allclose(v, v_) + assert torch.allclose(h, h_) + print('Rank %d done' % my_rank) + torch.distributed.barrier() + + if my_rank == 0: + x = torch.tensor([0]) + torch.distributed.recv(x, 1) + # the # of None received is the kv that are not selected + assert x.item() == len(buf.buffer) + # and the size of the buffer should be 2000 * buffer len + print(buf.buffer_size) + assert buf.buffer_size == 1700 * len(buf.buffer) + else: + torch.distributed.send(torch.tensor([n]), 0) + + print("Passed stress test!") + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + print("initialized! My rank is %d" % my_rank) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + data_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cuda", + port_offset=0, + ) + cpu_pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + device="cpu", + port_offset=1, + ) + + buffer = SimpleBuffer(cpu_pipe, data_pipe, 170000) + + test_run(my_rank, buffer, data_pipe.device) + + stress_test(my_rank, buffer, data_pipe.device) + + buffer.close() + data_pipe.close() + cpu_pipe.close() + print('Done') diff --git a/vllm/tests/kv_transfer/test_lookup_buffer.sh b/vllm/tests/kv_transfer/test_lookup_buffer.sh new file mode 100644 index 000000000..09d7ee018 --- /dev/null +++ b/vllm/tests/kv_transfer/test_lookup_buffer.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python test_lookup_buffer.py & +RANK=1 python test_lookup_buffer.py & \ No newline at end of file diff --git a/vllm/tests/kv_transfer/test_send_recv.py b/vllm/tests/kv_transfer/test_send_recv.py new file mode 100644 index 000000000..65973bf10 --- /dev/null +++ b/vllm/tests/kv_transfer/test_send_recv.py @@ -0,0 +1,155 @@ +import os +import time +from typing import List + +import torch +from tqdm import tqdm + +from vllm.config import KVTransferConfig +from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import PyNcclPipe + + +def test_run(my_rank, pipe): + # test run + x = torch.tensor([1]).to(pipe.device) + y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) + if my_rank == 0: + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + + else: + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + + assert torch.allclose(x, x2) + assert torch.allclose(y, y2) + + +def stress_test(my_rank, pipe): + + torch.distributed.barrier() + + tensors: List[torch.Tensor] = [] + + torch.manual_seed(0) + + for i in tqdm(range(500)): + mean = torch.rand(1).item() * 100 + std = torch.rand(1).item() * 100 + size = torch.randint(900, 1000, (2, )) + x = torch.normal(mean * 1.0, std * 1.0, + size=size.tolist()).to(pipe.device) + + # 5% probability of sending a None + if torch.rand(1).item() < 0.05: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean().unsqueeze(0)) + tensors.append(x.std().unsqueeze(0)) + + torch.distributed.barrier() + + for i in tqdm(range(500)): + if my_rank == int((i % 10) > 3): + pipe.send_tensor(tensors[3 * i]) + pipe.send_tensor(tensors[3 * i + 1]) + pipe.send_tensor(tensors[3 * i + 2]) + else: + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + + if x is None: + assert mean is None + assert std is None + else: + assert torch.allclose(x, tensors[3 * i]) + assert x.mean() == mean[0] + assert x.std() == std[0] + + torch.distributed.barrier() + + +def latency_test(my_rank, pipe, nelement, ntensor): + + latencies = [] + + torch.distributed.barrier() + + for i in tqdm(range(500)): + + tensors = [] + + if my_rank == 0: + # create tensor + tensors = [ + torch.rand(nelement).to(pipe.device) for _ in range(ntensor) + ] + + torch.distributed.barrier() + + if my_rank == 0: + t = torch.tensor([time.time()], + dtype=torch.float64).to(pipe.device) + for tensor in tensors: + pipe.send_tensor(tensor) + pipe.send_tensor(t) + else: + for _ in range(ntensor): + pipe.recv_tensor() + t = pipe.recv_tensor() + latencies.append(time.time() - t.item()) + + torch.distributed.barrier() + + print('Latency test passed.') + print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + torch.distributed.init_process_group( + backend='gloo', + init_method='tcp://localhost:12398', + world_size=2, + rank=my_rank, + ) + + config = KVTransferConfig( + kv_connector='PyNcclConnector', + kv_buffer_device='cuda', + kv_buffer_size=1e9, + kv_rank=my_rank, + kv_role="kv_both", # this arg doesn't matter in this test + kv_parallel_size=2, + kv_ip="127.0.0.1", + kv_port=12345, + ) + + pipe = PyNcclPipe( + local_rank=my_rank, + config=config, + ) + + test_run(my_rank, pipe) + stress_test(my_rank, pipe) + + # Use this function if you want to test the latency of pipe impl. + # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/vllm/tests/kv_transfer/test_send_recv.sh b/vllm/tests/kv_transfer/test_send_recv.sh new file mode 100644 index 000000000..1e89e246b --- /dev/null +++ b/vllm/tests/kv_transfer/test_send_recv.sh @@ -0,0 +1,3 @@ +#!/bin/bash +RANK=0 python3 test_send_recv.py & +RANK=1 python3 test_send_recv.py & \ No newline at end of file diff --git a/vllm/tests/lora/conftest.py b/vllm/tests/lora/conftest.py index e40f0dd74..29ecf3780 100644 --- a/vllm/tests/lora/conftest.py +++ b/vllm/tests/lora/conftest.py @@ -152,6 +152,11 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) +@pytest.fixture(scope="session") +def lora_bias_files(): + return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test @@ -248,11 +253,10 @@ def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) get_model_old = get_model - def get_model_patched(*, model_config, device_config, **kwargs): - kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8) - return get_model_old(model_config=model_config, - device_config=device_config, - **kwargs) + def get_model_patched(**kwargs): + kwargs["vllm_config"].lora_config = LoRAConfig(max_loras=4, + max_lora_rank=8) + return get_model_old(**kwargs) with patch("vllm.worker.model_runner.get_model", get_model_patched): engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) diff --git a/vllm/tests/lora/test_chatglm3.py b/vllm/tests/lora/test_chatglm3_tp.py similarity index 56% rename from vllm/tests/lora/test_chatglm3.py rename to vllm/tests/lora/test_chatglm3_tp.py index de4cbea80..f17464573 100644 --- a/vllm/tests/lora/test_chatglm3.py +++ b/vllm/tests/lora/test_chatglm3_tp.py @@ -1,12 +1,21 @@ from typing import List import vllm +from tests.utils import fork_new_process_for_each_test from vllm.lora.request import LoRARequest +from ..utils import multi_gpu_test + MODEL_PATH = "THUDM/chatglm3-6b" PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 +EXPECTED_LORA_OUTPUT = [ + "SELECT count(*) FROM singer", + "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 + "SELECT name , country , age FROM singer ORDER BY age", +] + def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ @@ -20,7 +29,6 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: "Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 ), ] - print(prompts) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( prompts, @@ -37,23 +45,58 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@fork_new_process_for_each_test def test_chatglm3_lora(chatglm3_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, enable_lora=True, max_loras=4, max_lora_rank=64, + tensor_parallel_size=1, trust_remote_code=True) - expected_lora_output = [ - "SELECT count(*) FROM singer", - "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 - "SELECT name , country , age FROM singer ORDER BY age", - ] + output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] + output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_chatglm3_lora_tp4(chatglm3_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=False) + + output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] + output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files): + llm = vllm.LLM(MODEL_PATH, + max_model_len=1024, + enable_lora=True, + max_loras=4, + max_lora_rank=64, + tensor_parallel_size=4, + trust_remote_code=True, + fully_sharded_loras=True) output1 = do_sample(llm, chatglm3_lora_files, lora_id=1) - for i in range(len(expected_lora_output)): - assert output1[i] == expected_lora_output[i] + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output1[i] == EXPECTED_LORA_OUTPUT[i] output2 = do_sample(llm, chatglm3_lora_files, lora_id=2) - for i in range(len(expected_lora_output)): - assert output2[i] == expected_lora_output[i] + for i in range(len(EXPECTED_LORA_OUTPUT)): + assert output2[i] == EXPECTED_LORA_OUTPUT[i] diff --git a/vllm/tests/lora/test_layers.py b/vllm/tests/lora/test_layers.py index db877219a..a113e3f7a 100644 --- a/vllm/tests/lora/test_layers.py +++ b/vllm/tests/lora/test_layers.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask) from vllm.model_executor.utils import set_random_seed -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .utils import DummyLoRAManager @@ -51,6 +51,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] + # We will launch different triton kernels between the prefill and decode # stages, so we need to verify this. prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -120,11 +121,12 @@ def populate_loras( subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): - sublora = DummyLoRAManager().init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) + sublora = DummyLoRAManager( + layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) sublora.lora_b = sublora.lora_b[:, (sublora_len * i):(sublora_len * (i + 1))] sublora.optimize() @@ -152,6 +154,7 @@ def create_random_inputs( input_size: Tuple[int, ...], input_range: Tuple[float, float], input_type: torch.dtype = torch.int, + device: torch.device = "cuda" ) -> Tuple[List[torch.Tensor], List[int], List[int]]: """Creates random inputs. @@ -173,10 +176,14 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), high=int(high), size=input_size)) + torch.randint(low=int(low), + high=int(high), + size=input_size, + device=device)) else: inputs.append( - torch.rand(size=input_size, dtype=input_type) * high + low) + torch.rand(size=input_size, dtype=input_type, device=device) * + high + low) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -191,6 +198,10 @@ def create_random_inputs( @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA + # device, see: https://github.com/triton-lang/triton/issues/2925 + # Same below. + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -225,7 +236,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -263,7 +274,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -291,6 +302,7 @@ def create_random_embedding_layer(): def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -345,7 +357,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -400,7 +412,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) original_inputs = deepcopy(inputs) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, @@ -426,6 +438,7 @@ def create_random_embedding_layer(): def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -471,7 +484,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -520,7 +533,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -552,14 +565,18 @@ def _pretest(): @pytest.mark.parametrize("num_loras", [1, 2, 4, 8]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) -def test_linear_replicated(dist_init, num_loras, device, stage) -> None: +@pytest.mark.parametrize("bias_enabled", [True, False]) +def test_linear_replicated(dist_init, num_loras, device, stage, + bias_enabled) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_replicated_layer(): @@ -571,7 +588,12 @@ def create_random_linear_replicated_layer(): lora_linear = ReplicatedLinearWithLoRA(linear) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -592,7 +614,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -631,7 +653,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -655,16 +677,19 @@ def create_random_linear_replicated_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_random_linear_parallel_layer(): if orientation == "row": @@ -685,7 +710,12 @@ def create_random_linear_parallel_layer(): if not fully_shard else ColumnParallelLinearWithShardedLoRA(linear)) lora_linear.create_lora_weights(max_loras, lora_config) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == 1) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -706,7 +736,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -745,7 +775,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -769,16 +799,19 @@ def create_random_linear_parallel_layer(): @pytest.mark.parametrize("fully_shard", [True, False]) @pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("stage", STAGES) +@pytest.mark.parametrize("bias_enabled", [True, False]) def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, - device, stage) -> None: + device, stage, bias_enabled) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 lora_config = LoRAConfig(max_loras=max_loras, max_lora_rank=8, fully_sharded_loras=fully_shard, - lora_dtype=torch.float16) + lora_dtype=torch.float16, + bias_enabled=bias_enabled) def create_column_parallel_packed_layer(): if repeats == 2: @@ -816,10 +849,16 @@ class FakeConfig: num_key_value_heads = 32 num_attention_heads = 32 + n_slices = repeats lora_linear.create_lora_weights(max_loras, lora_config, model_config=FakeConfig()) - + assert (lora_linear.n_slices == len(lora_linear.lora_a_stacked) == len( + lora_linear.lora_b_stacked) == n_slices) + if bias_enabled: + assert len(lora_linear.lora_bias_stacked) == lora_linear.n_slices + else: + assert lora_linear.lora_bias_stacked is None return linear, lora_linear for i in range(10): @@ -842,7 +881,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -883,7 +922,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -895,7 +934,6 @@ class FakeConfig: 512, lora_config.lora_extra_vocab_size, ) - # lora_linear.set_mapping(*mapping_info) lora_result = lora_linear(torch.cat(inputs))[0] expected_result = linear(torch.cat(inputs))[0] @@ -923,7 +961,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, seq_len) -> None: dtype = torch.float16 seed = 0 - seed_everything(seed) + current_platform.seed_everything(seed) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -962,7 +1000,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_size=(1, max_position), input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), diff --git a/vllm/tests/lora/test_llama.py b/vllm/tests/lora/test_llama.py deleted file mode 100644 index e2a4f1ed0..000000000 --- a/vllm/tests/lora/test_llama.py +++ /dev/null @@ -1,146 +0,0 @@ -from typing import List - -import pytest -import ray - -import vllm -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.lora.request import LoRARequest - -MODEL_PATH = "meta-llama/Llama-2-7b-hf" - - -def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: - prompts = [ - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 - "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 - ] - sampling_params = vllm.SamplingParams(temperature=0, - max_tokens=256, - stop=["[/assistant]"]) - outputs = llm.generate( - prompts, - sampling_params, - lora_request=LoRARequest(str(lora_id), lora_id, lora_path) - if lora_id else None) - # Print the outputs. - generated_texts: List[str] = [] - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - return generated_texts - - -@pytest.mark.parametrize("tp_size", [1, 2, 4]) -def test_llama_lora(sql_lora_files, tp_size, num_gpus_available): - if num_gpus_available < tp_size: - pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - - llm = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=tp_size) - - expected_no_lora_output = [ - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 - "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 - " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 - "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 - ] - expected_lora_output = [ - " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 - " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 - " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 - " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 - " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 - " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 - ] - - print("lora adapter created") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 1") - assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output - - print("no lora") - assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output - - print("lora 2") - assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output - - print("removing lora") - - -def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): - if num_gpus_available < 4: - pytest.skip("Not enough GPUs for tensor parallelism 4") - - llm_tp1 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=1) - output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) - - del llm_tp1 - cleanup_dist_env_and_memory() - - llm_tp2 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=2) - output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) - - del llm_tp2 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp2 - - llm_tp4 = vllm.LLM(MODEL_PATH, - enable_lora=True, - max_num_seqs=16, - max_loras=4, - tensor_parallel_size=4) - output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) - - del llm_tp4 - cleanup_dist_env_and_memory() - - assert output_tp1 == output_tp4 - - -def test_llama_lora_warmup(sql_lora_files): - """Test that the LLM initialization works with a warmup LORA path and - is more conservative""" - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_lora(): - llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) - num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks - return num_gpu_blocks_lora_warmup - - @ray.remote(num_gpus=1) - def get_num_gpu_blocks_no_lora(): - llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) - num_gpu_blocks_no_lora_warmup = ( - llm.llm_engine.cache_config.num_gpu_blocks) - return num_gpu_blocks_no_lora_warmup - - num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) - num_gpu_blocks_no_lora_warmup = ray.get( - get_num_gpu_blocks_no_lora.remote()) - assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( - "The warmup with lora should be more " - "conservative than without lora, therefore the number of " - "memory blocks for the KV cache should be " - "less when using lora than when not using lora") diff --git a/vllm/tests/lora/test_llama_tp.py b/vllm/tests/lora/test_llama_tp.py new file mode 100644 index 000000000..d3ca7f878 --- /dev/null +++ b/vllm/tests/lora/test_llama_tp.py @@ -0,0 +1,155 @@ +from typing import List + +import ray + +import vllm +from tests.utils import fork_new_process_for_each_test +from vllm.lora.request import LoRARequest + +from ..utils import multi_gpu_test + +MODEL_PATH = "meta-llama/Llama-2-7b-hf" + +EXPECTED_NO_LORA_OUTPUT = [ + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ", # noqa: E501 + "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ", # noqa: E501 + " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ", # noqa: E501 + "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE", # noqa: E501 +] +EXPECTED_LORA_OUTPUT = [ + " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 + " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 + " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 + " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 + " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 + " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' " # noqa: E501 +] + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +def generate_and_test(llm, sql_lora_files): + print("lora adapter created") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 1") + assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT + + print("no lora") + assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT + + print("lora 2") + assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT + + print("removing lora") + + +@fork_new_process_for_each_test +def test_llama_lora(sql_lora_files): + + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=1) + generate_and_test(llm, sql_lora_files) + + +@fork_new_process_for_each_test +def test_llama_lora_warmup(sql_lora_files): + """Test that the LLM initialization works with a warmup LORA path and + is more conservative""" + + @ray.remote(num_gpus=1) + def get_num_gpu_blocks_lora(): + llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16) + num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks + return num_gpu_blocks_lora_warmup + + @ray.remote(num_gpus=1) + def get_num_gpu_blocks_no_lora(): + llm = vllm.LLM(MODEL_PATH, max_num_seqs=16) + num_gpu_blocks_no_lora_warmup = ( + llm.llm_engine.cache_config.num_gpu_blocks) + return num_gpu_blocks_no_lora_warmup + + num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote()) + num_gpu_blocks_no_lora_warmup = ray.get( + get_num_gpu_blocks_no_lora.remote()) + assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, ( + "The warmup with lora should be more " + "conservative than without lora, therefore the number of " + "memory blocks for the KV cache should be " + "less when using lora than when not using lora") + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4(sql_lora_files): + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + ) + generate_and_test(llm, sql_lora_files) + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + ) + generate_and_test(llm, sql_lora_files) + + +@multi_gpu_test(num_gpus=4) +@fork_new_process_for_each_test +def test_llama_lora_tp4_fully_sharded_enable_bias(sql_lora_files): + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + tensor_parallel_size=4, + fully_sharded_loras=True, + enable_lora_bias=True, + ) + generate_and_test(llm, sql_lora_files) diff --git a/vllm/tests/lora/test_long_context.py b/vllm/tests/lora/test_long_context.py index c8edb02a8..eada902c8 100644 --- a/vllm/tests/lora/test_long_context.py +++ b/vllm/tests/lora/test_long_context.py @@ -138,13 +138,7 @@ def test_rotary_emb_replaced(dist_init): enable_lora=True) engine_config = engine_args.create_engine_config() model_runner = ModelRunner( - model_config=engine_config.model_config, - parallel_config=engine_config.parallel_config, - scheduler_config=engine_config.scheduler_config, - device_config=engine_config.device_config, - cache_config=engine_config.cache_config, - load_config=engine_config.load_config, - lora_config=engine_config.lora_config, + vllm_config=engine_config, is_driver_worker=True, ) model_runner.load_model() diff --git a/vllm/tests/lora/test_lora_bias_e2e.py b/vllm/tests/lora/test_lora_bias_e2e.py new file mode 100644 index 000000000..c2520c847 --- /dev/null +++ b/vllm/tests/lora/test_lora_bias_e2e.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ibm-granite/granite-3b-code-base" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + generated_texts: List[str] = [] + for output in outputs: + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + return generated_texts + + +@pytest.mark.parametrize("lora_bias", [True]) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_lora_rank=8, + max_loras=1, + enable_lora_bias=lora_bias, + tensor_parallel_size=1, + fully_sharded_loras=fully_sharded) + + print("lora adapter created") + output1 = do_sample(llm, lora_bias_files, lora_id=0) + + print("lora") + output2 = do_sample(llm, lora_bias_files, lora_id=1) + + if lora_bias: + assert output1 != output2 + else: + assert output1 == output2 diff --git a/vllm/tests/lora/test_lora_manager.py b/vllm/tests/lora/test_lora_manager.py index 67cf298b4..8d109b2c8 100644 --- a/vllm/tests/lora/test_lora_manager.py +++ b/vllm/tests/lora/test_lora_manager.py @@ -25,8 +25,13 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -def test_from_lora_tensors(sql_lora_files): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( @@ -36,7 +41,7 @@ def test_from_lora_tensors(sql_lora_files): 8, 16, tensors, - "cuda", + device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) @@ -46,6 +51,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.lora_alpha == 16 assert lora.lora_a is not None assert lora.lora_b is not None + assert lora.lora_a.device == torch.device(device) + assert lora.lora_b.device == torch.device(device) assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" assert lora.lora_a.shape[1] == 8 @@ -60,8 +67,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: +def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], + device: torch.device) -> LoRAModel: loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -69,8 +76,8 @@ def create_lora(lora_id: int, model: nn.Module, name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0]], device="cuda"), + torch.rand([w.shape[1], 8], device=device), + torch.rand([8, w.shape[0]], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -80,6 +87,7 @@ def create_packed_lora( model: nn.Module, module_name, replaced_module_names, + device: torch.device, empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight @@ -91,9 +99,9 @@ def create_packed_lora( replaced_module_name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), + torch.rand([w.shape[1], 8], device=device), torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="cuda"), + device=device), ) return LoRAModel(lora_id, 8, loras) @@ -104,7 +112,8 @@ def test_replace_submodules(dist_init, dummy_model): model.packed_modules_mapping = {} manager = LoRAModelManager( model, 1, 1, 1, - LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), + torch.device("cuda")) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -116,16 +125,28 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -def test_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -161,17 +182,32 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 + assert manager.device == device + assert manager.punica_wrapper.device == device -def test_lora_lru_cache_model_manager(dist_init, dummy_model): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -238,20 +274,37 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): with pytest.raises(ValueError): assert manager.pin_adapter(3) + assert manager.punica_wrapper.device == device + assert manager.device == device + -def test_lru_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora4 = create_lora(4, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -351,14 +404,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} + assert manager.punica_wrapper.device == device + assert manager.device == device +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -426,14 +482,19 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): # Should remove every LoRA not specified in the request. lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -497,8 +558,13 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + -def test_packed_loras(dist_init, dummy_model_gate_up): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] model.packed_modules_mapping = { @@ -511,18 +577,25 @@ def test_packed_loras(dist_init, dummy_model_gate_up): 1, model, module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) + replaced_module_names=["gate_proj", "up_proj"], + device=device) model_lora1 = create_packed_lora( 2, model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], + device=device, empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), diff --git a/vllm/tests/lora/test_minicpmv.py b/vllm/tests/lora/test_minicpmv.py index be040060d..2c45ce514 100644 --- a/vllm/tests/lora/test_minicpmv.py +++ b/vllm/tests/lora/test_minicpmv.py @@ -1,8 +1,11 @@ from typing import List +import pytest + import vllm from vllm.assets.image import ImageAsset from vllm.lora.request import LoRARequest +from vllm.platforms import current_platform MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5" @@ -53,6 +56,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="MiniCPM-V dependency xformers incompatible with ROCm") def test_minicpmv_lora(minicpmv_lora_files): llm = vllm.LLM( MODEL_PATH, @@ -63,7 +69,6 @@ def test_minicpmv_lora(minicpmv_lora_files): trust_remote_code=True, gpu_memory_utilization=0.97 # This model is pretty big for CI gpus ) - output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) for i in range(len(EXPECTED_OUTPUT)): assert EXPECTED_OUTPUT[i].startswith(output1[i]) diff --git a/vllm/tests/lora/test_punica_sizes.py b/vllm/tests/lora/test_punica_sizes.py index 41c37a481..66b5f82bb 100644 --- a/vllm/tests/lora/test_punica_sizes.py +++ b/vllm/tests/lora/test_punica_sizes.py @@ -1,11 +1,9 @@ """ -This script is mainly used to tests various hidden_sizes. We have collected the +This script is mainly used to tests various hidden_sizes. We have collected the hidden_sizes included in the LoRA models currently supported by vLLM. It tests whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. """ -from unittest.mock import patch - import pytest import torch @@ -15,8 +13,7 @@ from vllm.lora.ops.sgmv_expand import sgmv_expand from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink -from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything +from vllm.platforms import current_platform from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -146,7 +143,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -235,11 +232,8 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel - torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -262,33 +256,21 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -324,10 +306,9 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( @@ -374,22 +355,16 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/vllm/tests/lora/test_punica_variation.py b/vllm/tests/lora/test_punica_variation.py index 185da6399..3b2003327 100644 --- a/vllm/tests/lora/test_punica_variation.py +++ b/vllm/tests/lora/test_punica_variation.py @@ -1,21 +1,19 @@ """ -This script is mainly used to test whether trtion kernels can run normally -under different conditions, including various batches, numbers of LoRA , and +This script is mainly used to test whether trtion kernels can run normally +under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ -from unittest.mock import patch - import pytest import torch -from vllm.lora.ops.bgmv_expand import bgmv_expand -from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice -from vllm.lora.ops.bgmv_shrink import bgmv_shrink -from vllm.lora.ops.sgmv_expand import sgmv_expand -from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice -from vllm.lora.ops.sgmv_shrink import sgmv_shrink -from vllm.triton_utils.libentry import LibEntry -from vllm.utils import seed_everything +# Enable custom op register +import vllm.lora.ops.bgmv_expand +import vllm.lora.ops.bgmv_expand_slice +import vllm.lora.ops.bgmv_shrink +import vllm.lora.ops.sgmv_expand +import vllm.lora.ops.sgmv_expand_slice +import vllm.lora.ops.sgmv_shrink # noqa: F401 +from vllm.platforms import current_platform from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -40,6 +38,16 @@ def assert_close(a, b): torch.testing.assert_close(a, b, rtol=rtol, atol=atol) +# Unlike test_punica_sizes.py, we directly utilize custom op for +# testing, which verifies the correct registration of these ops. +bgmv_expand = torch.ops.vllm.bgmv_expand +bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice +bgmv_shrink = torch.ops.vllm.bgmv_shrink +sgmv_expand = torch.ops.vllm.sgmv_expand +sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice +sgmv_shrink = torch.ops.vllm.sgmv_shrink + + @pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("num_loras", NUM_LORA) @pytest.mark.parametrize("rank", MAX_RANKS) @@ -61,7 +69,7 @@ def test_punica_sgmv( device: str, ): torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 ( @@ -150,11 +158,9 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 1 ( @@ -177,33 +183,22 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -239,10 +234,8 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel - torch.set_default_device(device) - seed_everything(seed) + current_platform.seed_everything(seed) seq_length = 128 if op_type == "sgmv" else 1 ( @@ -289,22 +282,15 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/vllm/tests/lora/test_tokenizer_group.py b/vllm/tests/lora/test_tokenizer_group.py index daa39b2a3..d225a3f7d 100644 --- a/vllm/tests/lora/test_tokenizer_group.py +++ b/vllm/tests/lora/test_tokenizer_group.py @@ -17,6 +17,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): tokenizer_id="gpt2", enable_lora=True, max_num_seqs=1, + max_loras=1, max_input_length=None, ) lora_request = LoRARequest("1", 1, sql_lora_files) @@ -53,3 +54,22 @@ def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer + + +@pytest.mark.parametrize("enable_lora", [True, False]) +@pytest.mark.parametrize("max_num_seqs", [1, 2]) +@pytest.mark.parametrize("max_loras", [1, 2]) +def test_lora_tokenizers(enable_lora, max_num_seqs, max_loras): + tokenizer_group = get_tokenizer_group( + get_tokenizer_pool_config(None), + tokenizer_id="gpt2", + enable_lora=enable_lora, + max_num_seqs=max_num_seqs, + max_loras=max_loras, + max_input_length=None, + ) + if enable_lora: + assert tokenizer_group.lora_tokenizers.capacity == max( + max_num_seqs, max_loras) + else: + assert tokenizer_group.lora_tokenizers.capacity == 0 diff --git a/vllm/tests/lora/test_utils.py b/vllm/tests/lora/test_utils.py index db02bacdb..85110b8fa 100644 --- a/vllm/tests/lora/test_utils.py +++ b/vllm/tests/lora/test_utils.py @@ -12,36 +12,40 @@ def test_parse_fine_tuned_lora_name_valid(): fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), + ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False), + ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False), ( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", True, + False, ), ( "base_model.model.model.embed_tokens.lora_embedding_B", "model.embed_tokens", False, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", "model.layers.9.mlp.down_proj", True, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", "model.layers.9.mlp.down_proj", False, + False, ), } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) + for name, module_name, is_lora_a, is_bias in fixture: + assert (module_name, is_lora_a, + is_bias) == parse_fine_tuned_lora_name(name) def test_parse_fine_tuned_lora_name_invalid(): fixture = { - "weight", "base_model.weight", "base_model.model.weight", } diff --git a/vllm/tests/lora/test_worker.py b/vllm/tests/lora/test_worker.py index 2f7ac8550..9d814f657 100644 --- a/vllm/tests/lora/test_worker.py +++ b/vllm/tests/lora/test_worker.py @@ -4,7 +4,8 @@ from unittest.mock import patch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ParallelConfig, SchedulerConfig) + ModelConfig, ParallelConfig, SchedulerConfig, + VllmConfig) from vllm.lora.models import LoRAMapping from vllm.lora.request import LoRARequest from vllm.worker.worker import Worker @@ -12,7 +13,7 @@ @patch.dict(os.environ, {"RANK": "0"}) def test_worker_apply_lora(sql_lora_files): - worker = Worker( + vllm_config = VllmConfig( model_config=ModelConfig( "meta-llama/Llama-2-7b-hf", task="auto", @@ -34,10 +35,13 @@ def test_worker_apply_lora(sql_lora_files): gpu_memory_utilization=1., swap_space=0, cache_dtype="auto"), - local_rank=0, - rank=0, lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32, max_loras=32), + ) + worker = Worker( + vllm_config=vllm_config, + local_rank=0, + rank=0, distributed_init_method=f"file://{tempfile.mkstemp()[1]}", ) worker.init_device() diff --git a/vllm/tests/lora/utils.py b/vllm/tests/lora/utils.py index 00f8e26d1..e394c33b3 100644 --- a/vllm/tests/lora/utils.py +++ b/vllm/tests/lora/utils.py @@ -7,9 +7,10 @@ class DummyLoRAManager: - def __init__(self): + def __init__(self, device: torch.device = "cuda:0"): super().__init__() self._loras: Dict[str, LoRALayerWeights] = {} + self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=self._device), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=self._device), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=self._device) self.set_module_lora(module_name, lora) return lora diff --git a/vllm/tests/metrics/test_metrics.py b/vllm/tests/metrics/test_metrics.py index 7a361ef32..4a824c7ac 100644 --- a/vllm/tests/metrics/test_metrics.py +++ b/vllm/tests/metrics/test_metrics.py @@ -365,6 +365,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool, "vllm:request_prompt_tokens", "vllm:request_generation_tokens", "vllm:request_params_n", + "vllm:request_params_max_tokens", ] for metric_name in request_histogram_metrics: metric_value = REGISTRY.get_sample_value(f"{metric_name}_count", diff --git a/vllm/tests/model_executor/test_enabled_custom_ops.py b/vllm/tests/model_executor/test_enabled_custom_ops.py index af267f804..0a3aba255 100644 --- a/vllm/tests/model_executor/test_enabled_custom_ops.py +++ b/vllm/tests/model_executor/test_enabled_custom_ops.py @@ -1,8 +1,8 @@ -import os from typing import List import pytest +from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, @@ -51,42 +51,39 @@ class Relu3(ReLUSquaredActivation): ]) def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], default_on: bool): - os.environ["VLLM_CUSTOM_OPS"] = env - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level) + vllm_config = VllmConfig(compilation_config=CompilationConfig( + level=torch_level, custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + assert CustomOp.default_on() == default_on - # Reset default_on (computed once): - CustomOp.default_on.cache_clear() + ops_enabled = [bool(x) for x in ops_enabled] - assert CustomOp.default_on() == default_on + assert RMSNorm(1024).enabled() == ops_enabled[0] + assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] - ops_enabled = [bool(x) for x in ops_enabled] + assert SiluAndMul().enabled() == ops_enabled[1] + assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] - assert RMSNorm(1024).enabled() == ops_enabled[0] - assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] + assert GeluAndMul().enabled() == ops_enabled[2] + assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] - assert SiluAndMul().enabled() == ops_enabled[1] - assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] + # If registered, subclasses should follow their own name + assert Relu3().enabled() == ops_enabled[3] + assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - assert GeluAndMul().enabled() == ops_enabled[2] - assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] + # Unregistered subclass + class SiluAndMul2(SiluAndMul): + pass - # If registered, subclasses should follow their own name - assert Relu3().enabled() == ops_enabled[3] - assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - - # Unregistered subclass - class SiluAndMul2(SiluAndMul): - pass - - # Subclasses should not require registration - assert SiluAndMul2().enabled() == SiluAndMul().enabled() + # Subclasses should not require registration + assert SiluAndMul2().enabled() == SiluAndMul().enabled() @pytest.mark.parametrize( "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]) def test_enabled_ops_invalid(env: str): - os.environ["VLLM_CUSTOM_OPS"] = env - CustomOp.default_on.cache_clear() - - with pytest.raises(AssertionError): - RMSNorm(1024).enabled() + with pytest.raises(Exception): # noqa + vllm_config = VllmConfig(compilation_config=CompilationConfig( + custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + RMSNorm(1024).enabled() diff --git a/vllm/tests/model_executor/test_guided_processors.py b/vllm/tests/model_executor/test_guided_processors.py index 45fab8e96..9f4d81b58 100644 --- a/vllm/tests/model_executor/test_guided_processors.py +++ b/vllm/tests/model_executor/test_guided_processors.py @@ -36,7 +36,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema): @pytest.mark.asyncio -@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("backend", + ["outlines", "lm-format-enforcer", "xgrammar"]) async def test_guided_logits_processor_black_box(backend: str, sample_regex, sample_json_schema): tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta') diff --git a/vllm/tests/model_executor/test_model_load_with_params.py b/vllm/tests/model_executor/test_model_load_with_params.py new file mode 100644 index 000000000..ed321ba9f --- /dev/null +++ b/vllm/tests/model_executor/test_model_load_with_params.py @@ -0,0 +1,94 @@ +import os + +import pytest + +from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.model_executor.models.roberta import RobertaEmbeddingModel +from vllm.platforms import current_platform + +MAX_MODEL_LEN = 128 +MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") +REVISION = os.environ.get("REVISION", "main") + +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", + "intfloat/multilingual-e5-large") +REVISION_ROBERTA = os.environ.get("REVISION", "main") + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. + """ + with vllm_runner(model_name=MODEL_NAME, + revision=REVISION, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.CLS.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer.model_max_length == 512 + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + # assert output + assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_roberta_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. + """ + with vllm_runner(model_name=MODEL_NAME_ROBERTA, + revision=REVISION_ROBERTA, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert not model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" + assert not model_tokenizer.tokenizer_config["do_lower_case"] + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + # assert output + assert output diff --git a/vllm/tests/models/decoder_only/audio_language/test_ultravox.py b/vllm/tests/models/decoder_only/audio_language/test_ultravox.py index bfffd34d1..e100c6b9b 100644 --- a/vllm/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/vllm/tests/models/decoder_only/audio_language/test_ultravox.py @@ -2,12 +2,14 @@ import numpy as np import pytest +import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ....conftest import HfRunner, VllmRunner +from ....utils import RemoteOpenAIServer from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_3" @@ -17,6 +19,13 @@ VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" HF_PLACEHOLDER = "<|audio|>" +CHUNKED_PREFILL_KWARGS = { + "enable_chunked_prefill": True, + "max_num_seqs": 2, + # Use a very small limit to exercise chunked prefill. + "max_num_batched_tokens": 16 +} + @pytest.fixture(scope="session") def audio_assets(): @@ -30,6 +39,29 @@ def audio(request): return AudioAsset(request.param) +@pytest.fixture(params=[ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) +def server(request, audio_assets): + args = [ + "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", + f"--limit-mm-per-prompt=audio={len(audio_assets)}" + ] + [ + f"--{key.replace('_','-')}={value}" + for key, value in request.param.items() + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + def _get_prompt(audio_count, question, placeholder): tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) placeholder = f"{placeholder}\n" * audio_count @@ -68,8 +100,7 @@ def run_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): """Inference result should be the same between hf and vllm.""" torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] @@ -79,11 +110,8 @@ def run_test( # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: + with vllm_runner(model, dtype=dtype, enforce_eager=True, + **kwargs) as vllm_model: vllm_outputs_per_audio = [ vllm_model.generate_greedy_logprobs([vllm_prompt], max_tokens, @@ -92,7 +120,7 @@ def run_test( for vllm_prompt, _, audio in prompts_and_audios ] - def process(hf_inputs: BatchEncoding): + def process(hf_inputs: BatchEncoding, **kwargs): hf_inputs["audio_values"] = hf_inputs["audio_values"] \ .to(torch_dtype) # type: ignore return hf_inputs @@ -135,18 +163,16 @@ def run_multi_audio_test( dtype: str, max_tokens: int, num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, + **kwargs, ): with vllm_runner(model, dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, enforce_eager=True, limit_mm_per_prompt={ "audio": max((len(audio) for _, audio in prompts_and_audios)) - }) as vllm_model: + }, + **kwargs) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( [prompt for prompt, _ in prompts_and_audios], max_tokens, @@ -158,11 +184,16 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, - num_logprobs: int) -> None: + num_logprobs: int, vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) @@ -174,16 +205,21 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) +@pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: + max_tokens: int, num_logprobs: int, + vllm_kwargs: dict) -> None: vllm_prompt = _get_prompt(len(audio_assets), "Describe each of the audios above.", @@ -196,5 +232,37 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, - tensor_parallel_size=1, + **vllm_kwargs, ) + + +@pytest.mark.asyncio +async def test_online_inference(client, audio_assets): + """Exercises online inference with/without chunked prefill enabled.""" + + messages = [{ + "role": + "user", + "content": [ + *[{ + "type": "audio_url", + "audio_url": { + "url": audio.url + } + } for audio in audio_assets], + { + "type": + "text", + "text": + f"What's happening in these {len(audio_assets)} audio clips?" + }, + ], + }] + + chat_completion = await client.chat.completions.create(model=MODEL_NAME, + messages=messages, + max_tokens=10) + + assert len(chat_completion.choices) == 1 + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" diff --git a/vllm/tests/models/decoder_only/language/test_aqlm.py b/vllm/tests/models/decoder_only/language/test_aqlm.py index de4603211..a8cb5bbf9 100644 --- a/vllm/tests/models/decoder_only/language/test_aqlm.py +++ b/vllm/tests/models/decoder_only/language/test_aqlm.py @@ -38,6 +38,7 @@ ] +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("aqlm"), reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) diff --git a/vllm/tests/models/decoder_only/language/test_big_models.py b/vllm/tests/models/decoder_only/language/test_big_models.py deleted file mode 100644 index fcfc159e4..000000000 --- a/vllm/tests/models/decoder_only/language/test_big_models.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Compare the outputs of HF and vLLM when using greedy sampling. - -This tests bigger models and use half precision. - -Run `pytest tests/models/test_big_models.py`. -""" -import pytest - -from vllm.platforms import current_platform - -from ...utils import check_logprobs_close, check_outputs_equal - -MODELS = [ - "meta-llama/Llama-2-7b-hf", - # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py - # "Deci/DeciLM-7b", # Broken - # "tiiuae/falcon-7b", # Broken - "EleutherAI/gpt-j-6b", - # "mosaicml/mpt-7b", # Broken - # "Qwen/Qwen1.5-0.5B" # Broken, -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - # Head size isn't supported on CPU - "h2oai/h2o-danube3-4b-base", - ] - -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [32]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - - if model == "openbmb/MiniCPM3-4B": - # the output becomes slightly different when upgrading to - # pytorch 2.5 . Changing to logprobs checks instead of exact - # output checks. - NUM_LOG_PROBS = 8 - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, NUM_LOG_PROBS) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, NUM_LOG_PROBS) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - else: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, - max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/vllm/tests/models/decoder_only/language/test_fp8.py b/vllm/tests/models/decoder_only/language/test_fp8.py index 5a947ce62..53f23e245 100644 --- a/vllm/tests/models/decoder_only/language/test_fp8.py +++ b/vllm/tests/models/decoder_only/language/test_fp8.py @@ -15,17 +15,18 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize( "kv_cache_dtype,base_model,test_model,scale_path", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. - ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. - ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", - "meta-llama/Meta-Llama-3-8B-Instruct", None), + ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", + "meta-llama/Llama-3.2-1B-Instruct", None), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-7b-chat-hf", @@ -33,7 +34,7 @@ ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) -@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # NOTE: Increasing this in this suite will fail CI because we currently cannot # reset distributed env properly. Use a value > 1 just when you test. diff --git a/vllm/tests/models/decoder_only/language/test_gguf.py b/vllm/tests/models/decoder_only/language/test_gguf.py index 5dc839426..2b8f5e2fa 100644 --- a/vllm/tests/models/decoder_only/language/test_gguf.py +++ b/vllm/tests/models/decoder_only/language/test_gguf.py @@ -17,26 +17,21 @@ MAX_MODEL_LEN = 1024 -# FIXME: Move this to confest -MODELS = [ - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")), - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", - filename="qwen2-1_5b-instruct-q4_k_m.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), -] - @pytest.mark.skipif(not is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ + ("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), + ("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", + "qwen2-1_5b-instruct-q4_k_m.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", + "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), +]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -45,7 +40,9 @@ def test_models( num_gpus_available, vllm_runner, example_prompts, - model, + original_model, + gguf_id, + gguf_path, dtype: str, max_tokens: int, num_logprobs: int, @@ -54,7 +51,7 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - original_model, gguf_model = model + gguf_model = hf_hub_download(gguf_id, filename=gguf_path) tokenizer = AutoTokenizer.from_pretrained(original_model) messages = [[{ diff --git a/vllm/tests/models/decoder_only/language/test_gptq_marlin.py b/vllm/tests/models/decoder_only/language/test_gptq_marlin.py index 2155e83db..037411a18 100644 --- a/vllm/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/vllm/tests/models/decoder_only/language/test_gptq_marlin.py @@ -22,30 +22,18 @@ MAX_MODEL_LEN = 1024 MODELS = [ - # act_order==False, group_size=channelwise - ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), - # act_order==False, group_size=128 - ("TheBloke/Llama-2-7B-GPTQ", "main"), - # act_order==True, group_size=128 ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), - # act_order==True, group_size=64 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), - # act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), # 8-bit, act_order==True, group_size=channelwise ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), - # 8-bit, act_order==True, group_size=128 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), - # 8-bit, act_order==True, group_size=32 - ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), # 4-bit, act_order==True, group_size=128 ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") ] +@pytest.mark.quant_model @pytest.mark.flaky(reruns=3) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") diff --git a/vllm/tests/models/decoder_only/language/test_gptq_marlin_24.py b/vllm/tests/models/decoder_only/language/test_gptq_marlin_24.py index d65be05f1..26cb3ec31 100644 --- a/vllm/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/vllm/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -25,19 +25,20 @@ class ModelPair: # 4-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), - # 4-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), + # # 4-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), # 8-bit, group_size == 128 ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), - # 8-bit, group_size == channelwise - ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", - model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), + # # 8-bit, group_size == channelwise + # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", + # model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), ] +@pytest.mark.quant_model @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), reason="Marlin24 is not supported on this GPU type.") diff --git a/vllm/tests/models/decoder_only/language/test_granite.py b/vllm/tests/models/decoder_only/language/test_granite.py index 0b71f0d49..5e93842f4 100644 --- a/vllm/tests/models/decoder_only/language/test_granite.py +++ b/vllm/tests/models/decoder_only/language/test_granite.py @@ -7,7 +7,9 @@ from ...utils import check_logprobs_close MODELS = [ + # TODO(sang): Sliding window should be tested separately. "ibm/PowerLM-3b", + "ibm/PowerMoE-3b", ] @@ -24,7 +26,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - # TODO(sang): Sliding window should be tested separately. with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/tests/models/decoder_only/language/test_granitemoe.py b/vllm/tests/models/decoder_only/language/test_granitemoe.py deleted file mode 100644 index ba7337522..000000000 --- a/vllm/tests/models/decoder_only/language/test_granitemoe.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. - -Run `pytest tests/models/test_granite.py`. -""" -import pytest - -from ...utils import check_logprobs_close - -MODELS = [ - "ibm/PowerMoE-3b", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/vllm/tests/models/decoder_only/language/test_jamba.py b/vllm/tests/models/decoder_only/language/test_jamba.py index 384ec77e5..cae25ae9f 100644 --- a/vllm/tests/models/decoder_only/language/test_jamba.py +++ b/vllm/tests/models/decoder_only/language/test_jamba.py @@ -1,8 +1,8 @@ import pytest from tests.utils import multi_gpu_test +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -33,6 +33,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -185,7 +189,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: @@ -271,6 +276,44 @@ def test_state_cleanup( "could be related to finished_requests_ids") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_multistep( + vllm_runner, + model: str, + dtype: str, + example_prompts, +) -> None: + # This test is verifying that multistep works correctly + #on mamba-like models + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) + + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) @@ -293,17 +336,3 @@ def test_jamba_distributed_produces_identical_generation( name_0="vllm_tp_1", name_1="vllm_tp_2", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/vllm/tests/models/decoder_only/language/test_mamba.py b/vllm/tests/models/decoder_only/language/test_mamba.py index 2dc231c59..35018c3c1 100644 --- a/vllm/tests/models/decoder_only/language/test_mamba.py +++ b/vllm/tests/models/decoder_only/language/test_mamba.py @@ -5,8 +5,8 @@ import pytest from transformers import AutoModelForCausalLM, AutoTokenizer +from vllm.config import VllmConfig from vllm.sampling_params import SamplingParams -from vllm.worker.model_runner import _get_graph_batch_size from ...utils import check_outputs_equal @@ -51,6 +51,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -196,7 +200,8 @@ def test_mamba_cache_cg_padding( # This test is for verifying that mamba cache is padded to CG captured # batch size. If it's not, a torch RuntimeError will be raised because # tensor dimensions aren't compatible - while len(example_prompts) == _get_graph_batch_size(len(example_prompts)): + while len(example_prompts) == VllmConfig.get_graph_batch_size( + len(example_prompts)): example_prompts.append(example_prompts[0]) try: @@ -283,13 +288,35 @@ def test_state_cleanup( @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( +def test_multistep( vllm_runner, model: str, dtype: str, + example_prompts, ) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_model.generate_greedy([example_prompts[0]] * 10, 1) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +@pytest.mark.parametrize("max_tokens", [64]) +def test_multistep_correctness(vllm_runner, model: str, dtype: str, + max_tokens: int, example_prompts) -> None: + with vllm_runner(model, num_scheduler_steps=8, + max_num_seqs=2) as vllm_model: + vllm_outputs_multistep = vllm_model.generate_greedy( + example_prompts, max_tokens) + + with vllm_runner(model, num_scheduler_steps=1, + max_num_seqs=2) as vllm_model: + vllm_outputs_single_step = vllm_model.generate_greedy( + example_prompts, max_tokens) + + check_outputs_equal( + outputs_0_lst=vllm_outputs_multistep, + outputs_1_lst=vllm_outputs_single_step, + name_0="vllm_outputs_multistep", + name_1="vllm_outputs_single_step", + ) diff --git a/vllm/tests/models/decoder_only/language/test_marlin.py b/vllm/tests/models/decoder_only/language/test_marlin.py deleted file mode 100644 index c802346de..000000000 --- a/vllm/tests/models/decoder_only/language/test_marlin.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Compare the outputs of a GPTQ model to a Marlin model. - -Note: GPTQ and Marlin do not have bitwise correctness. -As a result, in this test, we just confirm that the top selected tokens of the -Marlin/GPTQ models are in the top 3 selections of each other. - -Note: Marlin internally uses locks to synchronize the threads. This can -result in very slight nondeterminism for Marlin. As a result, we re-run the test -up to 3 times to see if we pass. - -Run `pytest tests/models/test_marlin.py`. -""" -from dataclasses import dataclass - -import pytest - -from tests.quantization.utils import is_quant_method_supported - -from ...utils import check_logprobs_close - - -@dataclass -class ModelPair: - model_marlin: str - model_gptq: str - - -model_pairs = [ - ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", - model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), - ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", - model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), - ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", - model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") -] - - -@pytest.mark.flaky(reruns=2) -@pytest.mark.skipif(not is_quant_method_supported("marlin"), - reason="Marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model_pair", model_pairs) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - vllm_runner, - example_prompts, - model_pair: ModelPair, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with vllm_runner(model_pair.model_marlin, - dtype=dtype, - quantization="marlin") as marlin_model: - marlin_outputs = marlin_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model_pair.model_gptq, dtype=dtype, - quantization="gptq") as gptq_model: - gptq_outputs = gptq_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - check_logprobs_close( - outputs_0_lst=gptq_outputs, - outputs_1_lst=marlin_outputs, - name_0="gptq", - name_1="marlin", - ) diff --git a/vllm/tests/models/decoder_only/language/test_mistral.py b/vllm/tests/models/decoder_only/language/test_mistral.py index 174b905d9..99b5d5694 100644 --- a/vllm/tests/models/decoder_only/language/test_mistral.py +++ b/vllm/tests/models/decoder_only/language/test_mistral.py @@ -2,15 +2,24 @@ Run `pytest tests/models/test_mistral.py`. """ +import copy + import pytest -from vllm import LLM, SamplingParams +from vllm import SamplingParams +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa + MistralToolParser) from ...utils import check_logprobs_close MODELS = [ "mistralai/Mistral-7B-Instruct-v0.1", +] + +MISTRAL_FORMAT_MODELS = [ "mistralai/Mistral-7B-Instruct-v0.3", + # uses the v3-Tekken tokenizer + "mistralai/Ministral-8B-Instruct-2410", # Mistral-Nemo is to big for CI, but passes locally # "mistralai/Mistral-Nemo-Instruct-2407" ] @@ -19,6 +28,8 @@ SYMBOLIC_LANG_PROMPTS = [ "勇敢な船乗りについての詩を書く", # japanese "寫一首關於勇敢的水手的詩", # chinese + "ပုံပြင်လေးပြောပြပါ်:\n", # burmese + "Repeat the phrase 'URGENCY🌶️':\nURGENCY🌶️\nURGENCY🌶️\n", # see https://github.com/vllm-project/vllm/pull/9625 ] # for function calling @@ -51,17 +62,69 @@ }, "required": ["city", "state", "unit"] } + }, +}, { + "type": "function", + "function": { + "name": "rewrite", + "description": "Rewrites text", + "parameters": { + "type": "object", + "required": [], + "properties": { + "text": { + "type": "string", + "description": "The input text to rewrite." + } + } + } } }] -MSGS = [{ - "role": - "user", - "content": ("Can you tell me what the temperate" - " will be in Dallas, in fahrenheit?") -}] -EXPECTED_FUNC_CALL = ( - '[{"name": "get_current_weather", "arguments": ' - '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]') +MSGS = [ + { + "role": "system", + "content": "You are an assistant." + }, + { + "role": + "user", + "content": + "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." # noqa + }, + { + "role": + "assistant", + "content": + "", + "tool_calls": [{ + "id": "bbc5b7ede", + "type": "function", + "function": { + "name": + "rewrite", + "arguments": + '{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa + } + }] + }, + { + "role": "tool", + "content": + "{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa + "tool_call_id": "bbc5b7ede", + "name": "rewrite" + }, + { + "role": "assistant", + "content": "---\n\nMy English needs improving, maybe I make errors" + }, + { + "role": + "user", + "content": ("Can you tell me what the temperate" + " will be in Dallas, in fahrenheit?") + } +] @pytest.mark.parametrize("model", MODELS) @@ -95,7 +158,7 @@ def test_models( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -135,28 +198,29 @@ def test_mistral_format( ) -@pytest.mark.parametrize("model", MODELS[1:]) +@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS) def test_mistral_symbolic_languages( + vllm_runner, model: str, dtype: str, - prompt: str, ) -> None: - prompt = "hi" - msg = {"role": "user", "content": prompt} - llm = LLM(model=model, - dtype=dtype, - max_model_len=8192, - tokenizer_mode="mistral", - config_format="mistral", - load_format="mistral") - outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS) - assert "�" not in outputs[0].outputs[0].text.strip() + with vllm_runner(model, + dtype=dtype, + max_model_len=8192, + tokenizer_mode="mistral", + config_format="mistral", + load_format="mistral") as vllm_model: + for prompt in SYMBOLIC_LANG_PROMPTS: + msg = {"role": "user", "content": prompt} + outputs = vllm_model.model.chat([msg], + sampling_params=SAMPLING_PARAMS) + assert "�" not in outputs[0].outputs[0].text.strip() @pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling +@pytest.mark.parametrize("model", + MISTRAL_FORMAT_MODELS) # v1 can't do func calling def test_mistral_function_calling( vllm_runner, model: str, @@ -167,8 +231,23 @@ def test_mistral_function_calling( tokenizer_mode="mistral", config_format="mistral", load_format="mistral") as vllm_model: - outputs = vllm_model.model.chat(MSGS, + + msgs = copy.deepcopy(MSGS) + outputs = vllm_model.model.chat(msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS) - assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL + tokenizer = vllm_model.model.get_tokenizer() + tool_parser = MistralToolParser(tokenizer) + + model_output = outputs[0].outputs[0].text.strip() + assert model_output.startswith(tool_parser.bot_token), model_output + parsed_message = tool_parser.extract_tool_calls(model_output, None) + + assert parsed_message.tools_called + assert parsed_message.tool_calls[0].id == "0UAqFzWsD" + assert parsed_message.tool_calls[ + 0].function.name == "get_current_weather" + assert parsed_message.tool_calls[ + 0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa + assert parsed_message.content is None diff --git a/vllm/tests/models/decoder_only/language/test_modelopt.py b/vllm/tests/models/decoder_only/language/test_modelopt.py index e643b115d..077e50e3a 100644 --- a/vllm/tests/models/decoder_only/language/test_modelopt.py +++ b/vllm/tests/models/decoder_only/language/test_modelopt.py @@ -39,6 +39,7 @@ @pytest.mark.skip( reason= "Prevent unstable test based on golden strings from breaking the build.") +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) diff --git a/vllm/tests/models/decoder_only/language/test_models.py b/vllm/tests/models/decoder_only/language/test_models.py index 68055cbe2..2a7ed8826 100644 --- a/vllm/tests/models/decoder_only/language/test_models.py +++ b/vllm/tests/models/decoder_only/language/test_models.py @@ -1,31 +1,57 @@ """Compare the outputs of HF and vLLM when using greedy sampling. -This test only tests small models. Big models such as 7B should be tested from -test_big_models.py because it could use a larger instance to run tests. - Run `pytest tests/models/test_models.py`. """ import pytest -from ...utils import check_outputs_equal - -MODELS = [ - "facebook/opt-125m", - "gpt2", - "bigcode/tiny_starcoder_py", - "EleutherAI/pythia-70m", - "bigscience/bloom-560m", # Testing alibi slopes. - "microsoft/phi-2", - "stabilityai/stablelm-3b-4e1t", - # "allenai/OLMo-1B", # Broken - "bigcode/starcoder2-3b", - "google/gemma-1.1-2b-it", -] +from ...utils import check_logprobs_close -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -@pytest.mark.parametrize("max_tokens", [96]) +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "bigscience/bloom-560m", # bloom - testing alibi slopes + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openai-community/gpt2", # gpt2 + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param("Milos/slovak-gpt-j-405M"), # gptj + pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode + pytest.param("EleutherAI/pythia-70m"), # gpt_neox + pytest.param( + "google/gemma-1.1-2b-it", # gemma + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", # llama + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openbmb/MiniCPM3-4B", + # fused_moe not supported on CPU + marks=[pytest.mark.core_model], + ), + pytest.param( + "facebook/opt-125m", # opt + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "microsoft/phi-2", # phi + marks=[pytest.mark.core_model], + ), + pytest.param( + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + marks=[pytest.mark.core_model], + ), + pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm + pytest.param("bigcode/starcoder2-3b"), # starcoder2 + ]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) def test_models( hf_runner, vllm_runner, @@ -33,33 +59,24 @@ def test_models( model: str, dtype: str, max_tokens: int, + num_logprobs: int, ) -> None: - # To pass the small model tests, we need full precision. - assert dtype == "float" with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) - check_outputs_equal( + check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, name_0="hf", name_1="vllm", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py new file mode 100644 index 000000000..31896bfd1 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py @@ -0,0 +1,187 @@ +"""Tests for Idefics3's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +import transformers +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["HuggingFaceM4/Idefics3-8B-Llama3"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_idefics3(): + from vllm.model_executor.models.idefics3 import ( + input_processor_for_idefics3) + return input_processor_for_idefics3 + + +@pytest.fixture() +def dummy_data_for_idefics3(): + from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3 + return dummy_data_for_idefics3 + + +@pytest.fixture() +def get_max_idefics3_image_tokens(): + from vllm.model_executor.models.idefics3 import ( + get_max_idefics3_image_tokens) + return get_max_idefics3_image_tokens + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + longest_edge: Optional[int]): + """Ensure that the [default] input mapper handles size properly.""" + + mm_processor_kwargs = { + "size": { + "longest_edge": longest_edge + } + } if longest_edge is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge, expected_max_tokens", [ + (None, 2873), + (168, 169), + (336, 169), + (400, 338), + (672, 338), +]) +def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, + longest_edge: Optional[int], + expected_max_tokens: int): + """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs.""" + size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + actual_max_tokens = get_max_idefics3_image_tokens( + ctx=InputContext(ctx.model_config), + size=size, + ) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ + (168, 169, 1), + (168, 169, 2), + (400, 338, 1), + (400, 338, 2), +]) +def test_dummy_data_override(dummy_data_for_idefics3, model: str, + longest_edge: int, toks_per_img: int, + num_imgs: int): + """Ensure dummy_data_for_idefics3 handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + dummy_data = dummy_data_for_idefics3( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + size=size) + sequence_data = dummy_data.seq_data + # Ensure we have the right number of placeholders per size + image_token_id = ctx.get_hf_config().image_token_id + img_tok_count = sequence_data.get_token_ids().count(image_token_id) + assert img_tok_count == toks_per_img * num_imgs + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ + (336, 169 * (1**2 + 1), 1), + (336, 169 * (1**2 + 1), 2), + (400, 169 * (2**2 + 1), 1), + (400, 169 * (2**2 + 1), 2), +]) +def test_input_processor_override(input_processor_for_idefics3, + image_assets: _ImageAssets, model: str, + longest_edge: int, + expected_toks_per_img: int, num_imgs: int): + """Ensure input_processor_for_idefics3 handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. + size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + # Build the image str / prompt based on the number of images we pass + tokenizer = AutoTokenizer.from_pretrained(model) + placeholders = "" if num_imgs == 1 else "\n".join( + f"Image-{i}: \n" for i in range(1, num_imgs + 1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n\nAssistant:" # noqa: E501 + images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = ctx.get_hf_config().image_token_id + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py new file mode 100644 index 000000000..af0c2aa21 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py @@ -0,0 +1,206 @@ +"""Tests for InternVL's multimodal preprocessing kwargs.""" +from typing import Callable, Optional + +import pytest +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["OpenGVLab/InternVL2-2B"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_internvl(): + from vllm.model_executor.models.internvl import InternVLInputPipeline + + pipeline = InternVLInputPipeline('', '', '') + return pipeline.input_processor + + +@pytest.fixture() +def dummy_data_for_internvl(): + from vllm.model_executor.models.internvl import InternVLInputPipeline + + pipeline = InternVLInputPipeline('', '', '') + return pipeline.dummy_data + + +@pytest.fixture() +def get_max_internvl_image_tokens(): + from vllm.model_executor.models.internvl import ( + get_max_internvl_image_tokens) + return get_max_internvl_image_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_input_mapper_override( + model: str, + image_assets: _ImageAssets, + max_dynamic_patch: int, + dynamic_image_size: Optional[bool], +): + mm_processor_kwargs = { + "max_dynamic_patch": max_dynamic_patch, + } + if dynamic_image_size is not None: + mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size + + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image.resize((448 * 2, 448 * 2)) + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + assert vllm_result["pixel_values"].size(1) == expected_num_patches + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_max_tokens_override( + get_max_internvl_image_tokens: Callable, + model: str, + max_dynamic_patch: Optional[int], + dynamic_image_size: Optional[bool], +): + """Ensure get_max_internvl_image_tokens handles mm_processor_kwargs.""" + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + if max_dynamic_patch is None: + max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + expected_max_tokens = 256 * expected_num_patches + + actual_max_tokens = get_max_internvl_image_tokens( + ctx=InputContext(ctx.model_config), + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_imgs", [1, 2]) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +def test_dummy_data_override( + dummy_data_for_internvl: Callable, + model: str, + num_imgs: int, + max_dynamic_patch: Optional[int], + dynamic_image_size: Optional[bool], +): + """Ensure dummy_data_for_internvl handles kwargs properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + if max_dynamic_patch is None: + max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + expected_max_tokens = 256 * expected_num_patches + + dummy_data = dummy_data_for_internvl( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + sequence_data = dummy_data.seq_data + + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + image_token_id = tokenizer.encode('', + add_special_tokens=False)[0] + + # Ensure we have the right number of placeholders per size + img_tok_count = sequence_data.get_token_ids().count(image_token_id) + assert img_tok_count == expected_max_tokens * num_imgs + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("max_dynamic_patch", [1, 4]) +@pytest.mark.parametrize("dynamic_image_size", [True, False, None]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_input_processor_override( + input_processor_for_internvl: Callable, + image_assets: _ImageAssets, + model: str, + num_imgs: int, + max_dynamic_patch: int, + dynamic_image_size: Optional[bool], +): + """Ensure input_processor_for_internvl handles kwargs properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. + expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1 + if dynamic_image_size is False: + expected_num_patches = 1 + + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + expected_toks_per_img = 256 * expected_num_patches + + # Build the image str / prompt based on the number of images we pass + tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + placeholders = "" if num_imgs == 1 else "\n".join( + f"Image-{i}: \n" for i in range(1, num_imgs + 1)) + prompt = placeholders + images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_internvl( + ctx, + inputs, + max_dynamic_patch=max_dynamic_patch, + dynamic_image_size=dynamic_image_size, + ) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = tokenizer.encode('', + add_special_tokens=False)[0] + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py new file mode 100644 index 000000000..51c008510 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py @@ -0,0 +1,70 @@ +import pytest + +from vllm.inputs import InputContext + +from ....utils import build_model_context + + +@pytest.fixture() +def get_max_llava_next_image_tokens(): + from vllm.model_executor.models.llava_next import ( + get_max_llava_next_image_tokens) + return get_max_llava_next_image_tokens + + +@pytest.fixture() +def dummy_data_for_llava_next(): + from vllm.model_executor.models.llava_next import dummy_data_for_llava_next + return dummy_data_for_llava_next + + +@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ + ([[336, 336]], 1176), + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), +]) +def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, + get_max_llava_next_image_tokens): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + # and calculate the resulting max tokens + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + + actual_max_tokens = get_max_llava_next_image_tokens( + InputContext(ctx.model_config)) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize( + "gridpoints,expected_size", + [ + # One point; it has to be the largest + ([[336, 336]], (336, 336)), + # Default for most llava next models; the 2x2 tile is the largest + ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], + (672, 672)), + # If two rectangular gridpoints are the same, the more vertical + # one has the higher feature count due to newline features + ([[336, 672], [672, 336]], (672, 336)) + ]) +def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, + gridpoints, expected_size): + ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") + + # Update the config image_grid_pinpoints + ctx.model_config.hf_config.image_grid_pinpoints = gridpoints + seq_len = 5000 # bigger than the max feature size for any image + + dummy_data = dummy_data_for_llava_next( + ctx, + seq_len=seq_len, + mm_counts={"image": 1}, + ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data + + # The dummy data dims should match the gridpoint with the biggest feat size + assert mm_data["image"].height == expected_size[0] + assert mm_data["image"].width == expected_size[1] + assert len(seq_data.get_token_ids()) >= seq_len diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py new file mode 100644 index 000000000..60a8f63eb --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -0,0 +1,182 @@ +"""Tests for phi3v's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["microsoft/Phi-3.5-vision-instruct"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_phi3v(): + from vllm.model_executor.models.phi3v import input_processor_for_phi3v + return input_processor_for_phi3v + + +@pytest.fixture() +def dummy_data_for_phi3v(): + from vllm.model_executor.models.phi3v import dummy_data_for_phi3v + return dummy_data_for_phi3v + + +@pytest.fixture() +def get_max_phi3v_image_tokens(): + from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens + return get_max_phi3v_image_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops", [4, 16, None]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + num_crops: Optional[int]): + """Ensure that the [default] input mapper handles num_crops properly.""" + # We pass the processor kwargs here since for this model, we fall back to + # the default mapper; this will fall back to the HF mapper and forward + # mm_processor_kwargs to it. + mm_processor_kwargs = { + "num_crops": num_crops + } if num_crops is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["image_sizes"] == vllm_result["image_sizes"]) + assert torch.all( + hf_result["num_img_tokens"] == vllm_result["num_img_tokens"]) + + # For pixel values, the second axis should be the num_crops + 1 + # for the rescaled original image. The default value in VLLM falls + # back to the HF config, which is why we compare to the processor num_crops + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + assert vllm_result["pixel_values"].shape[1] == hf_processor.num_crops + 1 + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_max_tokens", [ + (4, 781), + (16, 2653), +]) +def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, + num_crops: int, expected_max_tokens: int): + """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" + # NOTE: mm_processor_kwargs on the context in this test is unused, since + # this is testing the mapper directly. In practice, the processor kwargs + # are wrapped in a closure when calling the max tokens func. We explicitly + # do NOT use the mm_processor_kwargs in the model context here to ensure + # that the max image tokens implementation is referencing a mix of the + # kwargs to the function and the original mm_processor_kwargs in case + # values are somehow updated and end up in a bad state. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + actual_max_tokens = get_max_phi3v_image_tokens( + InputContext(ctx.model_config), + num_crops=num_crops, + ) + + assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,toks_per_img,num_imgs", [ + (4, 781, 1), + (4, 781, 2), + (16, 2653, 1), + (16, 2653, 2), +]) +def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, + toks_per_img: int, num_imgs: int): + """Ensure dummy_data_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + dummy_data = dummy_data_for_phi3v( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + num_crops=num_crops, + ) + sequence_data = dummy_data.seq_data + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) + assert img_tok_count == toks_per_img * num_imgs + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("num_crops,expected_toks_per_img,num_imgs", [ + (4, 757, 1), + (4, 757, 2), + (16, 1921, 1), + (16, 1921, 2), +]) +def test_input_processor_override(input_processor_for_phi3v, + image_assets: _ImageAssets, model: str, + num_crops: int, expected_toks_per_img: int, + num_imgs: int): + """Ensure input_processor_for_phi3v handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + ) + tokenizer = AutoTokenizer.from_pretrained(model) + # Build the image str / prompt based on the number of images we pass + img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) + prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" + images = [image_assets[0].pil_image] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_phi3v(ctx, + inputs, + num_crops=num_crops) + + # Ensure we have the right number of placeholders per num_crops size + img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py new file mode 100644 index 000000000..163220c91 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -0,0 +1,144 @@ +"""Tests for Qwen's multimodal preprocessing kwargs.""" +from typing import Dict, List, Union + +import pytest +import torch +from PIL.Image import Image + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalKwargs +from vllm.multimodal.utils import cached_get_tokenizer + +from .....conftest import IMAGE_ASSETS +from ....utils import build_model_context + +### Multimodal preprocessing tests +SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image +# These values are specific to Qwen-VL/Chat; we can get these from the model +# config also, but they are hardcoded here to keep the parameterize/fixtures +# easy to read. +IMG_START_ID = 151857 +IMG_END_ID = 151858 +IMG_PAD_ID = 151859 +TOKS_PER_IMG = 256 +VIS_ENC_DIM = 4096 +IMG_SIZE = 448 + + +@pytest.fixture() +def input_mapper_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_mapper_for_qwen + return input_mapper_for_qwen + + +@pytest.fixture() +def input_processor_for_qwen(): + # Lazy import to avoid initializing CUDA during test collection + from vllm.model_executor.models.qwen import input_processor_for_qwen + return input_processor_for_qwen + + +@pytest.fixture() +def qwen_vl_context() -> InputContext: + """Get an InputContext for Qwen-VL.""" + return build_model_context(model_name="Qwen/Qwen-VL", + trust_remote_code=True) + + +# Happy path tests for single/multi-image scenarios for the multimodal +# input processor and mapper, respectively +@pytest.mark.parametrize("num_images", [1, 2]) +def test_input_processor_valid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + num_images: int): + """Happy cases for image inputs to Qwen's multimodal input processor.""" + prompt = "".join( + [f"Picture {num}: \n" for num in range(1, num_images + 1)]) + inputs = token_inputs( + prompt=prompt, + # When processing multimodal data for a multimodal model, the qwen + # input processor will overwrite the provided prompt_token_ids with + # the image prompts + prompt_token_ids=[], + multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, + ) + proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) + assert isinstance(proc_inputs, dict) + + # Each image should have one start / stop and a fixed context of 256 + proc_tokens = proc_inputs["prompt_token_ids"] + assert proc_tokens.count(IMG_START_ID) == num_images + assert proc_tokens.count(IMG_END_ID) == num_images + assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG + + +@pytest.mark.parametrize( + "img_data,expected_shape", + [ + # single / multi-image + (SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)), + (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), + # single / multi-image embeddings + (torch.rand( + (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), + (torch.rand( + (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), + ]) +def test_input_mapper_valid_mm_data(input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], + Image], + expected_shape: List[int]): + """Happy cases for image inputs to Qwen's multimodal input mapper.""" + mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) + # Ensure that we get the appropriately shaped pixel_values + # for images and image embeddings, respectively. + assert isinstance(mapped_img_data, MultiModalKwargs) + assert "pixel_values" in mapped_img_data + assert mapped_img_data["pixel_values"].shape == expected_shape + + +# Sad path tests for the multimodal input processor and mapper, respectively +@pytest.mark.parametrize("mm_data", [ + { + "image": torch.rand(5) + }, + { + "image": torch.rand((5, 5, 5, 5, 5)) + }, +]) +def test_input_processor_invalid_mm_data(input_processor_for_qwen, + qwen_vl_context: InputContext, + mm_data: Dict[str, torch.Tensor]): + """Test sad cases validated in Qwen's multimodal input processor.""" + tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, + trust_remote_code=True) + prompt = "Picture 1: \n" + prompt_token_ids = tokenizer.encode(prompt) + inputs = token_inputs(prompt=prompt, + prompt_token_ids=prompt_token_ids, + multi_modal_data=mm_data) + # Should fail since we have too many or too few dimensions for embeddings + with pytest.raises(ValueError): + input_processor_for_qwen(qwen_vl_context, inputs) + + +@pytest.mark.parametrize( + "img_data", + [ + # Wrong context length + torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), + # Wrong visual encoder output size + torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), + ]) +def test_input_mapper_invalid_mm_data( + input_mapper_for_qwen, + qwen_vl_context: InputContext, + img_data: Union[torch.Tensor, List[Image], Image], +): + """Sad cases validated in Qwen VL's multimodal input mapper.""" + with pytest.raises(ValueError): + input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py new file mode 100644 index 000000000..7e2bea130 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -0,0 +1,167 @@ +from typing import Any, Dict, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +MODEL = "Qwen/Qwen2-VL-2B-Instruct" +MIN_PIXELS = "min_pixels" +MAX_PIXELS = "max_pixels" + + +# Fixtures lazy import to avoid initializing CUDA during test collection +# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple +# input mappers. +@pytest.fixture() +def image_input_mapper_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + image_input_mapper_for_qwen2_vl) + return image_input_mapper_for_qwen2_vl + + +@pytest.fixture() +def input_processor_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + input_processor_for_qwen2_vl) + return input_processor_for_qwen2_vl + + +@pytest.fixture() +def qwen2_vl_context() -> InputContext: + return build_model_context(model_name=MODEL) + + +@pytest.fixture() +def get_max_qwen2_vl_image_tokens(): + from vllm.model_executor.models.qwen2_vl import ( + get_max_qwen2_vl_image_tokens) + return get_max_qwen2_vl_image_tokens + + +@pytest.fixture() +def dummy_data_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl + return dummy_data_for_qwen2_vl + + +@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ + ({}, 1225), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324), +]) +def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + expected_max_tokens: int): + """Ensure that the max token calc handles min/max pixels properly.""" + actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, + **mm_processor_kwargs) + assert actual_max_tokens == expected_max_tokens + + +@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [ + [{}, 1225, (980, 980)], + [{ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324, (504, 504)], +]) +def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + token_count: int, img_size: Tuple[int, int]): + """Ensure that the dummy data handles min/max pixels properly.""" + seq_len = 3000 + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + # NOTE: video value is required, but isn't actually used + # when making the dummy data except for error handling currently + dummy_data = dummy_data_for_qwen2_vl( + ctx=qwen2_vl_context, + seq_len=seq_len, + mm_counts={ + "image": 1, + "video": 0 + }, + **mm_processor_kwargs, + ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data + + # Ensure we have the right number of placeholders for min/max pixel values + assert seq_data.get_token_ids().count(image_token_id) == token_count + + # Ensure the images were resized correctly + image = mm_data["image"] + assert isinstance(image, Image) + assert image.size == img_size + + +@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ + ({}, 1426), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 330), +]) +def test_input_processor(input_processor_for_qwen2_vl, + qwen2_vl_context: InputContext, + image_assets: _ImageAssets, num_placeholders: int, + mm_processor_kwargs: Dict[str, Any]): + """Ensure that the image processor handles min/max pixels properly.""" + tokenizer = AutoTokenizer.from_pretrained(MODEL) + prompt = "<|vision_start|><|image_pad|><|vision_end|>" + + image = image_assets[0].pil_image + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": [image]}) + + processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs, + **mm_processor_kwargs) + assert processed_inputs["prompt_token_ids"].count( + image_token_id) == num_placeholders + assert len(processed_inputs["multi_modal_data"]["image"]) == 1 + + +@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [ + ({}, [5704, 1176]), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, [1320, 1176]), +]) +def test_image_mapper_override(qwen2_vl_context: InputContext, + image_assets: _ImageAssets, + mm_processor_kwargs: Dict[str, Any], + pixels_shape: Tuple[int, int]): + """Ensure that the image mapper handles min/max pixels properly.""" + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config) + + image = image_assets[0].pil_image + + mapped_output = mm_registry.map_input( + qwen2_vl_context.model_config, + {"image": image}, + mm_processor_kwargs=mm_processor_kwargs, + ) + + # Dimension 0 of pixel values should match the product of image_grid_thw + actual_pixels_shape = mapped_output["pixel_values"].shape + assert list(actual_pixels_shape) == pixels_shape + assert actual_pixels_shape[0] == torch.prod( + mapped_output["image_grid_thw"]) diff --git a/vllm/tests/models/decoder_only/vision_language/test_awq.py b/vllm/tests/models/decoder_only/vision_language/test_awq.py new file mode 100644 index 000000000..6e6e5b40d --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/test_awq.py @@ -0,0 +1,120 @@ +from typing import List, Optional, Type + +import pytest +import torch + +from vllm.multimodal.utils import rescale_image_size + +from ....conftest import IMAGE_ASSETS, VllmRunner, _ImageAssets +from ...utils import check_logprobs_close + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "<|im_start|>User\n\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + "cherry_blossom": + "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 +}) + + +def run_awq_test( + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + source_model: str, + quant_model: str, + *, + size_factors: List[float], + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + with vllm_runner(source_model, + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + source_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + with vllm_runner(quant_model, + quantization="awq", + max_model_len=4096, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + quant_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for source_outputs, quant_outputs in zip(source_outputs_per_image, + quant_outputs_per_image): + # TODO: Check whether using original CLIPVisionModel can improve + # consistency against HF + check_logprobs_close( + outputs_0_lst=source_outputs, + outputs_1_lst=quant_outputs, + name_0="source", + name_1="awq", + ) + + +@pytest.mark.quant_model +@pytest.mark.parametrize( + ("source_model", "quant_model"), + [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")], +) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_awq_models(vllm_runner, image_assets, source_model, quant_model, + size_factors, dtype, max_tokens, num_logprobs) -> None: + run_awq_test( + vllm_runner, + image_assets, + source_model, + quant_model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_blip2.py b/vllm/tests/models/decoder_only/vision_language/test_blip2.py deleted file mode 100644 index e1e32b96d..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_blip2.py +++ /dev/null @@ -1,101 +0,0 @@ -from typing import List, Optional, Tuple - -import pytest -from transformers import AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "Question: What's the content of the image? Answer:", - "cherry_blossom": - "Question: What is the season? Answer:", -}) - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - _, output_str, out_logprobs = vllm_output - - hf_output_str = output_str + "\n" - - tokenizer = AutoTokenizer.from_pretrained(model) - hf_output_ids = tokenizer.encode(hf_output_str) - assert hf_output_ids[0] == tokenizer.bos_token_id - hf_output_ids = hf_output_ids[1:] - - return hf_output_ids, hf_output_str, out_logprobs - - -@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_broadcast.py b/vllm/tests/models/decoder_only/vision_language/test_broadcast.py deleted file mode 100644 index fd7af4a8b..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_broadcast.py +++ /dev/null @@ -1,46 +0,0 @@ -import pytest -import transformers - -from ....utils import multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) -@pytest.mark.parametrize("model", [ - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-mistral-7b-hf", - "facebook/chameleon-7b", -]) -def test_models(hf_runner, vllm_runner, image_assets, - distributed_executor_backend, model) -> None: - - dtype = "half" - max_tokens = 5 - num_logprobs = 5 - tensor_parallel_size = 2 - - if model.startswith("llava-hf/llava-1.5"): - from .test_llava import models, run_test - elif model.startswith("llava-hf/llava-v1.6"): - from .test_llava_next import models, run_test # type: ignore[no-redef] - elif model.startswith("facebook/chameleon"): - if transformers.__version__.startswith("4.46.0"): - pytest.skip("Model broken in HF, " - "see huggingface/transformers#34379") - from .test_chameleon import models, run_test # type: ignore[no-redef] - else: - raise NotImplementedError(f"Unsupported model: {model}") - - run_test( - hf_runner, - vllm_runner, - image_assets, - model=models[0], - # So that LLaVA-NeXT processor may return nested list - size_factors=[0.25, 0.5, 1.0], - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_chameleon.py b/vllm/tests/models/decoder_only/vision_language/test_chameleon.py deleted file mode 100644 index 4bd678b9f..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_chameleon.py +++ /dev/null @@ -1,130 +0,0 @@ -from typing import List, Optional, Type - -import pytest -import transformers -from transformers import AutoModelForVision2Seq, BatchEncoding - -from vllm.multimodal.utils import rescale_image_size -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_outputs_equal - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = ["facebook/chameleon-7b"] - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # HF Logprobs include image tokens, unlike vLLM, so we don't directly - # compare them - check_outputs_equal( - outputs_0_lst=[outputs[:2] for outputs in hf_outputs], - outputs_1_lst=[outputs[:2] for outputs in vllm_outputs], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.skipif( - transformers.__version__.startswith("4.46.0"), - reason="Model broken in HF, see huggingface/transformers#34379", -) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [8]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_fuyu.py b/vllm/tests/models/decoder_only/vision_language/test_fuyu.py deleted file mode 100644 index 1affcd10e..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_fuyu.py +++ /dev/null @@ -1,139 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.platforms import current_platform -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?\n", - "cherry_blossom": - "What is the season?\n", -}) - -models = ["adept/fuyu-8b"] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]]): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - hf_output_str = output_str.lstrip() + "|ENDOFTEXT|" - - return output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with hf_runner(model, dtype=dtype) as hf_model: - eos_token_id = hf_model.processor.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - eos_token_id=eos_token_id) - for prompts, images in inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [0.25], - # Single-scale, batched - [0.25, 0.25, 0.25], - # Multi-scale - [0.25, 0.2, 0.15], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_glm4.py b/vllm/tests/models/decoder_only/vision_language/test_glm4.py deleted file mode 100644 index 47922a57f..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_glm4.py +++ /dev/null @@ -1,133 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest - -from vllm.multimodal.utils import rescale_image_size -from vllm.transformers_utils.tokenizer import patch_padding_side - -from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner -from ....utils import large_gpu_test -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "What's the content of the image?", - "cherry_blossom": - "What is the season?", -}) - -models = ["THUDM/glm-4v-9b"] -target_dtype = "bfloat16" - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=2048, - max_num_seqs=2, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - stop_token_ids = [151329, 151336, 151338] - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - stop_token_ids=stop_token_ids) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - hf_processor = hf_model.processor - patch_padding_side(hf_processor) - - def processor(*args, text="", images=None, **kwargs): - if images is None: - return hf_processor(*args, **kwargs) - - return hf_processor.apply_chat_template( - [{ - "role": "user", - "image": images, - "content": text - }], - add_generation_prompt=True, - tokenize=True, - return_dict=True, - **kwargs, - ) - - hf_model.processor = processor - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.transformer.output_layer - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images, - ) for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -@large_gpu_test(min_gb=48) -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_h2ovl.py b/vllm/tests/models/decoder_only/vision_language/test_h2ovl.py new file mode 100644 index 000000000..45a736520 --- /dev/null +++ b/vllm/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -0,0 +1,129 @@ +from typing import Optional, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoConfig + +# Import the functions to test +from vllm.model_executor.models.h2ovl import (calculate_num_blocks, + image_to_pixel_values_wrapper) +from vllm.multimodal.utils import rescale_image_size + +models = [ + "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names + "h2oai/h2ovl-mississippi-2b", +] + + +def run_preprocessing_test( + image: Image, + config, + max_dynamic_patch: Optional[int] = None, +) -> Tuple[torch.Tensor, int]: + """Test the image preprocessing and calculate expected blocks.""" + + if max_dynamic_patch is None: + max_dynamic_patch = config.max_dynamic_patch + + width, height = image.size + use_MSAC = config.use_msac + + # Create the mapper function with the provided configuration + mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC) + pixel_values = mapper(image) + + # Calculate the expected number of blocks + if use_MSAC: + # First pass + blocks1, _, _, aspect_ratio = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, # Thumbnail is handled separately + prior_aspect_ratio=None, + ) + + # Second pass + blocks2, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=aspect_ratio, + ) + + # Add thumbnail if use_thumbnail is True and total_blocks > 1 + if config.use_thumbnail: + blocks1 += 1 if blocks1 > 1 else 0 + blocks2 += 1 if blocks2 > 1 else 0 + + # Total blocks is the sum of blocks from both passes minus overlapping + total_blocks = blocks1 + blocks2 - 1 + + expected_blocks = total_blocks + + else: + blocks, _, _, _ = calculate_num_blocks( + width, + height, + config.min_dynamic_patch, + max_dynamic_patch, + config.vision_config.image_size, + use_thumbnail=False, + prior_aspect_ratio=None, + ) + expected_blocks = blocks + + if config.use_thumbnail and expected_blocks > 1: + expected_blocks += 1 + + return pixel_values, expected_blocks + + +@pytest.mark.parametrize("model_name", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8]) +def test_image_preprocessing(image_assets, model_name, size_factors, + max_dynamic_patch): + """Test image preprocessing pipeline with different configurations.""" + # Load the configuration from the model + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True) + + for asset in image_assets: + image = asset.pil_image + for factor in size_factors: + scaled_image = rescale_image_size(image, factor) + + # Test preprocessing and get expected number of blocks + pixel_values, expected_blocks = run_preprocessing_test( + scaled_image, config, max_dynamic_patch) + + # Verify output shapes and properties + actual_blocks = pixel_values.shape[0] + assert actual_blocks == expected_blocks, ( + f"Expected {expected_blocks} blocks, got {actual_blocks}") + + # Check image dimensions + expected_size = ( + 3, # Number of channels (C, H, W) + config.vision_config.image_size, + config.vision_config.image_size, + ) + for img in pixel_values: + assert img.shape == expected_size, ( + f"Expected image size {expected_size}, got {img.shape}") diff --git a/vllm/tests/models/decoder_only/vision_language/test_intern_vit.py b/vllm/tests/models/decoder_only/vision_language/test_intern_vit.py index 98f313eb9..32fcb0bbc 100644 --- a/vllm/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/vllm/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -11,21 +11,17 @@ # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] -models = [ - snapshot_download("OpenGVLab/InternViT-300M-448px", - allow_patterns=DOWNLOAD_PATTERN), - snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5", - allow_patterns=DOWNLOAD_PATTERN), -] def run_intern_vit_test( image_assets: _ImageAssets, - model: str, + model_id: str, *, dtype: str, distributed_executor_backend: Optional[str] = None, ): + model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) + img_processor = CLIPImageProcessor.from_pretrained(model) images = [asset.pil_image for asset in image_assets] pixel_values = [ @@ -67,12 +63,15 @@ def run_intern_vit_test( assert cos_similar(vllm_output, hf_output).mean() > 0.99 -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", [ + "OpenGVLab/InternViT-300M-448px", + "OpenGVLab/InternViT-6B-448px-V1-5", +]) @pytest.mark.parametrize("dtype", [torch.half]) @torch.inference_mode() -def test_models(dist_init, image_assets, model, dtype: str) -> None: +def test_models(dist_init, image_assets, model_id, dtype: str) -> None: run_intern_vit_test( image_assets, - model, + model_id, dtype=dtype, ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_internvl.py b/vllm/tests/models/decoder_only/vision_language/test_internvl.py deleted file mode 100644 index fc842ec4a..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_internvl.py +++ /dev/null @@ -1,403 +0,0 @@ -import types -from typing import List, Optional, Tuple, Type, Union - -import pytest -import torch -from PIL.Image import Image -from transformers import AutoConfig - -from vllm.multimodal.utils import rescale_image_size - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "<|im_start|>User\n\nWhat's the content in the center of the image?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 - "cherry_blossom": - "<|im_start|>User\n\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 -}) -HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: \nImage-2: \nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 - -models = [ - "OpenGVLab/InternVL2-1B", - "OpenGVLab/InternVL2-2B", - # NOTE: Mono-InternVL-2B doesn't work with fp16, - # it will result NaN during inference. - # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9 - "OpenGVLab/Mono-InternVL-2B", - # Broken due to outdated implementation of Phi-3 - # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 - # "OpenGVLab/InternVL2-4B", -] -target_dtype = "bfloat16" - - -# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py -def generate( - self, - pixel_values: torch.FloatTensor, - input_ids: torch.FloatTensor, - attention_mask: Optional[torch.LongTensor] = None, - **generate_kwargs, -) -> torch.LongTensor: - """Generate method for InternVL2 model without fixed use_cache.""" - assert self.img_context_token_id is not None - vit_embeds = self.extract_feature(pixel_values) - input_embeds = self.language_model.get_input_embeddings()(input_ids) - B, N, C = input_embeds.shape - input_embeds = input_embeds.reshape(B * N, C) - - input_ids = input_ids.reshape(B * N) - selected = (input_ids == self.img_context_token_id) - assert selected.sum() != 0 - input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) - - input_embeds = input_embeds.reshape(B, N, C) - - forward_kwargs = dict( - inputs_embeds=input_embeds, - attention_mask=attention_mask, - ) - if getattr(self, "use_visual_token_mask", False): - visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype) - forward_kwargs["visual_token_mask"] = visual_token_mask - outputs = self.language_model.generate( - **forward_kwargs, - **generate_kwargs, - ) - - return outputs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - class InternVLProcessor: - """A simple processor for InternVL2 which misses a processor.""" - - def __init__(self, hf_runner: HfRunner): - self.num_image_token = hf_runner.model.num_image_token - self.tokenizer = hf_runner.tokenizer - self.dtype = hf_runner.model.dtype - - self.config = AutoConfig.from_pretrained(hf_runner.model_name, - trust_remote_code=True) - self.vision_config = self.config.vision_config - self.use_thumbnail = self.config.use_thumbnail - self.min_num = self.config.min_dynamic_patch - self.max_num = self.config.max_dynamic_patch - self.image_size = self.vision_config.image_size - - def __call__(self, text: str, images: Union[Image, List[Image]], - **kwargs): - from vllm.model_executor.models.internvl import ( - IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - images = [images] if isinstance(images, Image) else images - pixel_values = [ - image_to_pixel_values(image, self.image_size, self.min_num, - self.max_num, - self.use_thumbnail).to(self.dtype) - for image in images - ] - num_patches_list = [ - pixel_value.shape[0] for pixel_value in pixel_values - ] - pixel_values = torch.cat(pixel_values, dim=0) - for num_patches in num_patches_list: - context_tokens = IMG_CONTEXT * self.num_image_token \ - * num_patches - image_tokens = IMG_START + context_tokens + IMG_END - text = text.replace('', image_tokens, 1) - prompt = self.tokenizer(text, return_tensors="pt") - prompt.update({"pixel_values": pixel_values}) - return prompt - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - max_model_len=4096, - dtype=dtype, - limit_mm_per_prompt={"image": mm_limit}, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype) as hf_model: - img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids( - "") - hf_model.model.img_context_token_id = img_context_token_id - hf_model.processor = InternVLProcessor(hf_model) - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() - hf_model.model.generate = types.MethodType(generate, hf_model.model) - eos_token_id = hf_model.tokenizer.eos_token_id - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=hf_images, - eos_token_id=eos_token_id) - for prompts, hf_images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) - - -def run_awq_test( - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - models: Tuple[str, str], - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - source_model, quant_model = models - - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(source_model, - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - source_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - with vllm_runner(quant_model, - quantization="awq", - max_model_len=4096, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - quant_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs_per_image - ] - - for source_outputs, quant_outputs in zip(source_outputs_per_image, - quant_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=source_outputs, - outputs_1_lst=quant_outputs, - name_0="source", - name_1="awq", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - run_test( - hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.5, 0.75, 1.0], - ], -) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image for asset in image_assets] - - inputs_per_case = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - - run_test( - hf_runner, - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", ["OpenGVLab/InternVL2-2B"]) -@pytest.mark.parametrize("size_factors", [[0.5, 1.0]]) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_different_num_patches(hf_runner, vllm_runner, image_assets, model, - size_factors, dtype: str, max_tokens: int, - num_logprobs: int) -> None: - images = [asset.pil_image.resize((896, 896)) for asset in image_assets] - - inputs_batching = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - - inputs_multi_images = [ - ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors], - [[rescale_image_size(image, factor) for image in images] - for factor in size_factors]) - ] - for inputs in [inputs_batching, inputs_multi_images]: - run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=2, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize( - "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@torch.inference_mode() -def test_awq_models(vllm_runner, image_assets, models, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_awq_test( - vllm_runner, - image_assets, - models, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_llava.py b/vllm/tests/models/decoder_only/vision_language/test_llava.py deleted file mode 100644 index fd28a9367..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_llava.py +++ /dev/null @@ -1,313 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, - BatchEncoding) - -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", - # TODO: Get this model to produce meaningful output in vLLM - # "TIGER-Lab/Mantis-8B-siglip-llama3", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - # NOTE: For local use; this isn't tested in CI yet (see TODO above) - if model.startswith("TIGER-Lab/Mantis"): - from mantis.models.mllava import MLlavaProcessor - - torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype] - mantis_processor = MLlavaProcessor.from_pretrained( - model, torch_dtype=torch_dtype) - assert isinstance(mantis_processor, MLlavaProcessor) - else: - mantis_processor = None - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=4096, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - if mantis_processor is not None: - - def process(hf_inputs: BatchEncoding): - hf_inputs["pixel_values"] = hf_inputs["pixel_values"] \ - .to(torch_dtype) # type: ignore - return hf_inputs - else: - - def process(hf_inputs: BatchEncoding): - return hf_inputs - - with hf_runner(model, - dtype=dtype, - postprocess_inputs=process, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 2 images.\nASSISTANT:", - "USER: \nDescribe 4 images.\nASSISTANT:", # noqa: E501 - "USER: \nWhat is the season?\nASSISTANT:", - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -def test_context_length_too_short(vllm_runner, image_assets, model): - images = [asset.pil_image for asset in image_assets] - - with pytest.raises(ValueError, match="too long to fit into the model"): - vllm_model = vllm_runner( - model, - max_model_len=128, # LLaVA has a feature size of 576 - enforce_eager=True, - ) - - with vllm_model: - vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]], - max_tokens=1, - images=[images[0]]) diff --git a/vllm/tests/models/decoder_only/vision_language/test_llava_image_embeds.py b/vllm/tests/models/decoder_only/vision_language/test_llava_image_embeds.py deleted file mode 100644 index 664140325..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_llava_image_embeds.py +++ /dev/null @@ -1,158 +0,0 @@ -from typing import List, Optional, Tuple, Type - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.sequence import SampleLogprobs - -from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets -from ...utils import check_logprobs_close - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "USER: \nWhat's the content of the image?\nASSISTANT:", - "cherry_blossom": - "USER: \nWhat is the season?\nASSISTANT:", -}) - -models = [ - "llava-hf/llava-1.5-7b-hf", -] - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding vision language config as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - - # vLLM to load from image embeddings - vllm_images = [asset.image_embeds for asset in image_assets] - - # transformers to load from PIL images - hf_images = [asset.pil_image for asset in image_assets] - - vllm_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(vllm_images, HF_IMAGE_PROMPTS)] - - hf_inputs_per_image = [( - [prompt for _ in size_factors], - [image for _ in size_factors], - ) for image, prompt in zip(hf_images, HF_IMAGE_PROMPTS)] - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). - - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in vllm_inputs_per_image - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in hf_inputs_per_image - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) diff --git a/vllm/tests/models/decoder_only/vision_language/test_llava_next.py b/vllm/tests/models/decoder_only/vision_language/test_llava_next.py deleted file mode 100644 index aa9b297c5..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_llava_next.py +++ /dev/null @@ -1,347 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.inputs import InputContext -from vllm.multimodal.utils import rescale_image_size -from vllm.sequence import SampleLogprobs - -from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _ImageAssets) -from ...utils import build_model_context, check_logprobs_close - -_LIMIT_IMAGE_PER_PROMPT = 4 - -HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ - "stop_sign": - "[INST] \nWhat's the content of the image? [/INST]", - "cherry_blossom": - "[INST] \nWhat is the season? [/INST]", -}) - -models = ["llava-hf/llava-v1.6-mistral-7b-hf"] - - -@pytest.fixture() -def get_max_llava_next_image_tokens(): - from vllm.model_executor.models.llava_next import ( - get_max_llava_next_image_tokens) - return get_max_llava_next_image_tokens - - -@pytest.fixture() -def dummy_data_for_llava_next(): - from vllm.model_executor.models.llava_next import dummy_data_for_llava_next - return dummy_data_for_llava_next - - -def vllm_to_hf_output(vllm_output: Tuple[List[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - config = AutoConfig.from_pretrained(model) - image_token_id = config.image_token_index - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = [ - token_id for idx, token_id in enumerate(output_ids) - if token_id != image_token_id or output_ids[idx - 1] != image_token_id - ] - - assert output_str[0] == " " - hf_output_str = output_str[1:] - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: List[float], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -@overload -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - sizes: List[Tuple[int, int]], - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - ... - - -def run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - image_assets: _ImageAssets, - model: str, - *, - size_factors: Optional[List[float]] = None, - sizes: Optional[List[Tuple[int, int]]] = None, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - images = [asset.pil_image for asset in image_assets] - - if size_factors is not None: - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - elif sizes is not None: - inputs_per_image = [( - [prompt for _ in sizes], - [image.resize(size) for size in sizes], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] - else: - raise ValueError("You must provide either `size_factors` or `sizes`") - - _run_test(hf_runner, - vllm_runner, - inputs_per_image, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend) - - -def _run_test( - hf_runner: Type[HfRunner], - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput]], - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - dtype=dtype, - max_model_len=10240, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enforce_eager=True, - limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT - }) as vllm_model: - vllm_outputs_per_image = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - with hf_runner(model, dtype=dtype, - auto_cls=AutoModelForVision2Seq) as hf_model: - hf_outputs_per_image = [ - hf_model.generate_greedy_logprobs_limit(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images) - for prompts, images in inputs - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, - vllm_outputs_per_image): - # TODO: Check whether using original CLIPVisionModel can improve - # consistency against HF - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "size_factors", - [ - # No image - [], - # Single-scale - [1.0], - # Single-scale, batched - [1.0, 1.0, 1.0], - # Multi-scale - [0.25, 0.5, 1.0], - ], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, - dtype, max_tokens, num_logprobs) -> None: - """Inference result should be the same between hf and vllm. - - All the image fixtures for the test are from IMAGE_ASSETS. - For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects - and corresponding MultiModalConfig as input. - Note, the text input is also adjusted to abide by vllm contract. - The text output is sanitized to be able to compare with hf. - """ - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - size_factors=size_factors, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize( - "sizes", - [[(1669, 2560), (2560, 1669), (183, 488), (488, 183)]], -) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_fixed_sizes(hf_runner, vllm_runner, image_assets, model, sizes, - dtype, max_tokens, num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: - stop_sign = image_assets[0].pil_image - cherry_blossom = image_assets[1].pil_image - - inputs = [( - [ - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 2 images. [/INST]", - "[INST] \nDescribe 4 images. [/INST]", - "[INST] \nWhat is the season? [/INST]" - ], - [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) - - -@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ - ([[336, 336]], 1176), - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), -]) -def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, - get_max_llava_next_image_tokens): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - # and calculate the resulting max tokens - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - - actual_max_tokens = get_max_llava_next_image_tokens( - InputContext(ctx.model_config)) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize( - "gridpoints,expected_size", - [ - # One point; it has to be the largest - ([[336, 336]], (336, 336)), - # Default for most llava next models; the 2x2 tile is the largest - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], - (672, 672)), - # If two rectangular gridpoints are the same, the more vertical - # one has the higher feature count due to newline features - ([[336, 672], [672, 336]], (672, 336)) - ]) -def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, - gridpoints, expected_size): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - seq_len = 5000 # bigger than the max feature size for any image - - seq_data, mm_data = dummy_data_for_llava_next( - ctx, - seq_len=seq_len, - mm_counts={"image": 1}, - ) - - # The dummy data dims should match the gridpoint with the biggest feat size - assert mm_data["image"].height == expected_size[0] - assert mm_data["image"].width == expected_size[1] - assert len(seq_data.get_token_ids()) >= seq_len diff --git a/vllm/tests/models/decoder_only/vision_language/test_llava_next_video.py b/vllm/tests/models/decoder_only/vision_language/test_llava_next_video.py deleted file mode 100644 index 7b7b23c78..000000000 --- a/vllm/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ /dev/null @@ -1,226 +0,0 @@ -from typing import List, Optional, Tuple, Type, overload - -import pytest -from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer - -from vllm.multimodal.utils import (rescale_video_size, resize_video, - sample_frames_from_video) -from vllm.sequence import SampleLogprobs - -from ....conftest import VIDEO_ASSETS, HfRunner, VllmRunner, _VideoAssets -from ...utils import check_logprobs_close - -_PREFACE = ( - "A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's " - "questions.") - -HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ - "sample_demo_1": - f"{_PREFACE}USER: