From 6c75674294be93751de9449067f239f01630a312 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 11:45:30 -0500 Subject: [PATCH 01/15] Test md linter Signed-off-by: Rafael Vasquez --- .github/workflows/sphinx-lint.yml | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml index e0bb24276a653..453fd5b129a4b 100644 --- a/.github/workflows/sphinx-lint.yml +++ b/.github/workflows/sphinx-lint.yml @@ -13,20 +13,10 @@ on: - "docs/**" jobs: - sphinx-lint: + markdown-lint: runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + - name: markdownlint-cli2-action + uses: DavidAnson/markdownlint-cli2-action@v19.0.0 with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/sphinx-lint.sh + globs: '**/*.md' From f4f802ca2d6914bcc33d3939c580d01915f00dc0 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 11:53:45 -0500 Subject: [PATCH 02/15] Keep sphinxlint Signed-off-by: Rafael Vasquez --- .github/workflows/sphinx-lint.yml | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml index 453fd5b129a4b..ca0f9b523c361 100644 --- a/.github/workflows/sphinx-lint.yml +++ b/.github/workflows/sphinx-lint.yml @@ -13,10 +13,27 @@ on: - "docs/**" jobs: - markdown-lint: + sphinx-lint: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] steps: - - name: markdownlint-cli2-action - uses: DavidAnson/markdownlint-cli2-action@v19.0.0 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: - globs: '**/*.md' + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Linting docs + run: tools/sphinx-lint.sh + markdown-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + - uses: DavidAnson/markdownlint-cli2-action@v19 + with: + globs: "docs/**/*.md" \ No newline at end of file From eb417732bdf4f6593d8a85a727e7026dad6721bc Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 12:00:16 -0500 Subject: [PATCH 03/15] Fix typo Signed-off-by: Rafael Vasquez --- docs/source/serving/offline_inference.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 79092ab208784..94703a1c32ade 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -64,7 +64,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her #### Context length and batch size -You can further reduce memory usage by limit the context length of the model (`max_model_len` option) +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` 
option). ```python From a77d9fb5d3ad3c63ad1286478d90980bc4bbab30 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 17:16:17 -0500 Subject: [PATCH 04/15] Update action, config, and fix docs Signed-off-by: Rafael Vasquez --- .github/workflows/doc-lint.yml | 22 ++++ .github/workflows/sphinx-lint.yml | 39 ------ .markdownlint-cli2.yaml | 24 ++++ docs/README.md | 1 + docs/source/api/multimodal/index.md | 1 - docs/source/api/params.md | 1 - docs/source/community/sponsors.md | 2 + docs/source/contributing/overview.md | 2 - .../contributing/vulnerability_management.md | 4 +- docs/source/deployment/docker.md | 4 +- .../source/deployment/frameworks/cerebrium.md | 10 +- docs/source/deployment/frameworks/dstack.md | 10 +- docs/source/deployment/frameworks/skypilot.md | 4 +- .../deployment/integrations/llamastack.md | 2 +- docs/source/deployment/k8s.md | 9 +- .../source/design/automatic_prefix_caching.md | 11 +- docs/source/features/quantization/auto_awq.md | 4 +- docs/source/features/quantization/bnb.md | 7 +- docs/source/features/quantization/fp8.md | 4 +- .../features/quantization/fp8_e4m3_kvcache.md | 4 +- docs/source/features/quantization/gguf.md | 10 +- docs/source/features/quantization/int8.md | 2 +- docs/source/features/spec_decode.md | 10 +- docs/source/features/tool_calling.md | 90 +++++++------ docs/source/getting_started/faq.md | 2 +- .../getting_started/installation/cpu-apple.md | 17 +-- .../getting_started/installation/cpu-x86.md | 30 ++--- .../getting_started/installation/gpu-cuda.md | 96 +++++++------- .../getting_started/installation/gpu-rocm.md | 120 +++++++++--------- .../getting_started/installation/hpu-gaudi.md | 60 ++++----- .../getting_started/installation/neuron.md | 8 +- .../getting_started/installation/openvino.md | 14 +- .../getting_started/installation/tpu.md | 6 +- .../getting_started/installation/xpu.md | 24 ++-- docs/source/getting_started/quickstart.md | 44 +++---- .../source/getting_started/troubleshooting.md | 6 +- docs/source/index.md | 4 +- .../models/extensions/runai_model_streamer.md | 12 +- docs/source/models/supported_models.md | 8 +- docs/source/performance/optimization.md | 2 +- docs/source/serving/distributed_serving.md | 40 +++--- docs/source/serving/integrations/langchain.md | 2 +- .../source/serving/integrations/llamaindex.md | 2 +- docs/source/serving/metrics.md | 2 +- docs/source/serving/multimodal_inputs.md | 1 + .../serving/openai_compatible_server.md | 20 ++- tools/sphinx-lint.sh | 3 - 47 files changed, 408 insertions(+), 392 deletions(-) create mode 100644 .github/workflows/doc-lint.yml delete mode 100644 .github/workflows/sphinx-lint.yml create mode 100644 .markdownlint-cli2.yaml delete mode 100755 tools/sphinx-lint.sh diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml new file mode 100644 index 0000000000000..24e761fb5c421 --- /dev/null +++ b/.github/workflows/doc-lint.yml @@ -0,0 +1,22 @@ +name: Lint documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + pull_request: + branches: + - main + paths: + - "docs/**" + +jobs: + markdown-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + - uses: DavidAnson/markdownlint-cli2-action@v19 + with: + config: '.markdownlint-cli2.yaml' \ No newline at end of file diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml deleted file mode 100644 index ca0f9b523c361..0000000000000 --- a/.github/workflows/sphinx-lint.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: Lint 
documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - sphinx-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/sphinx-lint.sh - markdown-lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - - uses: DavidAnson/markdownlint-cli2-action@v19 - with: - globs: "docs/**/*.md" \ No newline at end of file diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 0000000000000..32ccc5178ae2e --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,24 @@ +# Disable some built-in rules +config: + link-fragments: false + line-length: false + no-inline-html: false + first-line-heading: false + no-duplicate-heading: + siblings_only: true + +# Ignore files referenced by .gitignore (only valid at root) +gitignore: true + +# Define glob expressions to use (only valid at root) +globs: + - "docs/**/*.md" + +# Disable banner message on stdout (only valid at root) +noBanner: true + +# Disable progress on stdout (only valid at root) +noProgress: false + +# Show found files on stdout (only valid at root) +showFound: true \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 46488c9bb0b92..1a44c1341f4fb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,4 +16,5 @@ make html ```bash python -m http.server -d build/html/ ``` + Launch your browser and open localhost:8000. diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 0046b73ea825e..fa2eb5793386e 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -13,7 +13,6 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). - ## Module Contents ```{eval-rst} diff --git a/docs/source/api/params.md b/docs/source/api/params.md index a3b4d9cbb44ec..56e6fb664798b 100644 --- a/docs/source/api/params.md +++ b/docs/source/api/params.md @@ -19,4 +19,3 @@ Optional parameters for vLLM APIs. .. autoclass:: vllm.PoolingParams :members: ``` - diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 9d2af4c13b088..fb93e65673dff 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -6,6 +6,7 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -13,6 +14,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index c960790f47a13..e92104399342d 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -37,8 +37,6 @@ pytest tests/ Currently, the repository is not fully checked by `mypy`. 
``` -# Contribution Guidelines - ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md index 422dc13e6a644..61766f795bdb9 100644 --- a/docs/source/contributing/vulnerability_management.md +++ b/docs/source/contributing/vulnerability_management.md @@ -32,8 +32,8 @@ We prefer to keep all vulnerability-related communication on the security report on GitHub. However, if you need to contact the VMT directly for an urgent issue, you may contact the following individuals: -- Simon Mo - simon.mo@hey.com -- Russell Bryant - rbryant@redhat.com +- Simon Mo - +- Russell Bryant - ## Slack Discussion diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 2df1aca27f1e6..c735bfd0e87a7 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -28,8 +28,8 @@ memory to share data between processes under the hood, particularly for tensor p You can build and run vLLM from source via the provided . To build vLLM: ```console -$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` ```{note} diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index be018dfb75d7a..5787c4a407bfb 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr To install the Cerebrium client, run: ```console -$ pip install cerebrium -$ cerebrium login +pip install cerebrium +cerebrium login ``` Next, create your Cerebrium project, run: ```console -$ cerebrium init vllm-project +cerebrium init vllm-project ``` Next, to install the required packages, add the following to your cerebrium.toml: @@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): Then, run the following code to deploy it to the cloud: ```console -$ cerebrium deploy +cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) +If successful, you should be returned a CURL command that you can call inference against. 
Just remember to end the url with the function name you are calling (in our case`/run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index 4142c1d9f1f60..b42a34125c6d7 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```console -$ pip install "dstack[all] -$ dstack server +pip install "dstack[all] +dstack server ``` Next, to configure your dstack project, run: ```console -$ mkdir -p vllm-dstack -$ cd vllm-dstack -$ dstack init +mkdir -p vllm-dstack +cd vllm-dstack +dstack init ``` Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 657e7f2bc72cc..6e7d7b7e51d7b 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -332,13 +332,13 @@ run: | ``` -1. Start the chat web UI: +Start the chat web UI: ```console sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) ``` -2. Then, we can access the GUI at the returned gradio link: +Then, we can access the GUI at the returned gradio link: ```console | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md index 474d2bdfa9580..a6c3569637abf 100644 --- a/docs/source/deployment/integrations/llamastack.md +++ b/docs/source/deployment/integrations/llamastack.md @@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta To install Llama Stack, run ```console -$ pip install llama-stack -q +pip install llama-stack -q ``` ## Inference using OpenAI Compatible API diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 760214e112fba..9e982f9c30e9a 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -14,7 +14,7 @@ Before you begin, ensure that you have the following: ## Deployment Steps -1. **Create a PVC , Secret and Deployment for vLLM** +### Create a PVC , Secret and Deployment for vLLM PVC is used to store the model cache and it is optional, you can use hostPath or other storage options @@ -49,7 +49,7 @@ stringData: Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. -Here are two examples for using NVIDIA GPU and AMD GPU. +Here are two examples for using NVIDIA GPU and AMD GPU. - NVIDIA GPU @@ -194,9 +194,10 @@ spec: - name: shm mountPath: /dev/shm ``` + You can get the full example with steps and sample yaml files from . -2. **Create a Kubernetes Service for vLLM** +### Create a Kubernetes Service for vLLM Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: @@ -219,7 +220,7 @@ spec: type: ClusterIP ``` -3. 
**Deploy and Test** +### Deploy and Test Apply the deployment and service configurations using `kubectl apply -f `: diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index 4398536b2b4ad..bbea45eac45bf 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -6,7 +6,7 @@ The core idea of [PagedAttention](#design-paged-attention) is to partition the K To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. -``` +```text Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` - In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: -``` +```text hash(prefix tokens + block tokens) <--> KV Block ``` With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - ## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. @@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: -- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -- Multi-modal models. 
When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. +* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 3679595e3d4d0..404505eb3890e 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -$ pip install autoawq +pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index f7f41726f3725..c0cde55685445 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```console -$ pip install bitsandbytes>=0.45.0 +pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint. 
+## Read quantized checkpoint ```python from vllm import LLM @@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` + ## OpenAI Compatible Server Append the following to your 4bit model arguments: -``` +```bash --quantization bitsandbytes --load-format bitsandbytes ``` diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b2eda74fd1e3b..da49cd2747228 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -$ pip install vllm lm-eval==0.4.4 +pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index 50edaf81fddd3..d233cdf0a464c 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -3,7 +3,7 @@ # FP8 E4M3 KV Cache Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 +improving throughput. OCP (Open Compute Project ) specifies two common 8-bit floating point data formats: E5M2 (5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside @@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). To install AMMO (AlgorithMic Model Optimization): ```console -$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo ``` Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index eebf11dfc1b2b..640997cf4bc39 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` ```{warning} diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 1ac50ba987dda..82a15d76d352f 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 903acadb71426..ab7b2f302bd13 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models: 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be used directly with vLLM due to differences in the expected layer names and model definition. - To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert them. Note that this script does not modify the model's weights. In the above example, use the script to first convert - the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model and then use the converted checkpoint as the draft model in vLLM. 2. The EAGLE based draft models need to be run without tensor parallelism @@ -207,7 +207,6 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). 
- A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | @@ -224,7 +223,6 @@ A variety of EAGLE draft models are available on the Hugging Face hub: | Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | | Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | - ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -250,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas: same request across runs. For more details, see the FAQ section titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). -**Conclusion** - While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -259,8 +255,6 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -**Mitigation Strategies** - For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 062f2021eb62a..f8ef6795e9d5d 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -55,13 +55,15 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: -``` + +```text Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: + - Setting up the server with tool calling enabled - Defining an actual function to handle tool calls - Making a request with `tool_choice="auto"` @@ -70,6 +72,7 @@ This example demonstrates: You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the callers responsibility to: + 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -77,26 +80,27 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. ## Named Function Calling + vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. 
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. - ## Automatic Function Calling To enable this feature, you should set the following flags: -* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it + +- `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers +- `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. -* `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. -* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +- `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. +- `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their `tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) @@ -104,54 +108,54 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! - ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. -* `NousResearch/Hermes-2-Pro-*` -* `NousResearch/Hermes-2-Theta-*` -* `NousResearch/Hermes-3-*` +- `NousResearch/Hermes-2-Pro-*` +- `NousResearch/Hermes-2-Theta-*` +- `NousResearch/Hermes-3-*` _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` - ### Mistral Models (`mistral`) Supported models: -* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) -* Additional mistral function-calling models are compatible as well. + +- `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) +- Additional mistral function-calling models are compatible as well. Known issues: + 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. 
Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition is not met, the following additional chat templates are provided: -* `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that +- `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) -* `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt +- `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. - Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` - ### Llama Models (`llama3_json`) Supported models: -* `meta-llama/Meta-Llama-3.1-8B-Instruct` -* `meta-llama/Meta-Llama-3.1-70B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct` -* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` + +- `meta-llama/Meta-Llama-3.1-8B-Instruct` +- `meta-llama/Meta-Llama-3.1-70B-Instruct` +- `meta-llama/Meta-Llama-3.1-405B-Instruct` +- `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: + 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -164,64 +168,68 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: -* `ibm-granite/granite-3.0-8b-instruct` + +- `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` `examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. -* `ibm-granite/granite-3.1-8b-instruct` +- `ibm-granite/granite-3.1-8b-instruct` Recommended flags: `--tool-call-parser granite` The chat template from Huggingface can be used directly. Parallel function calls are supported. -* `ibm-granite/granite-20b-functioncalling` +- `ibm-granite/granite-20b-functioncalling` Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. 
- ### InternLM Models (`internlm`) Supported models: -* `internlm/internlm2_5-7b-chat` (confirmed) -* Additional internlm2.5 function-calling models are compatible as well + +- `internlm/internlm2_5-7b-chat` (confirmed) +- Additional internlm2.5 function-calling models are compatible as well Known issues: -* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. -Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` +- Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. +Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` ### Jamba Models (`jamba`) + AI21's Jamba-1.5 models are supported. -* `ai21labs/AI21-Jamba-1.5-Mini` -* `ai21labs/AI21-Jamba-1.5-Large` +- `ai21labs/AI21-Jamba-1.5-Mini` +- `ai21labs/AI21-Jamba-1.5-Large` Flags: `--tool-call-parser jamba` - ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: + ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: -* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) -* Llama's smaller models struggle to use tools effectively. + +- The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +- Llama's smaller models struggle to use tools effectively. Example supported models: -* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) -* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) -* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) -* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) + +- `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +- `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +- `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +- `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) Flags: `--tool-call-parser pythonic --chat-template {see_above}` @@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. --- - ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. 
You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. -``` + +```bash --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index fde2954f10c59..4751b325e6fc4 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -**Mitigation Strategies** +## Mitigation Strategies - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md index b55e4384d064d..29e9892f1ba12 100644 --- a/docs/source/getting_started/installation/cpu-apple.md +++ b/docs/source/getting_started/installation/cpu-apple.md @@ -18,25 +18,23 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. -``` -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -r requirements-cpu.txt -$ pip install -e . +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-cpu.txt +pip install -e . ``` ```{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. ``` - - ## Troubleshooting -If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your +If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). -``` +```text [...] fatal error: 'map' file not found 1 | #include | ^~~~~ @@ -48,4 +46,3 @@ If the build has error like the following snippet where standard C++ headers can | ^~~~~~~~~ 1 error generated. ``` - diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bb046dd0fd9dc..df2c0323e6305 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -48,23 +48,23 @@ $ docker run -it \ - First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. 
For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get update -y -$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev -$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +sudo apt-get update -y +sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` - Second, install Python packages for vLLM CPU backend building: ```console -$ pip install --upgrade pip -$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +pip install --upgrade pip +pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, build and install vLLM CPU backend: ```console -$ VLLM_TARGET_DEVICE=cpu python setup.py install +VLLM_TARGET_DEVICE=cpu python setup.py install ``` ```{note} @@ -92,18 +92,18 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -$ find / -name *libtcmalloc* # find the dynamic link library path -$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference/offline_inference.py # run vLLM +sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +find / -name *libtcmalloc* # find the dynamic link library path +export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -$ export VLLM_CPU_KVCACHE_SPACE=40 -$ export VLLM_CPU_OMP_THREADS_BIND=0-29 -$ vllm serve facebook/opt-125m +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_OMP_THREADS_BIND=0-29 +vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -148,7 +148,7 @@ $ python examples/offline_inference/offline_inference.py - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. 
Below is the example script to enable Tensor Parallel = 2 for serving: ```console - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md index 419b8163fc034..e8606b21169b7 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -17,9 +17,9 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin You can create a new Python environment using `conda`: ```console -$ # (Recommended) Create a new conda environment. -$ conda create -n myenv python=3.12 -y -$ conda activate myenv +# (Recommended) Create a new conda environment. +conda create -n myenv python=3.12 -y +conda activate myenv ``` ```{note} @@ -29,9 +29,9 @@ $ conda activate myenv Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: ```console -$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate +# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +uv venv myenv --python 3.12 --seed +source myenv/bin/activate ``` In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -43,18 +43,18 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -$ # Install vLLM with CUDA 12.1. -$ pip install vllm # If you are using pip. -$ uv pip install vllm # If you are using uv. +# Install vLLM with CUDA 12.1. +pip install vllm # If you are using pip. +uv pip install vllm # If you are using uv. ``` As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: ```console -$ # Install vLLM with CUDA 11.8. -$ export VLLM_VERSION=0.6.1.post1 -$ export PYTHON_VERSION=310 -$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with CUDA 11.8. 
+export VLLM_VERSION=0.6.1.post1 +export PYTHON_VERSION=310 +pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` (install-the-latest-code)= @@ -66,7 +66,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ### Install the latest code using `pip` ```console -$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -74,8 +74,8 @@ $ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. @@ -85,14 +85,14 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console -$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. @@ -102,8 +102,8 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb Another way to access the latest code is to use the docker images: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. @@ -121,18 +121,18 @@ The latest code can contain bugs and may not be stable. Please use it with cauti If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_USE_PRECOMPILED=1 pip install --editable . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. +This will download the latest nightly wheel from and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -$ pip install --editable . +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +pip install --editable . ``` You can find more information about vLLM's wheels [above](#install-the-latest-code). @@ -147,9 +147,9 @@ It is recommended to use the same commit ID for the source code as the vLLM whee If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -e . ``` ```{tip} @@ -172,11 +172,11 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python use_existing_torch.py -$ pip install -r requirements-build.txt -$ pip install -e . --no-build-isolation +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +pip install -r requirements-build.txt +pip install -e . 
--no-build-isolation ``` #### Use the local cutlass for compilation @@ -185,9 +185,9 @@ Currently, before starting the build process, vLLM fetches cutlass code from Git To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` #### Troubleshooting @@ -196,8 +196,8 @@ To avoid your system being overloaded, you can limit the number of compilation j to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -$ export MAX_JOBS=6 -$ pip install -e . +export MAX_JOBS=6 +pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -206,22 +206,22 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -$ # Use `--ipc=host` to make sure the shared memory is large enough. -$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +# Use `--ipc=host` to make sure the shared memory is large enough. +docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -$ export CUDA_HOME=/usr/local/cuda -$ export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda +export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -$ nvcc --version # verify that nvcc is in your PATH -$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +nvcc --version # verify that nvcc is in your PATH +${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` ### Unsupported OS build @@ -231,6 +231,6 @@ vLLM can fully run only on Linux but for development purposes, you can still bui Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -$ export VLLM_TARGET_DEVICE=empty -$ pip install -e . +export VLLM_TARGET_DEVICE=empty +pip install -e . ``` diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md index e36b92513e31d..2f3510c8a7334 100644 --- a/docs/source/getting_started/installation/gpu-rocm.md +++ b/docs/source/getting_started/installation/gpu-rocm.md @@ -47,13 +47,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: ```console -$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . 
``` To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: ```console -$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: @@ -80,84 +80,84 @@ Where the `` is the location where the model is stored, for examp 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): -- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) -- [PyTorch](https://pytorch.org/) + - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) + - [PyTorch](https://pytorch.org/) -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) -```console -$ python3 -m pip install ninja cmake wheel pybind11 -$ pip uninstall -y triton -$ git clone https://github.com/OpenAI/triton.git -$ cd triton -$ git checkout e192dba -$ cd python -$ pip3 install . -$ cd ../.. -``` + ```console + python3 -m pip install ninja cmake wheel pybind11 + pip uninstall -y triton + git clone https://github.com/OpenAI/triton.git + cd triton + git checkout e192dba + cd python + pip3 install . + cd ../.. + ``` -```{note} -- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -``` + ```{note} + - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ``` 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) -Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) -Alternatively, wheels intended for vLLM use can be accessed under the releases. + Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) + Alternatively, wheels intended for vLLM use can be accessed under the releases. -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. 
-```console -$ git clone https://github.com/ROCm/flash-attention.git -$ cd flash-attention -$ git checkout 3cea2fb -$ git submodule update --init -$ GPU_ARCHS="gfx90a" python3 setup.py install -$ cd .. -``` + ```console + git clone https://github.com/ROCm/flash-attention.git + cd flash-attention + git checkout 3cea2fb + git submodule update --init + GPU_ARCHS="gfx90a" python3 setup.py install + cd .. + ``` -```{note} -- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) -``` + ```{note} + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ``` 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: -```bash -$ pip install --upgrade pip + ```bash + $ pip install --upgrade pip -# Install PyTorch -$ pip uninstall torch -y -$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 -# Build & install AMD SMI -$ pip install /opt/rocm/share/amd_smi + # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi -# Install dependencies -$ pip install --upgrade numba scipy huggingface-hub[cli] -$ pip install "numpy<2" -$ pip install -r requirements-rocm.txt + # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt -# Build vLLM for MI210/MI250/MI300. -$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" -$ python3 setup.py develop -``` + # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + ``` -This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. + This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. -```{tip} -- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. -- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. -- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. -- The ROCm version of PyTorch, ideally, should match the ROCm driver version. -``` + ```{tip} + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. + - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + ``` -```{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. 
- For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` + ```{tip} + - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). + ``` diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md index 1d50cef3bdc83..a6407b0447a8d 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -22,8 +22,8 @@ Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optim ### Quick start using Dockerfile ```console -$ docker build -f Dockerfile.hpu -t vllm-hpu-env . -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` ```{tip} @@ -37,10 +37,10 @@ If you're observing the following error: `docker: Error response from daemon: Un To verify that the Intel Gaudi software was correctly installed, run: ```console -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural_compressor is installed +hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack @@ -57,8 +57,8 @@ for more details. 
Use the following commands to run a Docker image: ```console -$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` #### Build and Install vLLM @@ -66,18 +66,18 @@ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_ To build and install vLLM from source, run: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python setup.py develop +git clone https://github.com/vllm-project/vllm.git +cd vllm +python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ python setup.py develop +git clone https://github.com/HabanaAI/vllm-fork.git +cd vllm-fork +git checkout habana_main +python setup.py develop ``` ## Supported Features @@ -181,7 +181,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -``` +```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -192,7 +192,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -``` +```text min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -201,7 +201,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` +```text min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -224,7 +224,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` +```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -273,7 +273,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -``` +```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -350,18 +350,18 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Prompt: : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/neuron.md index 431f90537f543..5581b1940ca46 100644 --- a/docs/source/getting_started/installation/neuron.md +++ b/docs/source/getting_started/installation/neuron.md @@ -123,10 +123,10 @@ python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torch Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -$ git clone 
https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -U -r requirements-neuron.txt -$ VLLM_TARGET_DEVICE="neuron" pip install . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements-neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/openvino.md index 60f95fd1c4250..d97d4173bf36b 100644 --- a/docs/source/getting_started/installation/openvino.md +++ b/docs/source/getting_started/installation/openvino.md @@ -27,8 +27,8 @@ vLLM powered by OpenVINO supports all LLM models from [vLLM supported models lis ## Quick start using Dockerfile ```console -$ docker build -f Dockerfile.openvino -t vllm-openvino-env . -$ docker run -it --rm vllm-openvino-env +docker build -f Dockerfile.openvino -t vllm-openvino-env . +docker run -it --rm vllm-openvino-env ``` (install-openvino-backend-from-source)= @@ -38,21 +38,21 @@ $ docker run -it --rm vllm-openvino-env - First, install Python. For example, on Ubuntu 22.04, you can run: ```console - $ sudo apt-get update -y - $ sudo apt-get install python3 + sudo apt-get update -y + sudo apt-get install python3 ``` - Second, install prerequisites vLLM OpenVINO backend installation: ```console - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install --upgrade pip + pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, install vLLM with OpenVINO backend: ```console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . ``` - [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/tpu.md index bc93c44fead30..1938785ade46a 100644 --- a/docs/source/getting_started/installation/tpu.md +++ b/docs/source/getting_started/installation/tpu.md @@ -156,14 +156,14 @@ For more information about using TPUs with GKE, see You can use to build a Docker image with TPU support. ```console -$ docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -$ # Make sure to add `--privileged --net host --shm-size=16G`. -$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +# Make sure to add `--privileged --net host --shm-size=16G`. 
+docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index c1ab5478eb652..d35e117a8446f 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -40,15 +40,15 @@ $ docker run -it \ - Second, install Python packages for vLLM XPU backend building: ```console -$ source /opt/intel/oneapi/setvars.sh -$ pip install --upgrade pip -$ pip install -v -r requirements-xpu.txt +source /opt/intel/oneapi/setvars.sh +pip install --upgrade pip +pip install -v -r requirements-xpu.txt ``` - Finally, build and install vLLM XPU backend: ```console -$ VLLM_TARGET_DEVICE=xpu python setup.py install +VLLM_TARGET_DEVICE=xpu python setup.py install ``` ```{note} @@ -61,14 +61,14 @@ $ VLLM_TARGET_DEVICE=xpu python setup.py install XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: ```console -$ python -m vllm.entrypoints.openai.api_server \ -$ --model=facebook/opt-13b \ -$ --dtype=bfloat16 \ -$ --device=xpu \ -$ --max_model_len=1024 \ -$ --distributed-executor-backend=ray \ -$ --pipeline-parallel-size=2 \ -$ -tp=8 +python -m vllm.entrypoints.openai.api_server \ +--model=facebook/opt-13b \ +--dtype=bfloat16 \ +--device=xpu \ +--max_model_len=1024 \ +--distributed-executor-backend=ray \ +--pipeline-parallel-size=2 \ +-tp=8 ``` By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 2808e1b386801..b61b3df8cc043 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -18,9 +18,9 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. ```console -$ conda create -n myenv python=3.10 -y -$ conda activate myenv -$ pip install vllm +conda create -n myenv python=3.10 -y +conda activate myenv +pip install vllm ``` ```{note} @@ -85,7 +85,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -96,7 +96,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. For example, to list the models: ```console -$ curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. 
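For example, if the server was started with `--api-key token-abc123`, clients only need to send the same value. A minimal sketch using the official OpenAI Python client is shown below; the model name and token are placeholders and must match what the server is actually running.

```python
from openai import OpenAI

# Sketch only: api_key must match the value passed via --api-key (or VLLM_API_KEY),
# and the model name must match the model the server is serving.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.completions.create(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    prompt="San Francisco is a",
    max_tokens=7,
    temperature=0,
)
print(completion.choices[0].text)
```

When calling the server with `curl` instead, the same key goes into an `Authorization: Bearer token-abc123` header.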
@@ -106,14 +106,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -$ curl http://localhost:8000/v1/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "prompt": "San Francisco is a", -$ "max_tokens": 7, -$ "temperature": 0 -$ }' +curl http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "Qwen/Qwen2.5-1.5B-Instruct", +"prompt": "San Francisco is a", +"max_tokens": 7, +"temperature": 0 +}' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -142,15 +142,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -$ curl http://localhost:8000/v1/chat/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "messages": [ -$ {"role": "system", "content": "You are a helpful assistant."}, -$ {"role": "user", "content": "Who won the world series in 2020?"} -$ ] -$ }' +curl http://localhost:8000/v1/chat/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "Qwen/Qwen2.5-1.5B-Instruct", +"messages": [ +{"role": "system", "content": "You are a helpful assistant."}, +{"role": "user", "content": "Who won the world series in 2020?"} +] +}' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index f5efe0bef7506..1e290d2b4c0bd 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. (troubleshooting-incorrect-hardware-driver)= + ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -118,13 +119,13 @@ dist.destroy_process_group() If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console -$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py ``` If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: ```console -$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py ``` If the script runs successfully, you should see the message `sanity check is successful!`. 
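The `test.py` used by these commands is a plain `torch.distributed` script. A minimal sketch of such a check is shown below; the exact tensors, process groups, and messages are illustrative assumptions rather than the script shipped with the docs.

```python
# Illustrative sketch of a test.py-style GPU/CPU communication check.
# Launch it with torchrun exactly as in the commands above.
import os

import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)
world_size = dist.get_world_size()

# GPU path: after the all-reduce, every rank should hold world_size.
gpu_data = torch.ones(1, device="cuda")
dist.all_reduce(gpu_data, op=dist.ReduceOp.SUM)
assert gpu_data.item() == world_size, "GPU communication check failed"

# CPU path: repeat the all-reduce over a gloo group.
gloo_group = dist.new_group(backend="gloo")
cpu_data = torch.ones(1)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
assert cpu_data.item() == world_size, "CPU communication check failed"

if dist.get_rank() == 0:
    print("sanity check is successful!")
dist.destroy_process_group()
```

If either assertion fails, the problem usually lies in the hardware, driver, or network setup rather than in vLLM itself.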
@@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup ``` (troubleshooting-python-multiprocessing)= + ## Python multiprocessing ### `RuntimeError` Exception diff --git a/docs/source/index.md b/docs/source/index.md index 6747a7fcce4fe..8a32e782eda75 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# Welcome to vLLM! +# Welcome to vLLM ```{figure} ./assets/logos/vllm-logo-text-light.png :align: center @@ -171,7 +171,7 @@ contributing/model/index contributing/vulnerability_management ``` -# Indices and tables +## Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index fe2701194a604..75f7a9fcad416 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -$ pip3 install vllm[runai] +pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). 
```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3ba34c77205e5..53f8fac38f18b 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -$ export VLLM_USE_MODELSCOPE=True +export VLLM_USE_MODELSCOPE=True ``` And use with `trust_remote_code=True`. @@ -820,9 +820,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -```{tip} -When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. -``` + ```{tip} + When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + ``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. 4. 
**Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 4fcde9b03b887..4fbc376e1aa39 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +```text WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 4e0a9ef6ecf7d..ce8708b25be0e 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a") To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console -$ vllm serve facebook/opt-13b \ -$ --tensor-parallel-size 4 +vllm serve facebook/opt-13b \ +--tensor-parallel-size 4 ``` You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -$ vllm serve gpt2 \ -$ --tensor-parallel-size 4 \ -$ --pipeline-parallel-size 2 +vllm serve gpt2 \ +--tensor-parallel-size 4 \ +--pipeline-parallel-size 2 ``` ## Running vLLM on multiple nodes @@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. We have Pick a node as the head node, and run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --head \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ +vllm/vllm-openai \ +ip_of_head_node \ +--head \ +/path/to/the/huggingface/home/in/this/node ``` On the rest of the worker nodes, run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --worker \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ +vllm/vllm-openai \ +ip_of_head_node \ +--worker \ +/path/to/the/huggingface/home/in/this/node ``` Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. @@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. 
The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 8 \ -$ --pipeline-parallel-size 2 +vllm serve /path/to/the/model/in/the/container \ +--tensor-parallel-size 8 \ +--pipeline-parallel-size 2 ``` You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 16 +vllm serve /path/to/the/model/in/the/container \ +--tensor-parallel-size 16 ``` To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md index 49ff6e0c32a72..03142d23b145a 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain To install LangChain, run ```console -$ pip install langchain langchain_community -q +pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md index 9961c181d7e1c..8c72605202cf5 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index To install LlamaIndex, run ```console -$ pip install llama-index-llms-vllm -q +pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index e6ded2e6dd465..6c84f6d1350a6 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -7,7 +7,7 @@ OpenAI compatible API server. 
You can start the server using Python, or using [Docker](#deployment-docker): ```console -$ vllm serve unsloth/Llama-3.2-1B-Instruct +vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 9f5e1b908d786..bc475826bbfde 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: + ```python from openai import OpenAI diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index ec5a367594743..fad38664605f5 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -5,11 +5,13 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): + ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. + ```python from openai import OpenAI client = OpenAI( @@ -50,6 +52,7 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= + ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -71,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -80,7 +84,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -115,12 +119,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See https://github.com/vllm-project/vllm/pull/11529 for more details. +> See for more details. 
```python completion = client.chat.completions.create( @@ -147,6 +151,7 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= + ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. @@ -175,7 +180,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -$ vllm serve SOME_MODEL --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -186,6 +191,7 @@ The order of priorities is `command line > config file values > defaults`. ## API Reference (completions-api)= + ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); @@ -212,6 +218,7 @@ The following extra parameters are supported: ``` (chat-api)= + ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -220,6 +227,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; see our [Multimodal Inputs](#multimodal-inputs) guide for more information. + - *Note: `image_url.detail` parameter is not supported.* Code example: @@ -243,6 +251,7 @@ The following extra parameters are supported: ``` (embeddings-api)= + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -284,6 +293,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= + ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -293,6 +303,7 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= + ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. @@ -302,6 +313,7 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu Code example: (score-api)= + ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. 
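Several of the endpoints above can be exercised with the official OpenAI Python client. As one illustration, a minimal Embeddings API request might look like the sketch below; the model name is a placeholder for whichever embedding model the server is running, and `EMPTY` stands in for a server started without `--api-key`.

```python
from openai import OpenAI

# Sketch only: replace the model name with the embedding model the server serves.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",
)

response = client.embeddings.create(
    model="intfloat/e5-mistral-7b-instruct",
    input=["Hello, my name is", "The capital of France is"],
)
for item in response.data:
    print(len(item.embedding))
```

vLLM-specific extra parameters, where supported, are typically passed through the client's `extra_body` argument.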
diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh deleted file mode 100755 index 04f8075c5527f..0000000000000 --- a/tools/sphinx-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sphinx-lint --disable trailing-whitespace,missing-final-newline docs From 6a9eeec9e6c1ca2766edd9d7a8710af77e500275 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 17:19:09 -0500 Subject: [PATCH 05/15] Checks only /docs Signed-off-by: Rafael Vasquez --- .github/workflows/doc-lint.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml index 24e761fb5c421..618f4deda904f 100644 --- a/.github/workflows/doc-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -19,4 +19,5 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - uses: DavidAnson/markdownlint-cli2-action@v19 with: - config: '.markdownlint-cli2.yaml' \ No newline at end of file + config: ".markdownlint-cli2.yaml" + globs: "docs/**/*.md" \ No newline at end of file From cc8bbf47aa23ba40cdb5d7478f4bf08cf1c53e2f Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 17:23:16 -0500 Subject: [PATCH 06/15] Remove sphinx-lint dependency Signed-off-by: Rafael Vasquez --- requirements-lint.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements-lint.txt b/requirements-lint.txt index 711bb50a0e936..f9132bbf96437 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -6,7 +6,6 @@ ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 -sphinx-lint==1.0.0 # type checking mypy==1.11.1 From 1e4e4835f5ab515304757981d4b1c1791fde3767 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Wed, 8 Jan 2025 17:23:46 -0500 Subject: [PATCH 07/15] Add newline Signed-off-by: Rafael Vasquez --- .github/workflows/doc-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml index 618f4deda904f..471a7b7df3a8f 100644 --- a/.github/workflows/doc-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -20,4 +20,4 @@ jobs: - uses: DavidAnson/markdownlint-cli2-action@v19 with: config: ".markdownlint-cli2.yaml" - globs: "docs/**/*.md" \ No newline at end of file + globs: "docs/**/*.md" From b92d9814154c6eb7d5d06bb292c3e7dbd6074eb2 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 13:18:25 -0500 Subject: [PATCH 08/15] Revert doc changes Signed-off-by: Rafael Vasquez --- docs/README.md | 1 - docs/source/api/inference_params.md | 21 +++ docs/source/api/model/adapters.md | 9 ++ docs/source/api/model/index.md | 12 ++ docs/source/api/model/interfaces.md | 9 ++ docs/source/api/model/interfaces_base.md | 9 ++ docs/source/api/multimodal/index.md | 60 ++------- docs/source/api/multimodal/inputs.md | 49 +++++++ docs/source/api/multimodal/parse.md | 9 ++ docs/source/api/multimodal/processing.md | 9 ++ docs/source/api/multimodal/profiling.md | 9 ++ docs/source/api/multimodal/registry.md | 9 ++ docs/source/community/sponsors.md | 2 - docs/source/contributing/overview.md | 2 + .../contributing/vulnerability_management.md | 4 +- docs/source/deployment/docker.md | 4 +- .../source/deployment/frameworks/cerebrium.md | 10 +- docs/source/deployment/frameworks/dstack.md | 10 +- docs/source/deployment/frameworks/skypilot.md | 4 +- .../deployment/integrations/llamastack.md | 2 +- docs/source/deployment/k8s.md | 9 +- .../source/design/automatic_prefix_caching.md | 11 +- docs/source/design/multiprocessing.md | 2 +- docs/source/features/quantization/auto_awq.md | 
4 +- docs/source/features/quantization/bnb.md | 7 +- docs/source/features/quantization/fp8.md | 4 +- .../features/quantization/fp8_e4m3_kvcache.md | 4 +- docs/source/features/quantization/gguf.md | 10 +- docs/source/features/quantization/int8.md | 2 +- docs/source/features/spec_decode.md | 10 +- docs/source/features/tool_calling.md | 90 ++++++------- docs/source/getting_started/faq.md | 2 +- .../getting_started/installation/cpu-apple.md | 17 ++- .../getting_started/installation/cpu-x86.md | 30 ++--- .../getting_started/installation/gpu-cuda.md | 96 +++++++------- .../getting_started/installation/gpu-rocm.md | 120 +++++++++--------- .../getting_started/installation/hpu-gaudi.md | 60 ++++----- .../getting_started/installation/neuron.md | 8 +- .../getting_started/installation/openvino.md | 14 +- .../getting_started/installation/tpu.md | 6 +- .../getting_started/installation/xpu.md | 24 ++-- docs/source/getting_started/quickstart.md | 55 ++++---- .../source/getting_started/troubleshooting.md | 6 +- docs/source/index.md | 7 +- .../models/extensions/runai_model_streamer.md | 12 +- docs/source/models/supported_models.md | 69 +++++----- docs/source/performance/optimization.md | 2 +- docs/source/serving/distributed_serving.md | 40 +++--- docs/source/serving/integrations/langchain.md | 2 +- .../source/serving/integrations/llamaindex.md | 2 +- docs/source/serving/metrics.md | 2 +- docs/source/serving/multimodal_inputs.md | 1 - docs/source/serving/offline_inference.md | 2 +- .../serving/openai_compatible_server.md | 20 +-- 54 files changed, 549 insertions(+), 445 deletions(-) create mode 100644 docs/source/api/inference_params.md create mode 100644 docs/source/api/model/adapters.md create mode 100644 docs/source/api/model/index.md create mode 100644 docs/source/api/model/interfaces.md create mode 100644 docs/source/api/model/interfaces_base.md create mode 100644 docs/source/api/multimodal/inputs.md create mode 100644 docs/source/api/multimodal/parse.md create mode 100644 docs/source/api/multimodal/processing.md create mode 100644 docs/source/api/multimodal/profiling.md create mode 100644 docs/source/api/multimodal/registry.md diff --git a/docs/README.md b/docs/README.md index 1a44c1341f4fb..46488c9bb0b92 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,5 +16,4 @@ make html ```bash python -m http.server -d build/html/ ``` - Launch your browser and open localhost:8000. diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md new file mode 100644 index 0000000000000..181c30cab9c4a --- /dev/null +++ b/docs/source/api/inference_params.md @@ -0,0 +1,21 @@ +# Inference Parameters + +Inference parameters for vLLM APIs. + +(sampling-params)= + +## Sampling Parameters + +```{eval-rst} +.. autoclass:: vllm.SamplingParams + :members: +``` + +(pooling-params)= + +## Pooling Parameters + +```{eval-rst} +.. autoclass:: vllm.PoolingParams + :members: +``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md new file mode 100644 index 0000000000000..e103a51d0070d --- /dev/null +++ b/docs/source/api/model/adapters.md @@ -0,0 +1,9 @@ +# Model Adapters + +## Module Contents + +```{eval-rst} +.. 
automodule:: vllm.model_executor.models.adapters + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md new file mode 100644 index 0000000000000..b8437e3c3517a --- /dev/null +++ b/docs/source/api/model/index.md @@ -0,0 +1,12 @@ +# Model Development + +## Submodules + +```{toctree} +:maxdepth: 1 + +interfaces_base +interfaces +adapters +``` + diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md new file mode 100644 index 0000000000000..55bee57f64faa --- /dev/null +++ b/docs/source/api/model/interfaces.md @@ -0,0 +1,9 @@ +# Optional Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces + :members: + :member-order: bysource +``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md new file mode 100644 index 0000000000000..75d58d34228e9 --- /dev/null +++ b/docs/source/api/model/interfaces_base.md @@ -0,0 +1,9 @@ +# Base Model Interfaces + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.model_executor.models.interfaces_base + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index fa2eb5793386e..51e24795a34cf 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -2,10 +2,6 @@ # Multi-Modality -```{eval-rst} -.. currentmodule:: vllm.multimodal -``` - vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) @@ -15,58 +11,18 @@ Looking to add your own multi-modal model? Please follow the instructions listed ## Module Contents -```{eval-rst} -.. automodule:: vllm.multimodal -``` - -### Registry - ```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY ``` -```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalRegistry - :members: - :show-inheritance: -``` - -### Base Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.base - :members: - :show-inheritance: -``` - -### Input Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.inputs - :members: - :show-inheritance: -``` +## Submodules -### Audio Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.audio - :members: - :show-inheritance: -``` +```{toctree} +:maxdepth: 1 -### Image Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.image - :members: - :show-inheritance: -``` - -### Video Classes - -```{eval-rst} -.. automodule:: vllm.multimodal.video - :members: - :show-inheritance: +inputs +parse +processing +profiling +registry ``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md new file mode 100644 index 0000000000000..3d89666113229 --- /dev/null +++ b/docs/source/api/multimodal/inputs.md @@ -0,0 +1,49 @@ +# Input Definitions + +## User-facing inputs + +```{eval-rst} +.. autodata:: vllm.multimodal.MultiModalDataDict +``` + +## Internal data structures + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.PlaceholderRange + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autodata:: vllm.multimodal.inputs.NestedTensors +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig + :members: + :show-inheritance: +``` + +```{eval-rst} +.. 
autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs + :members: + :show-inheritance: +``` + +```{eval-rst} +.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 + :members: + :show-inheritance: +``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md new file mode 100644 index 0000000000000..4676139efe626 --- /dev/null +++ b/docs/source/api/multimodal/parse.md @@ -0,0 +1,9 @@ +# Data Parsing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.parse + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md new file mode 100644 index 0000000000000..0d81c8d3966ee --- /dev/null +++ b/docs/source/api/multimodal/processing.md @@ -0,0 +1,9 @@ +# Data Processing + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.processing + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md new file mode 100644 index 0000000000000..b455145212202 --- /dev/null +++ b/docs/source/api/multimodal/profiling.md @@ -0,0 +1,9 @@ +# Memory Profiling + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.profiling + :members: + :member-order: bysource +``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md new file mode 100644 index 0000000000000..0737a4385cf32 --- /dev/null +++ b/docs/source/api/multimodal/registry.md @@ -0,0 +1,9 @@ +# Registry + +## Module Contents + +```{eval-rst} +.. automodule:: vllm.multimodal.registry + :members: + :member-order: bysource +``` diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index fb93e65673dff..9d2af4c13b088 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -6,7 +6,6 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: - - a16z - Dropbox - Sequoia Capital @@ -14,7 +13,6 @@ Cash Donations: - ZhenFund Compute Resources: - - AMD - Anyscale - AWS diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index e92104399342d..c960790f47a13 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -37,6 +37,8 @@ pytest tests/ Currently, the repository is not fully checked by `mypy`. ``` +# Contribution Guidelines + ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md index 61766f795bdb9..422dc13e6a644 100644 --- a/docs/source/contributing/vulnerability_management.md +++ b/docs/source/contributing/vulnerability_management.md @@ -32,8 +32,8 @@ We prefer to keep all vulnerability-related communication on the security report on GitHub. 
However, if you need to contact the VMT directly for an urgent issue, you may contact the following individuals: -- Simon Mo - <simon.mo@hey.com> -- Russell Bryant - <rbryant@redhat.com> +- Simon Mo - simon.mo@hey.com +- Russell Bryant - rbryant@redhat.com ## Slack Discussion diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index c735bfd0e87a7..2df1aca27f1e6 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -28,8 +28,8 @@ memory to share data between processes under the hood, particularly for tensor p You can build and run vLLM from source via the provided . To build vLLM: ```console -# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` ```{note} diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index 5787c4a407bfb..be018dfb75d7a 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr To install the Cerebrium client, run: ```console -pip install cerebrium -cerebrium login +$ pip install cerebrium +$ cerebrium login ``` Next, create your Cerebrium project, run: ```console -cerebrium init vllm-project +$ cerebrium init vllm-project ``` Next, to install the required packages, add the following to your cerebrium.toml: @@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): Then, run the following code to deploy it to the cloud: ```console -cerebrium deploy +$ cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`) +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index b42a34125c6d7..4142c1d9f1f60 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```console -pip install "dstack[all] -dstack server +$ pip install "dstack[all] +$ dstack server ``` Next, to configure your dstack project, run: ```console -mkdir -p vllm-dstack -cd vllm-dstack -dstack init +$ mkdir -p vllm-dstack +$ cd vllm-dstack +$ dstack init ``` Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 6e7d7b7e51d7b..657e7f2bc72cc 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -332,13 +332,13 @@ run: | ``` -Start the chat web UI: +1. 
Start the chat web UI: ```console sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) ``` -Then, we can access the GUI at the returned gradio link: +2. Then, we can access the GUI at the returned gradio link: ```console | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md index a6c3569637abf..474d2bdfa9580 100644 --- a/docs/source/deployment/integrations/llamastack.md +++ b/docs/source/deployment/integrations/llamastack.md @@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta To install Llama Stack, run ```console -pip install llama-stack -q +$ pip install llama-stack -q ``` ## Inference using OpenAI Compatible API diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 9e982f9c30e9a..760214e112fba 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -14,7 +14,7 @@ Before you begin, ensure that you have the following: ## Deployment Steps -### Create a PVC , Secret and Deployment for vLLM +1. **Create a PVC , Secret and Deployment for vLLM** PVC is used to store the model cache and it is optional, you can use hostPath or other storage options @@ -49,7 +49,7 @@ stringData: Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. -Here are two examples for using NVIDIA GPU and AMD GPU. +Here are two examples for using NVIDIA GPU and AMD GPU. - NVIDIA GPU @@ -194,10 +194,9 @@ spec: - name: shm mountPath: /dev/shm ``` - You can get the full example with steps and sample yaml files from . -### Create a Kubernetes Service for vLLM +2. **Create a Kubernetes Service for vLLM** Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: @@ -220,7 +219,7 @@ spec: type: ClusterIP ``` -### Deploy and Test +3. **Deploy and Test** Apply the deployment and service configurations using `kubectl apply -f `: diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index bbea45eac45bf..4398536b2b4ad 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -6,7 +6,7 @@ The core idea of [PagedAttention](#design-paged-attention) is to partition the K To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. -```text +``` Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -14,16 +14,19 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` + In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: -```text +``` hash(prefix tokens + block tokens) <--> KV Block ``` With this mapping, we can add another indirection in vLLM’s KV cache management. 
Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. + This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. + ## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. @@ -38,5 +41,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: -* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. +- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index da87638e5b743..c2cdb75ea08a7 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. 
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 404505eb3890e..3679595e3d4d0 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -pip install autoawq +$ pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index c0cde55685445..f7f41726f3725 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```console -pip install bitsandbytes>=0.45.0 +$ pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint +## Read quantized checkpoint. 
```python from vllm import LLM import torch # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` ## Inflight quantization: load as 4bit quantization ```python from vllm import LLM import torch model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` - ## OpenAI Compatible Server Append the following to your 4bit model arguments: -```bash +``` --quantization bitsandbytes --load-format bitsandbytes ``` diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index da49cd2747228..b2eda74fd1e3b 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -pip install llmcompressor +$ pip install llmcompressor ``` ## Quantization Process @@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -pip install vllm lm-eval==0.4.4 +$ pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index d233cdf0a464c..50edaf81fddd3 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -3,7 +3,7 @@ # FP8 E4M3 KV Cache Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project <www.opencompute.org>) specifies two common 8-bit floating point data formats: E5M2 +improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 (5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside @@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). To install AMMO (AlgorithMic Model Optimization): ```console -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo ``` Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 640997cf4bc39..eebf11dfc1b2b 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. 
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console -# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` ```{warning} diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 82a15d76d352f..1ac50ba987dda 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -pip install llmcompressor +$ pip install llmcompressor ``` ## Quantization Process diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index ab7b2f302bd13..903acadb71426 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models: 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be used directly with vLLM due to differences in the expected layer names and model definition. - To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert them. Note that this script does not modify the model's weights. In the above example, use the script to first convert - the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model and then use the converted checkpoint as the draft model in vLLM. 2. The EAGLE based draft models need to be run without tensor parallelism @@ -207,6 +207,7 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). 
+ A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | @@ -223,6 +224,7 @@ A variety of EAGLE draft models are available on the Hugging Face hub: | Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | | Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | + ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -248,6 +250,8 @@ speculative decoding, breaking down the guarantees into three key areas: same request across runs. For more details, see the FAQ section titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). +**Conclusion** + While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -255,6 +259,8 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. +**Mitigation Strategies** + For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index f8ef6795e9d5d..062f2021eb62a 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -55,15 +55,13 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: - -```text +``` Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: - - Setting up the server with tool calling enabled - Defining an actual function to handle tool calls - Making a request with `tool_choice="auto"` @@ -72,7 +70,6 @@ This example demonstrates: You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. Remember that it's the callers responsibility to: - 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -80,27 +77,26 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. ## Named Function Calling - vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. 
For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. + ## Automatic Function Calling To enable this feature, you should set the following flags: - -- `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -- `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. -- `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. -- `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +* `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. +* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their `tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) @@ -108,54 +104,54 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. +* `NousResearch/Hermes-2-Pro-*` +* `NousResearch/Hermes-2-Theta-*` +* `NousResearch/Hermes-3-*` -- `NousResearch/Hermes-2-Pro-*` -- `NousResearch/Hermes-2-Theta-*` -- `NousResearch/Hermes-3-*` _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` + ### Mistral Models (`mistral`) Supported models: - -- `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) -- Additional mistral function-calling models are compatible as well. +* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) +* Additional mistral function-calling models are compatible as well. Known issues: - 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. 
Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition is not met, the following additional chat templates are provided: -- `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that +* `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) -- `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt +* `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. + Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + ### Llama Models (`llama3_json`) Supported models: - -- `meta-llama/Meta-Llama-3.1-8B-Instruct` -- `meta-llama/Meta-Llama-3.1-70B-Instruct` -- `meta-llama/Meta-Llama-3.1-405B-Instruct` -- `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` +* `meta-llama/Meta-Llama-3.1-8B-Instruct` +* `meta-llama/Meta-Llama-3.1-70B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: - 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -168,68 +164,64 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: - -- `ibm-granite/granite-3.0-8b-instruct` +* `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` `examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. -- `ibm-granite/granite-3.1-8b-instruct` +* `ibm-granite/granite-3.1-8b-instruct` Recommended flags: `--tool-call-parser granite` The chat template from Huggingface can be used directly. Parallel function calls are supported. -- `ibm-granite/granite-20b-functioncalling` +* `ibm-granite/granite-20b-functioncalling` Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. 
+ ### InternLM Models (`internlm`) Supported models: - -- `internlm/internlm2_5-7b-chat` (confirmed) -- Additional internlm2.5 function-calling models are compatible as well +* `internlm/internlm2_5-7b-chat` (confirmed) +* Additional internlm2.5 function-calling models are compatible as well Known issues: - -- Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. +* Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` -### Jamba Models (`jamba`) +### Jamba Models (`jamba`) AI21's Jamba-1.5 models are supported. +* `ai21labs/AI21-Jamba-1.5-Mini` +* `ai21labs/AI21-Jamba-1.5-Large` -- `ai21labs/AI21-Jamba-1.5-Mini` -- `ai21labs/AI21-Jamba-1.5-Large` Flags: `--tool-call-parser jamba` + ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: - ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: - -- The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) -- Llama's smaller models struggle to use tools effectively. +* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +* Llama's smaller models struggle to use tools effectively. Example supported models: - -- `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) -- `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) -- `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) -- `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) Flags: `--tool-call-parser pythonic --chat-template {see_above}` @@ -239,6 +231,7 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. --- + ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. 
@@ -291,8 +284,7 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. - -```bash +``` --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index 4751b325e6fc4..fde2954f10c59 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -## Mitigation Strategies +**Mitigation Strategies** - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md index 29e9892f1ba12..b55e4384d064d 100644 --- a/docs/source/getting_started/installation/cpu-apple.md +++ b/docs/source/getting_started/installation/cpu-apple.md @@ -18,23 +18,25 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -r requirements-cpu.txt -pip install -e . +``` +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -r requirements-cpu.txt +$ pip install -e . ``` ```{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. ``` + + ## Troubleshooting -If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your +If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). -```text +``` [...] fatal error: 'map' file not found 1 | #include | ^~~~~ @@ -46,3 +48,4 @@ If the build has error like the following snippet where standard C++ headers can | ^~~~~~~~~ 1 error generated. ``` + diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index df2c0323e6305..bb046dd0fd9dc 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -48,23 +48,23 @@ $ docker run -it \ - First, install recommended compiler. We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. 
For example, on Ubuntu 22.4, you can run: ```console -sudo apt-get update -y -sudo apt-get install -y gcc-12 g++-12 libnuma-dev -sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +$ sudo apt-get update -y +$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev +$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` - Second, install Python packages for vLLM CPU backend building: ```console -pip install --upgrade pip -pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +$ pip install --upgrade pip +$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, build and install vLLM CPU backend: ```console -VLLM_TARGET_DEVICE=cpu python setup.py install +$ VLLM_TARGET_DEVICE=cpu python setup.py install ``` ```{note} @@ -92,18 +92,18 @@ VLLM_TARGET_DEVICE=cpu python setup.py install - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -find / -name *libtcmalloc* # find the dynamic link library path -export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -python examples/offline_inference/offline_inference.py # run vLLM +$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +$ find / -name *libtcmalloc* # find the dynamic link library path +$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +$ python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -export VLLM_CPU_KVCACHE_SPACE=40 -export VLLM_CPU_OMP_THREADS_BIND=0-29 -vllm serve facebook/opt-125m +$ export VLLM_CPU_KVCACHE_SPACE=40 +$ export VLLM_CPU_OMP_THREADS_BIND=0-29 +$ vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -148,7 +148,7 @@ $ python examples/offline_inference/offline_inference.py - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. 
Below is the example script to enable Tensor Parallel = 2 for serving: ```console - VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md index e8606b21169b7..419b8163fc034 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -17,9 +17,9 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin You can create a new Python environment using `conda`: ```console -# (Recommended) Create a new conda environment. -conda create -n myenv python=3.12 -y -conda activate myenv +$ # (Recommended) Create a new conda environment. +$ conda create -n myenv python=3.12 -y +$ conda activate myenv ``` ```{note} @@ -29,9 +29,9 @@ conda activate myenv Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: ```console -# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. -uv venv myenv --python 3.12 --seed -source myenv/bin/activate +$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +$ uv venv myenv --python 3.12 --seed +$ source myenv/bin/activate ``` In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -43,18 +43,18 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -# Install vLLM with CUDA 12.1. -pip install vllm # If you are using pip. -uv pip install vllm # If you are using uv. +$ # Install vLLM with CUDA 12.1. +$ pip install vllm # If you are using pip. +$ uv pip install vllm # If you are using uv. ``` As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: ```console -# Install vLLM with CUDA 11.8. -export VLLM_VERSION=0.6.1.post1 -export PYTHON_VERSION=310 -pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +$ # Install vLLM with CUDA 11.8. 
+$ export VLLM_VERSION=0.6.1.post1 +$ export PYTHON_VERSION=310 +$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` (install-the-latest-code)= @@ -66,7 +66,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ### Install the latest code using `pip` ```console -pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -74,8 +74,8 @@ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. @@ -85,14 +85,14 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console -export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. @@ -102,8 +102,8 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb Another way to access the latest code is to use the docker images: ```console -export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. @@ -121,18 +121,18 @@ The latest code can contain bugs and may not be stable. Please use it with cauti If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -VLLM_USE_PRECOMPILED=1 pip install --editable . +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel from <https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl> and use the compiled libraries from there in the installation. +This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -pip install --editable . +$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +$ pip install --editable . ``` You can find more information about vLLM's wheels [above](#install-the-latest-code). @@ -147,9 +147,9 @@ It is recommended to use the same commit ID for the source code as the vLLM whee If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -pip install -e . +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -e . ``` ```{tip} @@ -172,11 +172,11 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -python use_existing_torch.py -pip install -r requirements-build.txt -pip install -e . --no-build-isolation +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python use_existing_torch.py +$ pip install -r requirements-build.txt +$ pip install -e . 
--no-build-isolation ``` #### Use the local cutlass for compilation @@ -185,9 +185,9 @@ Currently, before starting the build process, vLLM fetches cutlass code from Git To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` #### Troubleshooting @@ -196,8 +196,8 @@ To avoid your system being overloaded, you can limit the number of compilation j to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -export MAX_JOBS=6 -pip install -e . +$ export MAX_JOBS=6 +$ pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -206,22 +206,22 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -# Use `--ipc=host` to make sure the shared memory is large enough. -docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +$ # Use `--ipc=host` to make sure the shared memory is large enough. +$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -export CUDA_HOME=/usr/local/cuda -export PATH="${CUDA_HOME}/bin:$PATH" +$ export CUDA_HOME=/usr/local/cuda +$ export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -nvcc --version # verify that nvcc is in your PATH -${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +$ nvcc --version # verify that nvcc is in your PATH +$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` ### Unsupported OS build @@ -231,6 +231,6 @@ vLLM can fully run only on Linux but for development purposes, you can still bui Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -export VLLM_TARGET_DEVICE=empty -pip install -e . +$ export VLLM_TARGET_DEVICE=empty +$ pip install -e . ``` diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md index 2f3510c8a7334..e36b92513e31d 100644 --- a/docs/source/getting_started/installation/gpu-rocm.md +++ b/docs/source/getting_started/installation/gpu-rocm.md @@ -47,13 +47,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: ```console -DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . 
``` To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: @@ -80,84 +80,84 @@ Where the `` is the location where the model is stored, for examp 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): - - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) - - [PyTorch](https://pytorch.org/) +- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) +- [PyTorch](https://pytorch.org/) - For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. +For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. - Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) +Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) - Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) +Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) - ```console - python3 -m pip install ninja cmake wheel pybind11 - pip uninstall -y triton - git clone https://github.com/OpenAI/triton.git - cd triton - git checkout e192dba - cd python - pip3 install . - cd ../.. - ``` +```console +$ python3 -m pip install ninja cmake wheel pybind11 +$ pip uninstall -y triton +$ git clone https://github.com/OpenAI/triton.git +$ cd triton +$ git checkout e192dba +$ cd python +$ pip3 install . +$ cd ../.. +``` - ```{note} - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ``` +```{note} +- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. +``` 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) - Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) - Alternatively, wheels intended for vLLM use can be accessed under the releases. +Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) +Alternatively, wheels intended for vLLM use can be accessed under the releases. - For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. +For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. 
- ```console - git clone https://github.com/ROCm/flash-attention.git - cd flash-attention - git checkout 3cea2fb - git submodule update --init - GPU_ARCHS="gfx90a" python3 setup.py install - cd .. - ``` +```console +$ git clone https://github.com/ROCm/flash-attention.git +$ cd flash-attention +$ git checkout 3cea2fb +$ git submodule update --init +$ GPU_ARCHS="gfx90a" python3 setup.py install +$ cd .. +``` - ```{note} - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ``` +```{note} +- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) +``` 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: - ```bash - $ pip install --upgrade pip +```bash +$ pip install --upgrade pip - # Install PyTorch - $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 +# Install PyTorch +$ pip uninstall torch -y +$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 - # Build & install AMD SMI - $ pip install /opt/rocm/share/amd_smi +# Build & install AMD SMI +$ pip install /opt/rocm/share/amd_smi - # Install dependencies - $ pip install --upgrade numba scipy huggingface-hub[cli] - $ pip install "numpy<2" - $ pip install -r requirements-rocm.txt +# Install dependencies +$ pip install --upgrade numba scipy huggingface-hub[cli] +$ pip install "numpy<2" +$ pip install -r requirements-rocm.txt - # Build vLLM for MI210/MI250/MI300. - $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" - $ python3 setup.py develop - ``` +# Build vLLM for MI210/MI250/MI300. +$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" +$ python3 setup.py develop +``` - This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. +This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - ```{tip} - - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ``` +```{tip} +- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. +- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. +- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. +- The ROCm version of PyTorch, ideally, should match the ROCm driver version. +``` - ```{tip} - - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. 
- For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). - ``` +```{tip} +- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). +``` diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md index a6407b0447a8d..1d50cef3bdc83 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -22,8 +22,8 @@ Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optim ### Quick start using Dockerfile ```console -docker build -f Dockerfile.hpu -t vllm-hpu-env . -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +$ docker build -f Dockerfile.hpu -t vllm-hpu-env . +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` ```{tip} @@ -37,10 +37,10 @@ If you're observing the following error: `docker: Error response from daemon: Un To verify that the Intel Gaudi software was correctly installed, run: ```console -hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -pip list | grep neural # verify that neural_compressor is installed +$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +$ pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack @@ -57,8 +57,8 @@ for more details. 
Use the following commands to run a Docker image: ```console -docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` #### Build and Install vLLM @@ -66,18 +66,18 @@ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_va To build and install vLLM from source, run: ```console -git clone https://github.com/vllm-project/vllm.git -cd vllm -python setup.py develop +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -git clone https://github.com/HabanaAI/vllm-fork.git -cd vllm-fork -git checkout habana_main -python setup.py develop +$ git clone https://github.com/HabanaAI/vllm-fork.git +$ cd vllm-fork +$ git checkout habana_main +$ python setup.py develop ``` ## Supported Features @@ -181,7 +181,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -```text +``` INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -192,7 +192,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -```text +``` min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -201,7 +201,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -```text +``` min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -224,7 +224,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -```text +``` INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -273,7 +273,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -```text +``` INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -350,18 +350,18 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Prompt: : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/neuron.md index 5581b1940ca46..431f90537f543 100644 --- a/docs/source/getting_started/installation/neuron.md +++ b/docs/source/getting_started/installation/neuron.md @@ -123,10 +123,10 @@ python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torch Once neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -git clone 
https://github.com/vllm-project/vllm.git -cd vllm -pip install -U -r requirements-neuron.txt -VLLM_TARGET_DEVICE="neuron" pip install . +$ git clone https://github.com/vllm-project/vllm.git +$ cd vllm +$ pip install -U -r requirements-neuron.txt +$ VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/openvino.md index d97d4173bf36b..60f95fd1c4250 100644 --- a/docs/source/getting_started/installation/openvino.md +++ b/docs/source/getting_started/installation/openvino.md @@ -27,8 +27,8 @@ vLLM powered by OpenVINO supports all LLM models from [vLLM supported models lis ## Quick start using Dockerfile ```console -docker build -f Dockerfile.openvino -t vllm-openvino-env . -docker run -it --rm vllm-openvino-env +$ docker build -f Dockerfile.openvino -t vllm-openvino-env . +$ docker run -it --rm vllm-openvino-env ``` (install-openvino-backend-from-source)= @@ -38,21 +38,21 @@ docker run -it --rm vllm-openvino-env - First, install Python. For example, on Ubuntu 22.04, you can run: ```console - sudo apt-get update -y - sudo apt-get install python3 + $ sudo apt-get update -y + $ sudo apt-get install python3 ``` - Second, install prerequisites vLLM OpenVINO backend installation: ```console - pip install --upgrade pip - pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + $ pip install --upgrade pip + $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, install vLLM with OpenVINO backend: ```console - PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . ``` - [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/tpu.md index 1938785ade46a..bc93c44fead30 100644 --- a/docs/source/getting_started/installation/tpu.md +++ b/docs/source/getting_started/installation/tpu.md @@ -156,14 +156,14 @@ For more information about using TPUs with GKE, see You can use to build a Docker image with TPU support. ```console -docker build -f Dockerfile.tpu -t vllm-tpu . +$ docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -# Make sure to add `--privileged --net host --shm-size=16G`. -docker run --privileged --net host --shm-size=16G -it vllm-tpu +$ # Make sure to add `--privileged --net host --shm-size=16G`. 
+$ docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index d35e117a8446f..c1ab5478eb652 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -40,15 +40,15 @@ $ docker run -it \ - Second, install Python packages for vLLM XPU backend building: ```console -source /opt/intel/oneapi/setvars.sh -pip install --upgrade pip -pip install -v -r requirements-xpu.txt +$ source /opt/intel/oneapi/setvars.sh +$ pip install --upgrade pip +$ pip install -v -r requirements-xpu.txt ``` - Finally, build and install vLLM XPU backend: ```console -VLLM_TARGET_DEVICE=xpu python setup.py install +$ VLLM_TARGET_DEVICE=xpu python setup.py install ``` ```{note} @@ -61,14 +61,14 @@ VLLM_TARGET_DEVICE=xpu python setup.py install XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: ```console -python -m vllm.entrypoints.openai.api_server \ ---model=facebook/opt-13b \ ---dtype=bfloat16 \ ---device=xpu \ ---max_model_len=1024 \ ---distributed-executor-backend=ray \ ---pipeline-parallel-size=2 \ --tp=8 +$ python -m vllm.entrypoints.openai.api_server \ +$ --model=facebook/opt-13b \ +$ --dtype=bfloat16 \ +$ --device=xpu \ +$ --max_model_len=1024 \ +$ --distributed-executor-backend=ray \ +$ --pipeline-parallel-size=2 \ +$ -tp=8 ``` By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index b61b3df8cc043..ea15d9ef065fa 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -15,12 +15,21 @@ This guide will help you quickly get started with vLLM to perform: ## Installation If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly. -It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. + +It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: + +```console +$ uv venv myenv --python 3.12 --seed +$ source myenv/bin/activate +$ uv pip install vllm +``` + +You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. ```console -conda create -n myenv python=3.10 -y -conda activate myenv -pip install vllm +$ conda create -n myenv python=3.12 -y +$ conda activate myenv +$ pip install vllm ``` ```{note} @@ -85,7 +94,7 @@ By default, it starts the server at `http://localhost:8000`. 
You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -vllm serve Qwen/Qwen2.5-1.5B-Instruct +$ vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -96,7 +105,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. For example, to list the models: ```console -curl http://localhost:8000/v1/models +$ curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. @@ -106,14 +115,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -curl http://localhost:8000/v1/completions \ --H "Content-Type: application/json" \ --d '{ -"model": "Qwen/Qwen2.5-1.5B-Instruct", -"prompt": "San Francisco is a", -"max_tokens": 7, -"temperature": 0 -}' +$ curl http://localhost:8000/v1/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "prompt": "San Francisco is a", +$ "max_tokens": 7, +$ "temperature": 0 +$ }' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -142,15 +151,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -curl http://localhost:8000/v1/chat/completions \ --H "Content-Type: application/json" \ --d '{ -"model": "Qwen/Qwen2.5-1.5B-Instruct", -"messages": [ -{"role": "system", "content": "You are a helpful assistant."}, -{"role": "user", "content": "Who won the world series in 2020?"} -] -}' +$ curl http://localhost:8000/v1/chat/completions \ +$ -H "Content-Type: application/json" \ +$ -d '{ +$ "model": "Qwen/Qwen2.5-1.5B-Instruct", +$ "messages": [ +$ {"role": "system", "content": "You are a helpful assistant."}, +$ {"role": "user", "content": "Who won the world series in 2020?"} +$ ] +$ }' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 1e290d2b4c0bd..f5efe0bef7506 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -48,7 +48,6 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. (troubleshooting-incorrect-hardware-driver)= - ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. 
@@ -119,13 +118,13 @@ dist.destroy_process_group() If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console -NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py ``` If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: ```console -NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py ``` If the script runs successfully, you should see the message `sanity check is successful!`. @@ -142,7 +141,6 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup ``` (troubleshooting-python-multiprocessing)= - ## Python multiprocessing ### `RuntimeError` Exception diff --git a/docs/source/index.md b/docs/source/index.md index 8a32e782eda75..23e4304fe29d9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# Welcome to vLLM +# Welcome to vLLM! ```{figure} ./assets/logos/vllm-logo-text-light.png :align: center @@ -139,8 +139,9 @@ community/sponsors api/offline_inference/index api/engine/index +api/inference_params api/multimodal/index -api/params +api/model/index ``` % Design Documents: Details about vLLM internals @@ -171,7 +172,7 @@ contributing/model/index contributing/vulnerability_management ``` -## Indices and tables +# Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 75f7a9fcad416..fe2701194a604 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -pip3 install vllm[runai] +$ pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. 
```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console -vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 53f8fac38f18b..acbe27a22a679 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -export VLLM_USE_MODELSCOPE=True +$ export VLLM_USE_MODELSCOPE=True ``` And use with `trust_remote_code=True`. @@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2ForCausalLM` - - Qwen2 + - QwQ, Qwen2 - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - ✅︎ - ✅︎ @@ -436,7 +436,7 @@ loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/t ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) @@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, @@ -499,7 +499,7 @@ e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "r ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) @@ -550,6 +550,28 @@ On the other hand, modalities separated by `/` are mutually exclusive. 
See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. +````{important} +To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + +Offline inference: +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +Online inference: +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + +```{note} +vLLM currently only supports adding LoRA to the language backbone of multimodal models. +``` + ### Generative Models See [this page](#generative-models) for more information on how to use generative models. @@ -689,14 +711,14 @@ See [this page](#generative-models) for more information on how to use generativ * - `Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - ✅︎ - ✅︎ * - `PixtralForConditionalGeneration` - Pixtral - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - ✅︎ - ✅︎ @@ -715,7 +737,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2VLForConditionalGeneration` - - Qwen2-VL + - QVQ, Qwen2-VL - T + IE+ + VE+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - ✅︎ @@ -733,26 +755,6 @@ See [this page](#generative-models) for more information on how to use generativ E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{important} -To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - -```python -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 -``` -```` - -```{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. -``` - ```{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` @@ -762,6 +764,11 @@ The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (` For more details, please see: ``` +```{note} +The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). +A corrected version is available at . +``` + ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. @@ -820,9 +827,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. 
**Call for contribution:** PRs coming directly from model vendors are greatly appreciated! 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - ```{tip} - When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ``` +```{tip} +When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. +``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 4fbc376e1aa39..4fcde9b03b887 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -```text +``` WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. 
total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index ce8708b25be0e..4e0a9ef6ecf7d 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a") To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console -vllm serve facebook/opt-13b \ ---tensor-parallel-size 4 +$ vllm serve facebook/opt-13b \ +$ --tensor-parallel-size 4 ``` You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -vllm serve gpt2 \ ---tensor-parallel-size 4 \ ---pipeline-parallel-size 2 +$ vllm serve gpt2 \ +$ --tensor-parallel-size 4 \ +$ --pipeline-parallel-size 2 ``` ## Running vLLM on multiple nodes @@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. We have Pick a node as the head node, and run the following command: ```console -bash run_cluster.sh \ -vllm/vllm-openai \ -ip_of_head_node \ ---head \ -/path/to/the/huggingface/home/in/this/node +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --head \ +$ /path/to/the/huggingface/home/in/this/node ``` On the rest of the worker nodes, run the following command: ```console -bash run_cluster.sh \ -vllm/vllm-openai \ -ip_of_head_node \ ---worker \ -/path/to/the/huggingface/home/in/this/node +$ bash run_cluster.sh \ +$ vllm/vllm-openai \ +$ ip_of_head_node \ +$ --worker \ +$ /path/to/the/huggingface/home/in/this/node ``` Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. @@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console -vllm serve /path/to/the/model/in/the/container \ ---tensor-parallel-size 8 \ ---pipeline-parallel-size 2 +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 8 \ +$ --pipeline-parallel-size 2 ``` You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console -vllm serve /path/to/the/model/in/the/container \ ---tensor-parallel-size 16 +$ vllm serve /path/to/the/model/in/the/container \ +$ --tensor-parallel-size 16 ``` To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. 
To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md index 03142d23b145a..49ff6e0c32a72 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain To install LangChain, run ```console -pip install langchain langchain_community -q +$ pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md index 8c72605202cf5..9961c181d7e1c 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index To install LlamaIndex, run ```console -pip install llama-index-llms-vllm -q +$ pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 6c84f6d1350a6..e6ded2e6dd465 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -7,7 +7,7 @@ OpenAI compatible API server. You can start the server using Python, or using [Docker](#deployment-docker): ```console -vllm serve unsloth/Llama-3.2-1B-Instruct +$ vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index bc475826bbfde..9f5e1b908d786 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -303,7 +303,6 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: - ```python from openai import OpenAI diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 94703a1c32ade..79092ab208784 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -64,7 +64,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her #### Context length and batch size -You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) +You can further reduce memory usage by limit the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` option). 
```python diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index fad38664605f5..ec5a367594743 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -5,13 +5,11 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): - ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. - ```python from openai import OpenAI client = OpenAI( @@ -52,7 +50,6 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= - ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -74,7 +71,6 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: - ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -84,7 +80,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -119,12 +115,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See for more details. +> See https://github.com/vllm-project/vllm/pull/11529 for more details. ```python completion = client.chat.completions.create( @@ -151,7 +147,6 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= - ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. @@ -180,7 +175,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -vllm serve SOME_MODEL --config config.yaml +$ vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -191,7 +186,6 @@ The order of priorities is `command line > config file values > defaults`. 
## API Reference (completions-api)= - ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); @@ -218,7 +212,6 @@ The following extra parameters are supported: ``` (chat-api)= - ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -227,7 +220,6 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - - *Note: `image_url.detail` parameter is not supported.* Code example: @@ -251,7 +243,6 @@ The following extra parameters are supported: ``` (embeddings-api)= - ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -293,7 +284,6 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= - ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -303,7 +293,6 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= - ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. @@ -313,7 +302,6 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu Code example: (score-api)= - ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. 
From d4a6be70efe89b577630aa0c8d79aca86aad0f17 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 13:20:25 -0500 Subject: [PATCH 09/15] Revert Signed-off-by: Rafael Vasquez --- .github/workflows/doc-lint.yml | 23 --------- .github/workflows/sphinx-lint.yml | 39 +++++++++++++++ .markdownlint-cli2.yaml | 24 --------- docs/source/api/inference_params.md | 21 -------- docs/source/api/model/adapters.md | 9 ---- docs/source/api/model/index.md | 12 ----- docs/source/api/model/interfaces.md | 9 ---- docs/source/api/model/interfaces_base.md | 9 ---- docs/source/api/multimodal/index.md | 61 ++++++++++++++++++++--- docs/source/api/multimodal/inputs.md | 49 ------------------ docs/source/api/multimodal/parse.md | 9 ---- docs/source/api/multimodal/processing.md | 9 ---- docs/source/api/multimodal/profiling.md | 9 ---- docs/source/api/multimodal/registry.md | 9 ---- docs/source/api/params.md | 1 + docs/source/design/multiprocessing.md | 2 +- docs/source/getting_started/quickstart.md | 13 +---- docs/source/index.md | 3 +- docs/source/models/supported_models.md | 61 ++++++++++------------- docs/source/serving/offline_inference.md | 2 +- requirements-lint.txt | 1 + tools/sphinx-lint.sh | 3 ++ 22 files changed, 129 insertions(+), 249 deletions(-) delete mode 100644 .github/workflows/doc-lint.yml create mode 100644 .github/workflows/sphinx-lint.yml delete mode 100644 .markdownlint-cli2.yaml delete mode 100644 docs/source/api/inference_params.md delete mode 100644 docs/source/api/model/adapters.md delete mode 100644 docs/source/api/model/index.md delete mode 100644 docs/source/api/model/interfaces.md delete mode 100644 docs/source/api/model/interfaces_base.md delete mode 100644 docs/source/api/multimodal/inputs.md delete mode 100644 docs/source/api/multimodal/parse.md delete mode 100644 docs/source/api/multimodal/processing.md delete mode 100644 docs/source/api/multimodal/profiling.md delete mode 100644 docs/source/api/multimodal/registry.md create mode 100755 tools/sphinx-lint.sh diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml deleted file mode 100644 index 471a7b7df3a8f..0000000000000 --- a/.github/workflows/doc-lint.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - main - paths: - - "docs/**" - pull_request: - branches: - - main - paths: - - "docs/**" - -jobs: - markdown-lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - - uses: DavidAnson/markdownlint-cli2-action@v19 - with: - config: ".markdownlint-cli2.yaml" - globs: "docs/**/*.md" diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/sphinx-lint.yml new file mode 100644 index 0000000000000..ca0f9b523c361 --- /dev/null +++ b/.github/workflows/sphinx-lint.yml @@ -0,0 +1,39 @@ +name: Lint documentation + +on: + push: + branches: + - main + paths: + - "docs/**" + pull_request: + branches: + - main + paths: + - "docs/**" + +jobs: + sphinx-lint: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Linting docs + run: tools/sphinx-lint.sh + markdown-lint: + runs-on: ubuntu-latest + steps: + 
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 + - uses: DavidAnson/markdownlint-cli2-action@v19 + with: + globs: "docs/**/*.md" \ No newline at end of file diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml deleted file mode 100644 index 32ccc5178ae2e..0000000000000 --- a/.markdownlint-cli2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Disable some built-in rules -config: - link-fragments: false - line-length: false - no-inline-html: false - first-line-heading: false - no-duplicate-heading: - siblings_only: true - -# Ignore files referenced by .gitignore (only valid at root) -gitignore: true - -# Define glob expressions to use (only valid at root) -globs: - - "docs/**/*.md" - -# Disable banner message on stdout (only valid at root) -noBanner: true - -# Disable progress on stdout (only valid at root) -noProgress: false - -# Show found files on stdout (only valid at root) -showFound: true \ No newline at end of file diff --git a/docs/source/api/inference_params.md b/docs/source/api/inference_params.md deleted file mode 100644 index 181c30cab9c4a..0000000000000 --- a/docs/source/api/inference_params.md +++ /dev/null @@ -1,21 +0,0 @@ -# Inference Parameters - -Inference parameters for vLLM APIs. - -(sampling-params)= - -## Sampling Parameters - -```{eval-rst} -.. autoclass:: vllm.SamplingParams - :members: -``` - -(pooling-params)= - -## Pooling Parameters - -```{eval-rst} -.. autoclass:: vllm.PoolingParams - :members: -``` diff --git a/docs/source/api/model/adapters.md b/docs/source/api/model/adapters.md deleted file mode 100644 index e103a51d0070d..0000000000000 --- a/docs/source/api/model/adapters.md +++ /dev/null @@ -1,9 +0,0 @@ -# Model Adapters - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.adapters - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md deleted file mode 100644 index b8437e3c3517a..0000000000000 --- a/docs/source/api/model/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# Model Development - -## Submodules - -```{toctree} -:maxdepth: 1 - -interfaces_base -interfaces -adapters -``` - diff --git a/docs/source/api/model/interfaces.md b/docs/source/api/model/interfaces.md deleted file mode 100644 index 55bee57f64faa..0000000000000 --- a/docs/source/api/model/interfaces.md +++ /dev/null @@ -1,9 +0,0 @@ -# Optional Interfaces - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.interfaces - :members: - :member-order: bysource -``` diff --git a/docs/source/api/model/interfaces_base.md b/docs/source/api/model/interfaces_base.md deleted file mode 100644 index 75d58d34228e9..0000000000000 --- a/docs/source/api/model/interfaces_base.md +++ /dev/null @@ -1,9 +0,0 @@ -# Base Model Interfaces - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.model_executor.models.interfaces_base - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 51e24795a34cf..0046b73ea825e 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -2,6 +2,10 @@ # Multi-Modality +```{eval-rst} +.. currentmodule:: vllm.multimodal +``` + vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package. Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models) @@ -9,20 +13,61 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. 
Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). + ## Module Contents +```{eval-rst} +.. automodule:: vllm.multimodal +``` + +### Registry + ```{eval-rst} .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY ``` -## Submodules +```{eval-rst} +.. autoclass:: vllm.multimodal.MultiModalRegistry + :members: + :show-inheritance: +``` + +### Base Classes + +```{eval-rst} +.. automodule:: vllm.multimodal.base + :members: + :show-inheritance: +``` -```{toctree} -:maxdepth: 1 +### Input Classes -inputs -parse -processing -profiling -registry +```{eval-rst} +.. automodule:: vllm.multimodal.inputs + :members: + :show-inheritance: +``` + +### Audio Classes + +```{eval-rst} +.. automodule:: vllm.multimodal.audio + :members: + :show-inheritance: +``` + +### Image Classes + +```{eval-rst} +.. automodule:: vllm.multimodal.image + :members: + :show-inheritance: +``` + +### Video Classes + +```{eval-rst} +.. automodule:: vllm.multimodal.video + :members: + :show-inheritance: ``` diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md deleted file mode 100644 index 3d89666113229..0000000000000 --- a/docs/source/api/multimodal/inputs.md +++ /dev/null @@ -1,49 +0,0 @@ -# Input Definitions - -## User-facing inputs - -```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict -``` - -## Internal data structures - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.PlaceholderRange - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autodata:: vllm.multimodal.inputs.NestedTensors -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs - :members: - :show-inheritance: -``` - -```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 - :members: - :show-inheritance: -``` diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md deleted file mode 100644 index 4676139efe626..0000000000000 --- a/docs/source/api/multimodal/parse.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Parsing - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.parse - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md deleted file mode 100644 index 0d81c8d3966ee..0000000000000 --- a/docs/source/api/multimodal/processing.md +++ /dev/null @@ -1,9 +0,0 @@ -# Data Processing - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.processing - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md deleted file mode 100644 index b455145212202..0000000000000 --- a/docs/source/api/multimodal/profiling.md +++ /dev/null @@ -1,9 +0,0 @@ -# Memory Profiling - -## Module Contents - -```{eval-rst} -.. 
automodule:: vllm.multimodal.profiling - :members: - :member-order: bysource -``` diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md deleted file mode 100644 index 0737a4385cf32..0000000000000 --- a/docs/source/api/multimodal/registry.md +++ /dev/null @@ -1,9 +0,0 @@ -# Registry - -## Module Contents - -```{eval-rst} -.. automodule:: vllm.multimodal.registry - :members: - :member-order: bysource -``` diff --git a/docs/source/api/params.md b/docs/source/api/params.md index 56e6fb664798b..a3b4d9cbb44ec 100644 --- a/docs/source/api/params.md +++ b/docs/source/api/params.md @@ -19,3 +19,4 @@ Optional parameters for vLLM APIs. .. autoclass:: vllm.PoolingParams :members: ``` + diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index c2cdb75ea08a7..da87638e5b743 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -21,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index ea15d9ef065fa..2808e1b386801 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -15,19 +15,10 @@ This guide will help you quickly get started with vLLM to perform: ## Installation If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/project/vllm/) directly. - -It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: - -```console -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate -$ uv pip install vllm -``` - -You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. +It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. ```console -$ conda create -n myenv python=3.12 -y +$ conda create -n myenv python=3.10 -y $ conda activate myenv $ pip install vllm ``` diff --git a/docs/source/index.md b/docs/source/index.md index 23e4304fe29d9..6747a7fcce4fe 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -139,9 +139,8 @@ community/sponsors api/offline_inference/index api/engine/index -api/inference_params api/multimodal/index -api/model/index +api/params ``` % Design Documents: Details about vLLM internals diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index acbe27a22a679..3ba34c77205e5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -322,7 +322,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2ForCausalLM` - - QwQ, Qwen2 + - Qwen2 - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. 
- ✅︎ - ✅︎ @@ -436,7 +436,7 @@ loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/t ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) @@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, @@ -499,7 +499,7 @@ e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "r ``` If your model is not in the above list, we will try to automatically convert the model using -{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) @@ -550,28 +550,6 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. -````{important} -To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) -or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: - -Offline inference: -```python -llm = LLM( - model="Qwen/Qwen2-VL-7B-Instruct", - limit_mm_per_prompt={"image": 4}, -) -``` - -Online inference: -```bash -vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 -``` -```` - -```{note} -vLLM currently only supports adding LoRA to the language backbone of multimodal models. -``` - ### Generative Models See [this page](#generative-models) for more information on how to use generative models. @@ -711,14 +689,14 @@ See [this page](#generative-models) for more information on how to use generativ * - `Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. - - ✅︎ - ✅︎ * - `PixtralForConditionalGeneration` - Pixtral - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. - - ✅︎ - ✅︎ @@ -737,7 +715,7 @@ See [this page](#generative-models) for more information on how to use generativ - ✅︎ - ✅︎ * - `Qwen2VLForConditionalGeneration` - - QVQ, Qwen2-VL + - Qwen2-VL - T + IE+ + VE+ - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ @@ -755,18 +733,33 @@ See [this page](#generative-models) for more information on how to use generativ E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. +````{important} +To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) +or `--limit-mm-per-prompt` (online inference). For example, to enable passing up to 4 images per text prompt: + +```python +llm = LLM( + model="Qwen/Qwen2-VL-7B-Instruct", + limit_mm_per_prompt={"image": 4}, +) +``` + +```bash +vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 +``` +```` + ```{note} -To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +vLLM currently only supports adding LoRA to the language backbone of multimodal models. ``` ```{note} -The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. -For more details, please see: +To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} -The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). -A corrected version is available at . +The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. +For more details, please see: ``` ### Pooling Models diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 79092ab208784..94703a1c32ade 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -64,7 +64,7 @@ Dynamic quantization is also supported via the `quantization` option -- see [her #### Context length and batch size -You can further reduce memory usage by limit the context length of the model (`max_model_len` option) +You can further reduce memory usage by limiting the context length of the model (`max_model_len` option) and the maximum batch size (`max_num_seqs` option). 
```python diff --git a/requirements-lint.txt b/requirements-lint.txt index f9132bbf96437..711bb50a0e936 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -6,6 +6,7 @@ ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 +sphinx-lint==1.0.0 # type checking mypy==1.11.1 diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 0000000000000..04f8075c5527f --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs From 6f638e9173192895afaca39548a86f78c85fded8 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 13:59:33 -0500 Subject: [PATCH 10/15] Use pymarkdownlnt add to format.sh Signed-off-by: Rafael Vasquez --- .github/workflows/{sphinx-lint.yml => doc-lint.yml} | 9 +-------- format.sh | 6 +++--- requirements-lint.txt | 2 +- tools/doc-lint.sh | 3 +++ tools/sphinx-lint.sh | 3 --- 5 files changed, 8 insertions(+), 15 deletions(-) rename .github/workflows/{sphinx-lint.yml => doc-lint.yml} (74%) create mode 100755 tools/doc-lint.sh delete mode 100755 tools/sphinx-lint.sh diff --git a/.github/workflows/sphinx-lint.yml b/.github/workflows/doc-lint.yml similarity index 74% rename from .github/workflows/sphinx-lint.yml rename to .github/workflows/doc-lint.yml index ca0f9b523c361..c0709978048ec 100644 --- a/.github/workflows/sphinx-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -29,11 +29,4 @@ jobs: python -m pip install --upgrade pip pip install -r requirements-lint.txt - name: Linting docs - run: tools/sphinx-lint.sh - markdown-lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 - - uses: DavidAnson/markdownlint-cli2-action@v19 - with: - globs: "docs/**/*.md" \ No newline at end of file + run: tools/doc-lint.sh diff --git a/format.sh b/format.sh index 0b196de9d0773..522963b7d9386 100755 --- a/format.sh +++ b/format.sh @@ -316,6 +316,6 @@ else echo "✨🎉 Format check passed! Congratulations! 
🎉✨" fi -echo 'vLLM sphinx-lint:' -tools/sphinx-lint.sh -echo 'vLLM sphinx-lint: Done' +echo 'vLLM doc-lint:' +tools/doc-lint.sh +echo 'vLLM doc-lint: Done' diff --git a/requirements-lint.txt b/requirements-lint.txt index 711bb50a0e936..ffc73f90a0d48 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -6,7 +6,7 @@ ruff==0.6.5 codespell==2.3.0 isort==5.13.2 clang-format==18.1.5 -sphinx-lint==1.0.0 +pymarkdownlnt==0.9.26 # type checking mypy==1.11.1 diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh new file mode 100755 index 0000000000000..8926e92c06659 --- /dev/null +++ b/tools/doc-lint.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +pymarkdownlnt -d line-length scan docs/ -r \ No newline at end of file diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh deleted file mode 100755 index 04f8075c5527f..0000000000000 --- a/tools/sphinx-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -sphinx-lint --disable trailing-whitespace,missing-final-newline docs From ce45c0d1e9e39c70ef65057d6fdfe62c7fc806c6 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 14:56:28 -0500 Subject: [PATCH 11/15] Fix docs, update command Signed-off-by: Rafael Vasquez --- docs/README.md | 1 + docs/source/api/multimodal/index.md | 1 - docs/source/api/params.md | 1 - docs/source/community/sponsors.md | 2 + docs/source/contributing/overview.md | 2 - docs/source/deployment/docker.md | 4 +- .../source/deployment/frameworks/cerebrium.md | 10 +- docs/source/deployment/frameworks/dstack.md | 10 +- docs/source/deployment/frameworks/skypilot.md | 2 +- .../deployment/integrations/llamastack.md | 2 +- docs/source/deployment/k8s.md | 9 +- .../source/design/automatic_prefix_caching.md | 11 +- docs/source/features/quantization/auto_awq.md | 4 +- docs/source/features/quantization/bnb.md | 7 +- docs/source/features/quantization/fp8.md | 4 +- .../features/quantization/fp8_e4m3_kvcache.md | 2 +- docs/source/features/quantization/gguf.md | 10 +- docs/source/features/quantization/int8.md | 2 +- docs/source/features/spec_decode.md | 10 +- docs/source/features/tool_calling.md | 44 ++++--- docs/source/getting_started/faq.md | 2 +- .../getting_started/installation/cpu-apple.md | 17 ++- .../getting_started/installation/cpu-x86.md | 44 +++---- .../getting_started/installation/gpu-cuda.md | 96 +++++++-------- .../getting_started/installation/gpu-rocm.md | 116 +++++++++--------- .../getting_started/installation/hpu-gaudi.md | 64 +++++----- .../getting_started/installation/neuron.md | 8 +- .../getting_started/installation/openvino.md | 14 +-- .../getting_started/installation/tpu.md | 6 +- .../getting_started/installation/xpu.md | 24 ++-- docs/source/getting_started/quickstart.md | 44 +++---- .../source/getting_started/troubleshooting.md | 6 +- docs/source/index.md | 4 +- .../models/extensions/runai_model_streamer.md | 12 +- docs/source/models/supported_models.md | 13 +- docs/source/performance/optimization.md | 2 +- docs/source/serving/distributed_serving.md | 40 +++--- docs/source/serving/integrations/langchain.md | 2 +- .../source/serving/integrations/llamaindex.md | 2 +- docs/source/serving/metrics.md | 2 +- docs/source/serving/multimodal_inputs.md | 1 + .../serving/openai_compatible_server.md | 19 ++- tools/doc-lint.sh | 2 +- 43 files changed, 346 insertions(+), 332 deletions(-) diff --git a/docs/README.md b/docs/README.md index 46488c9bb0b92..1a44c1341f4fb 100644 --- a/docs/README.md +++ b/docs/README.md @@ -16,4 +16,5 @@ make html ```bash python -m http.server -d build/html/ ``` + Launch your browser and open 
localhost:8000. diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 0046b73ea825e..fa2eb5793386e 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -13,7 +13,6 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`. Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs). - ## Module Contents ```{eval-rst} diff --git a/docs/source/api/params.md b/docs/source/api/params.md index a3b4d9cbb44ec..56e6fb664798b 100644 --- a/docs/source/api/params.md +++ b/docs/source/api/params.md @@ -19,4 +19,3 @@ Optional parameters for vLLM APIs. .. autoclass:: vllm.PoolingParams :members: ``` - diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 9d2af4c13b088..fb93e65673dff 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -6,6 +6,7 @@ vLLM is a community project. Our compute resources for development and testing a Cash Donations: + - a16z - Dropbox - Sequoia Capital @@ -13,6 +14,7 @@ Cash Donations: - ZhenFund Compute Resources: + - AMD - Anyscale - AWS diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index c960790f47a13..e92104399342d 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -37,8 +37,6 @@ pytest tests/ Currently, the repository is not fully checked by `mypy`. ``` -# Contribution Guidelines - ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 2df1aca27f1e6..c735bfd0e87a7 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -28,8 +28,8 @@ memory to share data between processes under the hood, particularly for tensor p You can build and run vLLM from source via the provided . To build vLLM: ```console -$ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 -$ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai +# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 +DOCKER_BUILDKIT=1 docker build . 
--target vllm-openai --tag vllm/vllm-openai ``` ```{note} diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index be018dfb75d7a..5787c4a407bfb 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -13,14 +13,14 @@ vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebr To install the Cerebrium client, run: ```console -$ pip install cerebrium -$ cerebrium login +pip install cerebrium +cerebrium login ``` Next, create your Cerebrium project, run: ```console -$ cerebrium init vllm-project +cerebrium init vllm-project ``` Next, to install the required packages, add the following to your cerebrium.toml: @@ -58,10 +58,10 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): Then, run the following code to deploy it to the cloud: ```console -$ cerebrium deploy +cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index 4142c1d9f1f60..b42a34125c6d7 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -13,16 +13,16 @@ vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), To install dstack client, run: ```console -$ pip install "dstack[all] -$ dstack server +pip install "dstack[all] +dstack server ``` Next, to configure your dstack project, run: ```console -$ mkdir -p vllm-dstack -$ cd vllm-dstack -$ dstack init +mkdir -p vllm-dstack +cd vllm-dstack +dstack init ``` Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 657e7f2bc72cc..bc2fbb93d5332 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -338,7 +338,7 @@ run: | sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) ``` -2. Then, we can access the GUI at the returned gradio link: +1. 
Then, we can access the GUI at the returned gradio link: ```console | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/source/deployment/integrations/llamastack.md index 474d2bdfa9580..a6c3569637abf 100644 --- a/docs/source/deployment/integrations/llamastack.md +++ b/docs/source/deployment/integrations/llamastack.md @@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta To install Llama Stack, run ```console -$ pip install llama-stack -q +pip install llama-stack -q ``` ## Inference using OpenAI Compatible API diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 760214e112fba..e58916d64e835 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -14,7 +14,7 @@ Before you begin, ensure that you have the following: ## Deployment Steps -1. **Create a PVC , Secret and Deployment for vLLM** +1. Create a PVC, Secret and Deployment for vLLM PVC is used to store the model cache and it is optional, you can use hostPath or other storage options @@ -49,7 +49,7 @@ stringData: Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. -Here are two examples for using NVIDIA GPU and AMD GPU. +Here are two examples for using NVIDIA GPU and AMD GPU. - NVIDIA GPU @@ -194,9 +194,10 @@ spec: - name: shm mountPath: /dev/shm ``` + You can get the full example with steps and sample yaml files from . -2. **Create a Kubernetes Service for vLLM** +1. Create a Kubernetes Service for vLLM Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: @@ -219,7 +220,7 @@ spec: type: ClusterIP ``` -3. **Deploy and Test** +1. Deploy and Test Apply the deployment and service configurations using `kubectl apply -f `: diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/source/design/automatic_prefix_caching.md index 4398536b2b4ad..bbea45eac45bf 100644 --- a/docs/source/design/automatic_prefix_caching.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -6,7 +6,7 @@ The core idea of [PagedAttention](#design-paged-attention) is to partition the K To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. -``` +```text Block 1 Block 2 Block 3 [A gentle breeze stirred] [the leaves as children] [laughed in the distance] Block 1: |<--- block tokens ---->| @@ -14,19 +14,16 @@ Block 2: |<------- prefix ------>| |<--- block tokens --->| Block 3: |<------------------ prefix -------------------->| |<--- block tokens ---->| ``` - In the example above, the KV cache in the first block can be uniquely identified with the tokens “A gentle breeze stirred”. The third block can be uniquely identified with the tokens in the block “laughed in the distance”, along with the prefix tokens “A gentle breeze stirred the leaves as children”. Therefore, we can build the following one-to-one mapping: -``` +```text hash(prefix tokens + block tokens) <--> KV Block ``` With this mapping, we can add another indirection in vLLM’s KV cache management. Previously, each sequence in vLLM maintained a mapping from their logical KV blocks to physical blocks. To achieve automatic caching of KV blocks, we map the logical KV blocks to their hash value and maintain a global hash table of all the physical blocks. 
In this way, all the KV blocks sharing the same hash value (e.g., shared prefix blocks across two requests) can be mapped to the same physical block and share the memory space. - This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. - ## Generalized Caching Policy Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. @@ -41,5 +38,5 @@ Note that this eviction policy effectively implements the exact policy as in [Ra However, the hash-based KV cache management gives us the flexibility to handle more complicated serving scenarios and implement more complicated eviction policies beyond the policy above: -- Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. -- Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. +* Multi-LoRA serving. When serving requests for multiple LoRA adapters, we can simply let the hash of each KV block to also include the LoRA ID the request is querying for to enable caching for all adapters. In this way, we can jointly manage the KV blocks for different adapters, which simplifies the system implementation and improves the global cache hit rate and efficiency. +* Multi-modal models. When the user input includes more than just discrete tokens, we can use different hashing methods to handle the caching of inputs of different modalities. For example, perceptual hashing for images to cache similar input images. diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 3679595e3d4d0..404505eb3890e 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -15,7 +15,7 @@ The main benefits are lower latency and memory usage. You can quantize your own models by installing AutoAWQ or picking one of the [400+ models on Huggingface](https://huggingface.co/models?sort=trending&search=awq). ```console -$ pip install autoawq +pip install autoawq ``` After installing AutoAWQ, you are ready to quantize a model. 
Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`: @@ -47,7 +47,7 @@ print(f'Model is quantized and saved at "{quant_path}"') To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command: ```console -$ python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq +python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq ``` AWQ models are also supported directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/bnb.md b/docs/source/features/quantization/bnb.md index f7f41726f3725..7525e8e7866c3 100644 --- a/docs/source/features/quantization/bnb.md +++ b/docs/source/features/quantization/bnb.md @@ -9,7 +9,7 @@ Compared to other quantization methods, BitsAndBytes eliminates the need for cal Below are the steps to utilize BitsAndBytes with vLLM. ```console -$ pip install bitsandbytes>=0.45.0 +pip install bitsandbytes>=0.45.0 ``` vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint. @@ -17,7 +17,7 @@ vLLM reads the model's config file and supports both in-flight quantization and You can find bitsandbytes quantized models on . And usually, these repositories have a config.json file that includes a quantization_config section. -## Read quantized checkpoint. +## Read quantized checkpoint ```python from vllm import LLM @@ -37,10 +37,11 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` + ## OpenAI Compatible Server Append the following to your 4bit model arguments: -``` +```console --quantization bitsandbytes --load-format bitsandbytes ``` diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index b2eda74fd1e3b..da49cd2747228 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -41,7 +41,7 @@ Currently, we load the model at original precision before quantizing down to 8-b To produce performant FP8 quantized models with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process @@ -98,7 +98,7 @@ tokenizer.save_pretrained(SAVE_DIR) Install `vllm` and `lm-evaluation-harness`: ```console -$ pip install vllm lm-eval==0.4.4 +pip install vllm lm-eval==0.4.4 ``` Load and run the model in `vllm`: diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md index 50edaf81fddd3..1cd67cb8fd336 100644 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ b/docs/source/features/quantization/fp8_e4m3_kvcache.md @@ -17,7 +17,7 @@ unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). To install AMMO (AlgorithMic Model Optimization): ```console -$ pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo +pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo ``` Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. 
The most recent silicon diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index eebf11dfc1b2b..640997cf4bc39 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -13,16 +13,16 @@ Currently, vllm only supports loading single-file GGUF models. If you have a mul To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: ```console -$ wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 +wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 ``` You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs: ```console -$ # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. -$ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 +# We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion. +vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` ```{warning} diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 1ac50ba987dda..82a15d76d352f 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -16,7 +16,7 @@ INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turi To use INT8 quantization with vLLM, you'll need to install the [llm-compressor](https://github.com/vllm-project/llm-compressor/) library: ```console -$ pip install llmcompressor +pip install llmcompressor ``` ## Quantization Process diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index 903acadb71426..ab7b2f302bd13 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -192,11 +192,11 @@ A few important things to consider when using the EAGLE based draft models: 1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) cannot be used directly with vLLM due to differences in the expected layer names and model definition. - To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) + To use these models with vLLM, use the [following script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert them. Note that this script does not modify the model's weights. 
In the above example, use the script to first convert - the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model + the [yuhuili/EAGLE-LLaMA3-Instruct-8B](https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B) model and then use the converted checkpoint as the draft model in vLLM. 2. The EAGLE based draft models need to be run without tensor parallelism @@ -207,7 +207,6 @@ A few important things to consider when using the EAGLE based draft models: reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under investigation and tracked here: [https://github.com/vllm-project/vllm/issues/9565](https://github.com/vllm-project/vllm/issues/9565). - A variety of EAGLE draft models are available on the Hugging Face hub: | Base Model | EAGLE on Hugging Face | # EAGLE Parameters | @@ -224,7 +223,6 @@ A variety of EAGLE draft models are available on the Hugging Face hub: | Qwen2-7B-Instruct | yuhuili/EAGLE-Qwen2-7B-Instruct | 0.26B | | Qwen2-72B-Instruct | yuhuili/EAGLE-Qwen2-72B-Instruct | 1.05B | - ## Lossless guarantees of Speculative Decoding In vLLM, speculative decoding aims to enhance inference efficiency while maintaining accuracy. This section addresses the lossless guarantees of @@ -250,8 +248,6 @@ speculative decoding, breaking down the guarantees into three key areas: same request across runs. For more details, see the FAQ section titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). -**Conclusion** - While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding can occur due to following factors: @@ -259,8 +255,6 @@ can occur due to following factors: - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially due to non-deterministic behavior in batched operations or numerical instability. -**Mitigation Strategies** - For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/features/tool_calling.md b/docs/source/features/tool_calling.md index 062f2021eb62a..027ddb6d5eda3 100644 --- a/docs/source/features/tool_calling.md +++ b/docs/source/features/tool_calling.md @@ -55,21 +55,24 @@ print(f"Result: {get_weather(**json.loads(tool_call.arguments))}") ``` Example output: -``` + +```text Function called: get_weather Arguments: {"location": "San Francisco, CA", "unit": "fahrenheit"} Result: Getting the weather for San Francisco, CA in fahrenheit... ``` This example demonstrates: -- Setting up the server with tool calling enabled -- Defining an actual function to handle tool calls -- Making a request with `tool_choice="auto"` -- Handling the structured response and executing the corresponding function + +* Setting up the server with tool calling enabled +* Defining an actual function to handle tool calls +* Making a request with `tool_choice="auto"` +* Handling the structured response and executing the corresponding function You can also specify a particular function using named function calling by setting `tool_choice={"type": "function", "function": {"name": "get_weather"}}`. Note that this will use the guided decoding backend - so the first time this is used, there will be several seconds of latency (or more) as the FSM is compiled for the first time before it is cached for subsequent requests. 
Remember that it's the callers responsibility to: + 1. Define appropriate tools in the request 2. Include relevant context in the chat messages 3. Handle the tool calls in your application logic @@ -77,20 +80,21 @@ Remember that it's the callers responsibility to: For more advanced usage, including parallel tool calls and different model-specific parsers, see the sections below. ## Named Function Calling + vLLM supports named function calling in the chat completion API by default. It does so using Outlines through guided decoding, so this is enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a high-quality one. -vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. For best results, we recommend ensuring that the expected output format / schema is specified in the prompt to ensure that the model's intended generation is aligned with the schema that it's being forced to generate by the guided decoding backend. To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. - ## Automatic Function Calling To enable this feature, you should set the following flags: + * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. * `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers @@ -104,28 +108,28 @@ from HuggingFace; and you can find an example of this in a `tokenizer_config.jso If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! - ### Hermes Models (`hermes`) All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. + * `NousResearch/Hermes-2-Pro-*` * `NousResearch/Hermes-2-Theta-*` * `NousResearch/Hermes-3-*` - _Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge step in their creation_. Flags: `--tool-call-parser hermes` - ### Mistral Models (`mistral`) Supported models: + * `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) * Additional mistral function-calling models are compatible as well. Known issues: + 1. Mistral 7B struggles to generate parallel tool calls correctly. 2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is much shorter than what vLLM generates. Since an exception is thrown when this condition @@ -136,13 +140,12 @@ it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated * `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt when tools are provided, that results in much better reliability when working with parallel tool calling. 
- Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` - ### Llama Models (`llama3_json`) Supported models: + * `meta-llama/Meta-Llama-3.1-8B-Instruct` * `meta-llama/Meta-Llama-3.1-70B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct` @@ -152,6 +155,7 @@ The tool calling that is supported is the [JSON based tool calling](https://llam Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: + 1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -164,6 +168,7 @@ Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool #### IBM Granite Supported models: + * `ibm-granite/granite-3.0-8b-instruct` Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` @@ -182,42 +187,45 @@ Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/t `examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - ### InternLM Models (`internlm`) Supported models: + * `internlm/internlm2_5-7b-chat` (confirmed) * Additional internlm2.5 function-calling models are compatible as well Known issues: + * Although this implementation also supports InternLM2, the tool call results are not stable when testing with the `internlm/internlm2-chat-7b` model. Recommended flags: `--tool-call-parser internlm --chat-template examples/tool_chat_template_internlm2_tool.jinja` - ### Jamba Models (`jamba`) + AI21's Jamba-1.5 models are supported. + * `ai21labs/AI21-Jamba-1.5-Mini` * `ai21labs/AI21-Jamba-1.5-Large` - Flags: `--tool-call-parser jamba` - ### Models with Pythonic Tool Calls (`pythonic`) A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: + ```python [get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] ``` Limitations: + * The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) * Llama's smaller models struggle to use tools effectively. Example supported models: + * `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) * `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) @@ -231,7 +239,6 @@ Llama's smaller models frequently fail to emit tool calls in the correct format. 
--- - ## How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. @@ -284,7 +291,8 @@ class ExampleToolParser(ToolParser): ``` Then you can use this plugin in the command line like this. -``` + +```console --enable-auto-tool-choice \ --tool-parser-plugin --tool-call-parser example \ diff --git a/docs/source/getting_started/faq.md b/docs/source/getting_started/faq.md index fde2954f10c59..4751b325e6fc4 100644 --- a/docs/source/getting_started/faq.md +++ b/docs/source/getting_started/faq.md @@ -30,7 +30,7 @@ changes in batch size, or batch expansion in speculative decoding. These batchin can lead to slightly different logit/logprob values at each step. Such differences can accumulate, potentially resulting in different tokens being sampled. Once a different token is sampled, further divergence is likely. -**Mitigation Strategies** +## Mitigation Strategies - For improved stability and reduced variance, use `float32`. Note that this will require more memory. - If using `bfloat16`, switching to `float16` can also help. diff --git a/docs/source/getting_started/installation/cpu-apple.md b/docs/source/getting_started/installation/cpu-apple.md index b55e4384d064d..1068893f5bafa 100644 --- a/docs/source/getting_started/installation/cpu-apple.md +++ b/docs/source/getting_started/installation/cpu-apple.md @@ -18,25 +18,23 @@ Currently the CPU implementation for macOS supports FP32 and FP16 datatypes. After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source. -``` -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -r requirements-cpu.txt -$ pip install -e . +```console +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-cpu.txt +pip install -e . ``` ```{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. ``` - - ## Troubleshooting -If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your +If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your [Command Line Tools for Xcode](https://developer.apple.com/download/all/). -``` +```text [...] fatal error: 'map' file not found 1 | #include | ^~~~~ @@ -48,4 +46,3 @@ If the build has error like the following snippet where standard C++ headers can | ^~~~~~~~~ 1 error generated. ``` - diff --git a/docs/source/getting_started/installation/cpu-x86.md b/docs/source/getting_started/installation/cpu-x86.md index bb046dd0fd9dc..301a913e27ba2 100644 --- a/docs/source/getting_started/installation/cpu-x86.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -32,13 +32,13 @@ Table of contents: ## Quick start using Dockerfile ```console -$ docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . -$ docker run -it \ - --rm \ - --network=host \ - --cpuset-cpus= \ - --cpuset-mems= \ - vllm-cpu-env +docker build -f Dockerfile.cpu -t vllm-cpu-env --shm-size=4g . +docker run -it \ + --rm \ + --network=host \ + --cpuset-cpus= \ + --cpuset-mems= \ + vllm-cpu-env ``` (build-cpu-backend-from-source)= @@ -48,23 +48,23 @@ $ docker run -it \ - First, install recommended compiler. 
We recommend to use `gcc/g++ >= 12.3.0` as the default compiler to avoid potential problems. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get update -y -$ sudo apt-get install -y gcc-12 g++-12 libnuma-dev -$ sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +sudo apt-get update -y +sudo apt-get install -y gcc-12 g++-12 libnuma-dev +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 ``` - Second, install Python packages for vLLM CPU backend building: ```console -$ pip install --upgrade pip -$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy -$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +pip install --upgrade pip +pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy +pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, build and install vLLM CPU backend: ```console -$ VLLM_TARGET_DEVICE=cpu python setup.py install +VLLM_TARGET_DEVICE=cpu python setup.py install ``` ```{note} @@ -92,18 +92,18 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install - We highly recommend to use TCMalloc for high performance memory allocation and better cache locality. For example, on Ubuntu 22.4, you can run: ```console -$ sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library -$ find / -name *libtcmalloc* # find the dynamic link library path -$ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD -$ python examples/offline_inference/offline_inference.py # run vLLM +sudo apt-get install libtcmalloc-minimal4 # install TCMalloc library +find / -name *libtcmalloc* # find the dynamic link library path +export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD # prepend the library to LD_PRELOAD +python examples/offline_inference/offline_inference.py # run vLLM ``` - When using the online serving, it is recommended to reserve 1-2 CPU cores for the serving framework to avoid CPU oversubscription. For example, on a platform with 32 physical CPU cores, reserving CPU 30 and 31 for the framework and using CPU 0-29 for OpenMP: ```console -$ export VLLM_CPU_KVCACHE_SPACE=40 -$ export VLLM_CPU_OMP_THREADS_BIND=0-29 -$ vllm serve facebook/opt-125m +export VLLM_CPU_KVCACHE_SPACE=40 +export VLLM_CPU_OMP_THREADS_BIND=0-29 +vllm serve facebook/opt-125m ``` - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND`. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: @@ -148,7 +148,7 @@ $ python examples/offline_inference/offline_inference.py - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. 
Below is the example script to enable Tensor Parallel = 2 for serving: ```console - $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp + VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/installation/gpu-cuda.md b/docs/source/getting_started/installation/gpu-cuda.md index 419b8163fc034..727486abbd10f 100644 --- a/docs/source/getting_started/installation/gpu-cuda.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -17,9 +17,9 @@ vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) bin You can create a new Python environment using `conda`: ```console -$ # (Recommended) Create a new conda environment. -$ conda create -n myenv python=3.12 -y -$ conda activate myenv +# (Recommended) Create a new conda environment. +conda create -n myenv python=3.12 -y +conda activate myenv ``` ```{note} @@ -29,9 +29,9 @@ $ conda activate myenv Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: ```console -$ # (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate +# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment. +uv venv myenv --python 3.12 --seed +source myenv/bin/activate ``` In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -43,18 +43,18 @@ Therefore, it is recommended to install vLLM with a **fresh new** environment. I You can install vLLM using either `pip` or `uv pip`: ```console -$ # Install vLLM with CUDA 12.1. -$ pip install vllm # If you are using pip. -$ uv pip install vllm # If you are using uv. +# Install vLLM with CUDA 12.1. +pip install vllm # If you are using pip. +uv pip install vllm # If you are using uv. ``` As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 11.8 and public PyTorch release versions: ```console -$ # Install vLLM with CUDA 11.8. -$ export VLLM_VERSION=0.6.1.post1 -$ export PYTHON_VERSION=310 -$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 +# Install vLLM with CUDA 11.8. 
+export VLLM_VERSION=0.6.1.post1 +export PYTHON_VERSION=310 +pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 ``` (install-the-latest-code)= @@ -66,7 +66,7 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe ### Install the latest code using `pip` ```console -$ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly +pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly ``` `--pre` is required for `pip` to consider pre-released versions. @@ -74,8 +74,8 @@ $ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl ``` Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. @@ -85,14 +85,14 @@ Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.p Another way to install the latest code is to use `uv`: ```console -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly +uv pip install vllm --extra-index-url https://wheels.vllm.ai/nightly ``` If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL: ```console -$ export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch -$ uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} +export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch +uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT} ``` The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. @@ -102,8 +102,8 @@ The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-rememb Another way to access the latest code is to use the docker images: ```console -$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch -$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} +export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch +docker pull public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:${VLLM_COMMIT} ``` These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days. @@ -121,18 +121,18 @@ The latest code can contain bugs and may not be stable. Please use it with cauti If you only need to change Python code, you can build and install vLLM without compilation. Using `pip`'s [`--editable` flag](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs), changes you make to the code will be reflected when you run vLLM: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_USE_PRECOMPILED=1 pip install --editable . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_USE_PRECOMPILED=1 pip install --editable . ``` -This will download the latest nightly wheel from https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl and use the compiled libraries from there in the installation. +This will download the [latest nightly wheel](https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl) and use the compiled libraries from there in the installation. The `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable can be used instead of `VLLM_USE_PRECOMPILED` to specify a custom path or URL to the wheel file. For example, to use the [0.6.1.post1 PyPi wheel](https://pypi.org/project/vllm/#files): ```console -$ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl -$ pip install --editable . +export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl +pip install --editable . ``` You can find more information about vLLM's wheels [above](#install-the-latest-code). @@ -147,9 +147,9 @@ It is recommended to use the same commit ID for the source code as the vLLM whee If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -e . ``` ```{tip} @@ -172,11 +172,11 @@ There are scenarios where the PyTorch dependency cannot be easily installed via To build vLLM using an existing PyTorch installation: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python use_existing_torch.py -$ pip install -r requirements-build.txt -$ pip install -e . 
--no-build-isolation +git clone https://github.com/vllm-project/vllm.git +cd vllm +python use_existing_torch.py +pip install -r requirements-build.txt +pip install -e . --no-build-isolation ``` #### Use the local cutlass for compilation @@ -185,9 +185,9 @@ Currently, before starting the build process, vLLM fetches cutlass code from Git To achieve this, you can set the environment variable VLLM_CUTLASS_SRC_DIR to point to your local cutlass directory. ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . +git clone https://github.com/vllm-project/vllm.git +cd vllm +VLLM_CUTLASS_SRC_DIR=/path/to/cutlass pip install -e . ``` #### Troubleshooting @@ -196,8 +196,8 @@ To avoid your system being overloaded, you can limit the number of compilation j to be run simultaneously, via the environment variable `MAX_JOBS`. For example: ```console -$ export MAX_JOBS=6 -$ pip install -e . +export MAX_JOBS=6 +pip install -e . ``` This is especially useful when you are building on less powerful machines. For example, when you use WSL it only [assigns 50% of the total memory by default](https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings), so using `export MAX_JOBS=1` can avoid compiling multiple files simultaneously and running out of memory. @@ -206,22 +206,22 @@ A side effect is a much slower build process. Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. ```console -$ # Use `--ipc=host` to make sure the shared memory is large enough. -$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 +# Use `--ipc=host` to make sure the shared memory is large enough. +docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 ``` If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: ```console -$ export CUDA_HOME=/usr/local/cuda -$ export PATH="${CUDA_HOME}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda +export PATH="${CUDA_HOME}/bin:$PATH" ``` Here is a sanity check to verify that the CUDA Toolkit is correctly installed: ```console -$ nvcc --version # verify that nvcc is in your PATH -$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME +nvcc --version # verify that nvcc is in your PATH +${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME ``` ### Unsupported OS build @@ -231,6 +231,6 @@ vLLM can fully run only on Linux but for development purposes, you can still bui Simply disable the `VLLM_TARGET_DEVICE` environment variable before installing: ```console -$ export VLLM_TARGET_DEVICE=empty -$ pip install -e . +export VLLM_TARGET_DEVICE=empty +pip install -e . 
``` diff --git a/docs/source/getting_started/installation/gpu-rocm.md b/docs/source/getting_started/installation/gpu-rocm.md index e36b92513e31d..a8971bb96248c 100644 --- a/docs/source/getting_started/installation/gpu-rocm.md +++ b/docs/source/getting_started/installation/gpu-rocm.md @@ -47,13 +47,13 @@ Their values can be passed in when running `docker build` with `--build-arg` opt To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: ```console -$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . ``` To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: ```console -$ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: @@ -83,81 +83,81 @@ Where the `` is the location where the model is stored, for examp - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) - [PyTorch](https://pytorch.org/) -For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. + For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`. -Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) + Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch [Getting Started](https://pytorch.org/get-started/locally/) 1. Install [Triton flash attention for ROCm](https://github.com/ROCm/triton) -Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) + Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from [ROCm/triton](https://github.com/ROCm/triton/blob/triton-mlir/README.md) -```console -$ python3 -m pip install ninja cmake wheel pybind11 -$ pip uninstall -y triton -$ git clone https://github.com/OpenAI/triton.git -$ cd triton -$ git checkout e192dba -$ cd python -$ pip3 install . -$ cd ../.. -``` + ```console + python3 -m pip install ninja cmake wheel pybind11 + pip uninstall -y triton + git clone https://github.com/OpenAI/triton.git + cd triton + git checkout e192dba + cd python + pip3 install . + cd ../.. + ``` -```{note} -- If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. -``` + ```{note} + - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ``` 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) -Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) -Alternatively, wheels intended for vLLM use can be accessed under the releases. 
+ Install ROCm's flash attention (v2.5.9.post1) following the instructions from [ROCm/flash-attention](https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support) + Alternatively, wheels intended for vLLM use can be accessed under the releases. -For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. + For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`. To get your gfx architecture, run `rocminfo |grep gfx`. -```console -$ git clone https://github.com/ROCm/flash-attention.git -$ cd flash-attention -$ git checkout 3cea2fb -$ git submodule update --init -$ GPU_ARCHS="gfx90a" python3 setup.py install -$ cd .. -``` + ```console + git clone https://github.com/ROCm/flash-attention.git + cd flash-attention + git checkout 3cea2fb + git submodule update --init + GPU_ARCHS="gfx90a" python3 setup.py install + cd .. + ``` -```{note} -- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) -``` + ```{note} + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ``` 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: -```bash -$ pip install --upgrade pip + ```bash + $ pip install --upgrade pip -# Install PyTorch -$ pip uninstall torch -y -$ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + # Install PyTorch + $ pip uninstall torch -y + $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 -# Build & install AMD SMI -$ pip install /opt/rocm/share/amd_smi + # Build & install AMD SMI + $ pip install /opt/rocm/share/amd_smi -# Install dependencies -$ pip install --upgrade numba scipy huggingface-hub[cli] -$ pip install "numpy<2" -$ pip install -r requirements-rocm.txt + # Install dependencies + $ pip install --upgrade numba scipy huggingface-hub[cli] + $ pip install "numpy<2" + $ pip install -r requirements-rocm.txt -# Build vLLM for MI210/MI250/MI300. -$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" -$ python3 setup.py develop -``` + # Build vLLM for MI210/MI250/MI300. + $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" + $ python3 setup.py develop + ``` -This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. + This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. -```{tip} -- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. -- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. -- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. -- The ROCm version of PyTorch, ideally, should match the ROCm driver version. -``` + ```{tip} + - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. + - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. 
+ - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. + - The ROCm version of PyTorch, ideally, should match the ROCm driver version. + ``` -```{tip} -- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. - For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` + ```{tip} + - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. + For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). + ``` diff --git a/docs/source/getting_started/installation/hpu-gaudi.md b/docs/source/getting_started/installation/hpu-gaudi.md index 1d50cef3bdc83..043781e332e6a 100644 --- a/docs/source/getting_started/installation/hpu-gaudi.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -22,8 +22,8 @@ Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optim ### Quick start using Dockerfile ```console -$ docker build -f Dockerfile.hpu -t vllm-hpu-env . -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env +docker build -f Dockerfile.hpu -t vllm-hpu-env . +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` ```{tip} @@ -37,10 +37,10 @@ If you're observing the following error: `docker: Error response from daemon: Un To verify that the Intel Gaudi software was correctly installed, run: ```console -$ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible -$ apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed -$ pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed -$ pip list | grep neural # verify that neural_compressor is installed +hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible +apt list --installed | grep habana # verify that habanalabs-firmware-tools, habanalabs-graph, habanalabs-rdma-core, habanalabs-thunk and habanalabs-container-runtime are installed +pip list | grep habana # verify that habana-torch-plugin, habana-torch-dataloader, habana-pyhlml and habana-media-loader are installed +pip list | grep neural # verify that neural_compressor is installed ``` Refer to [Intel Gaudi Software Stack @@ -57,8 +57,8 @@ for more details. 
Use the following commands to run a Docker image: ```console -$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest -$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest +docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest ``` #### Build and Install vLLM @@ -66,18 +66,18 @@ $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_ To build and install vLLM from source, run: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ python setup.py develop +git clone https://github.com/vllm-project/vllm.git +cd vllm +python setup.py develop ``` Currently, the latest features and performance optimizations are developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and we periodically upstream them to vLLM main repo. To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: ```console -$ git clone https://github.com/HabanaAI/vllm-fork.git -$ cd vllm-fork -$ git checkout habana_main -$ python setup.py develop +git clone https://github.com/HabanaAI/vllm-fork.git +cd vllm-fork +git checkout habana_main +python setup.py develop ``` ## Supported Features @@ -181,7 +181,7 @@ Bucketing allows us to reduce the number of required graphs significantly, but i Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: -``` +```text INFO 08-01 21:37:59 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-01 21:37:59 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -192,7 +192,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 1 Example (with ramp-up) -``` +```text min = 2, step = 32, max = 64 => ramp_up = (2, 4, 8, 16) => stable = (32, 64) @@ -201,7 +201,7 @@ min = 2, step = 32, max = 64 Example (without ramp-up) -``` +```text min = 128, step = 128, max = 512 => ramp_up = () => stable = (128, 256, 384, 512) @@ -224,7 +224,7 @@ Bucketing is transparent to a client -- padding in sequence length dimension is Warmup is an optional, but highly recommended step occurring before vLLM server starts listening. It executes a forward pass for each bucket with dummy data. The goal is to pre-compile all graphs and not incur any graph compilation overheads within bucket boundaries during server runtime. 
Each warmup step is logged during vLLM startup: -``` +```text INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][1/24] batch_size:4 seq_len:1024 free_mem:79.16 GiB INFO 08-01 22:26:47 hpu_model_runner.py:1066] [Warmup][Prompt][2/24] batch_size:4 seq_len:896 free_mem:55.43 GiB INFO 08-01 22:26:48 hpu_model_runner.py:1066] [Warmup][Prompt][3/24] batch_size:4 seq_len:768 free_mem:55.43 GiB @@ -273,7 +273,7 @@ When there's large amount of requests pending, vLLM scheduler will attempt to fi Each described step is logged by vLLM server, as follows (negative values correspond to memory being released): -``` +```text INFO 08-02 17:37:44 hpu_model_runner.py:493] Prompt bucket config (min, step, max_warmup) bs:[1, 32, 4], seq:[128, 128, 1024] INFO 08-02 17:37:44 hpu_model_runner.py:499] Generated 24 prompt buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024)] INFO 08-02 17:37:44 hpu_model_runner.py:504] Decode bucket config (min, step, max_warmup) bs:[1, 128, 4], seq:[128, 128, 2048] @@ -349,19 +349,19 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - Default values: - Prompt: - : - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `min(max_num_seqs, 64)` + - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` - Decode: - : - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` - - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` + - batch size min (`VLLM_DECODE_BS_BUCKET_MIN`): `1` + - batch size step (`VLLM_DECODE_BS_BUCKET_STEP`): `min(max_num_seqs, 32)` + - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` + - sequence length min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` + - sequence length step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` + - sequence length max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max(128, (max_num_seqs*max_model_len)/block_size)` Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution: diff --git a/docs/source/getting_started/installation/neuron.md b/docs/source/getting_started/installation/neuron.md index 431f90537f543..5581b1940ca46 100644 --- a/docs/source/getting_started/installation/neuron.md +++ b/docs/source/getting_started/installation/neuron.md @@ -123,10 +123,10 @@ python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torch Once 
neuronx-cc and transformers-neuronx packages are installed, we will be able to install vllm as follows: ```console -$ git clone https://github.com/vllm-project/vllm.git -$ cd vllm -$ pip install -U -r requirements-neuron.txt -$ VLLM_TARGET_DEVICE="neuron" pip install . +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -U -r requirements-neuron.txt +VLLM_TARGET_DEVICE="neuron" pip install . ``` If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed. diff --git a/docs/source/getting_started/installation/openvino.md b/docs/source/getting_started/installation/openvino.md index 60f95fd1c4250..d97d4173bf36b 100644 --- a/docs/source/getting_started/installation/openvino.md +++ b/docs/source/getting_started/installation/openvino.md @@ -27,8 +27,8 @@ vLLM powered by OpenVINO supports all LLM models from [vLLM supported models lis ## Quick start using Dockerfile ```console -$ docker build -f Dockerfile.openvino -t vllm-openvino-env . -$ docker run -it --rm vllm-openvino-env +docker build -f Dockerfile.openvino -t vllm-openvino-env . +docker run -it --rm vllm-openvino-env ``` (install-openvino-backend-from-source)= @@ -38,21 +38,21 @@ $ docker run -it --rm vllm-openvino-env - First, install Python. For example, on Ubuntu 22.04, you can run: ```console - $ sudo apt-get update -y - $ sudo apt-get install python3 + sudo apt-get update -y + sudo apt-get install python3 ``` - Second, install prerequisites vLLM OpenVINO backend installation: ```console - $ pip install --upgrade pip - $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + pip install --upgrade pip + pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu ``` - Finally, install vLLM with OpenVINO backend: ```console - $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . + PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v . ``` - [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html). diff --git a/docs/source/getting_started/installation/tpu.md b/docs/source/getting_started/installation/tpu.md index bc93c44fead30..1938785ade46a 100644 --- a/docs/source/getting_started/installation/tpu.md +++ b/docs/source/getting_started/installation/tpu.md @@ -156,14 +156,14 @@ For more information about using TPUs with GKE, see You can use to build a Docker image with TPU support. ```console -$ docker build -f Dockerfile.tpu -t vllm-tpu . +docker build -f Dockerfile.tpu -t vllm-tpu . ``` Run the Docker image with the following command: ```console -$ # Make sure to add `--privileged --net host --shm-size=16G`. -$ docker run --privileged --net host --shm-size=16G -it vllm-tpu +# Make sure to add `--privileged --net host --shm-size=16G`. 
+docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` ```{note} diff --git a/docs/source/getting_started/installation/xpu.md b/docs/source/getting_started/installation/xpu.md index c1ab5478eb652..73758f37cf0f6 100644 --- a/docs/source/getting_started/installation/xpu.md +++ b/docs/source/getting_started/installation/xpu.md @@ -40,15 +40,15 @@ $ docker run -it \ - Second, install Python packages for vLLM XPU backend building: ```console -$ source /opt/intel/oneapi/setvars.sh -$ pip install --upgrade pip -$ pip install -v -r requirements-xpu.txt +source /opt/intel/oneapi/setvars.sh +pip install --upgrade pip +pip install -v -r requirements-xpu.txt ``` - Finally, build and install vLLM XPU backend: ```console -$ VLLM_TARGET_DEVICE=xpu python setup.py install +VLLM_TARGET_DEVICE=xpu python setup.py install ``` ```{note} @@ -61,14 +61,14 @@ $ VLLM_TARGET_DEVICE=xpu python setup.py install XPU platform supports tensor-parallel inference/serving and also supports pipeline parallel as a beta feature for online serving. We requires Ray as the distributed runtime backend. For example, a reference execution likes following: ```console -$ python -m vllm.entrypoints.openai.api_server \ -$ --model=facebook/opt-13b \ -$ --dtype=bfloat16 \ -$ --device=xpu \ -$ --max_model_len=1024 \ -$ --distributed-executor-backend=ray \ -$ --pipeline-parallel-size=2 \ -$ -tp=8 +python -m vllm.entrypoints.openai.api_server \ + --model=facebook/opt-13b \ + --dtype=bfloat16 \ + --device=xpu \ + --max_model_len=1024 \ + --distributed-executor-backend=ray \ + --pipeline-parallel-size=2 \ + -tp=8 ``` By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 2808e1b386801..435e760cc13ab 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -18,9 +18,9 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. ```console -$ conda create -n myenv python=3.10 -y -$ conda activate myenv -$ pip install vllm +conda create -n myenv python=3.10 -y +conda activate myenv +pip install vllm ``` ```{note} @@ -85,7 +85,7 @@ By default, it starts the server at `http://localhost:8000`. You can specify the Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) model: ```console -$ vllm serve Qwen/Qwen2.5-1.5B-Instruct +vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} @@ -96,7 +96,7 @@ You can learn about overriding it [here](#chat-template). This server can be queried in the same format as OpenAI API. For example, to list the models: ```console -$ curl http://localhost:8000/v1/models +curl http://localhost:8000/v1/models ``` You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` to enable the server to check for API key in the header. 
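For illustration, here is a minimal sketch of how a client could pass that key, assuming the usual OpenAI-style `Authorization: Bearer` header; the token value is a placeholder standing in for whatever you passed via `--api-key` or `VLLM_API_KEY`:

```console
# Query a server that was started with `--api-key token-abc123` (placeholder value).
curl http://localhost:8000/v1/models \
    -H "Authorization: Bearer token-abc123"
```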
@@ -106,14 +106,14 @@ You can pass in the argument `--api-key` or environment variable `VLLM_API_KEY` Once your server is started, you can query the model with input prompts: ```console -$ curl http://localhost:8000/v1/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "prompt": "San Francisco is a", -$ "max_tokens": 7, -$ "temperature": 0 -$ }' +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' ``` Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: @@ -142,15 +142,15 @@ vLLM is designed to also support the OpenAI Chat Completions API. The chat inter You can use the [create chat completion](https://platform.openai.com/docs/api-reference/chat/completions/create) endpoint to interact with the model: ```console -$ curl http://localhost:8000/v1/chat/completions \ -$ -H "Content-Type: application/json" \ -$ -d '{ -$ "model": "Qwen/Qwen2.5-1.5B-Instruct", -$ "messages": [ -$ {"role": "system", "content": "You are a helpful assistant."}, -$ {"role": "user", "content": "Who won the world series in 2020?"} -$ ] -$ }' +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-1.5B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Who won the world series in 2020?"} + ] + }' ``` Alternatively, you can use the `openai` Python package: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index f5efe0bef7506..1e290d2b4c0bd 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -48,6 +48,7 @@ If vLLM crashes and the error trace captures it somewhere around `self.graph.rep To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. (troubleshooting-incorrect-hardware-driver)= + ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -118,13 +119,13 @@ dist.destroy_process_group() If you are testing with a single node, adjust `--nproc-per-node` to the number of GPUs you want to use: ```console -$ NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py +NCCL_DEBUG=TRACE torchrun --nproc-per-node= test.py ``` If you are testing with multi-nodes, adjust `--nproc-per-node` and `--nnodes` according to your setup and set `MASTER_ADDR` to the correct IP address of the master node, reachable from all nodes. Then, run: ```console -$ NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py +NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py ``` If the script runs successfully, you should see the message `sanity check is successful!`. 
@@ -141,6 +142,7 @@ Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup ``` (troubleshooting-python-multiprocessing)= + ## Python multiprocessing ### `RuntimeError` Exception diff --git a/docs/source/index.md b/docs/source/index.md index 6747a7fcce4fe..8a32e782eda75 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,4 +1,4 @@ -# Welcome to vLLM! +# Welcome to vLLM ```{figure} ./assets/logos/vllm-logo-text-light.png :align: center @@ -171,7 +171,7 @@ contributing/model/index contributing/vulnerability_management ``` -# Indices and tables +## Indices and tables - {ref}`genindex` - {ref}`modindex` diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index fe2701194a604..75f7a9fcad416 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -9,25 +9,25 @@ vLLM supports loading weights in Safetensors format using the Run:ai Model Strea You first need to install vLLM RunAI optional dependency: ```console -$ pip3 install vllm[runai] +pip3 install vllm[runai] ``` To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag: ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer ``` To run model from AWS S3 object store run: ```console -$ vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` To run model from a S3 compatible object store run: ```console -$ RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer +RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer ``` ## Tunable parameters @@ -38,14 +38,14 @@ You can tune `concurrency` that controls the level of concurrency and number of For reading from S3, it will be the number of client instances the host is opening to the S3 server. ```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). 
```console -$ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' +vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` ```{note} diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 3ba34c77205e5..3529d30964c06 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -45,7 +45,7 @@ Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project To use models from [ModelScope](https://www.modelscope.cn) instead of HuggingFace Hub, set an environment variable: ```shell -$ export VLLM_USE_MODELSCOPE=True +export VLLM_USE_MODELSCOPE=True ``` And use with `trust_remote_code=True`. @@ -813,19 +813,22 @@ The following table lists those that are tested in vLLM. _________________ -# Model Support Policy +## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: 1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated! + 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. -```{tip} -When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. -``` + ```{tip} + When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. + ``` 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. 
If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. + 4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use. + 5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement. Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem. diff --git a/docs/source/performance/optimization.md b/docs/source/performance/optimization.md index 4fcde9b03b887..4fbc376e1aa39 100644 --- a/docs/source/performance/optimization.md +++ b/docs/source/performance/optimization.md @@ -8,7 +8,7 @@ Due to the auto-regressive nature of transformer architecture, there are times w The vLLM can preempt requests to free up KV cache space for other requests. Preempted requests are recomputed when sufficient KV cache space becomes available again. When this occurs, the following warning is printed: -``` +```text WARNING 05-09 00:49:33 scheduler.py:1057 Sequence group 0 is preempted by PreemptionMode.SWAP mode because there is not enough KV cache space. This can affect the end-to-end performance. Increase gpu_memory_utilization or tensor_parallel_size to provide more KV cache memory. total_cumulative_preemption_cnt=1 ``` diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 4e0a9ef6ecf7d..daf6e2f250416 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -35,16 +35,16 @@ output = llm.generate("San Franciso is a") To run multi-GPU serving, pass in the `--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: ```console -$ vllm serve facebook/opt-13b \ -$ --tensor-parallel-size 4 + vllm serve facebook/opt-13b \ + --tensor-parallel-size 4 ``` You can also additionally specify `--pipeline-parallel-size` to enable pipeline parallelism. For example, to run API server on 8 GPUs with pipeline parallelism and tensor parallelism: ```console -$ vllm serve gpt2 \ -$ --tensor-parallel-size 4 \ -$ --pipeline-parallel-size 2 + vllm serve gpt2 \ + --tensor-parallel-size 4 \ + --pipeline-parallel-size 2 ``` ## Running vLLM on multiple nodes @@ -56,21 +56,21 @@ The first step, is to start containers and organize them into a cluster. 
We have Pick a node as the head node, and run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --head \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --head \ + /path/to/the/huggingface/home/in/this/node ``` On the rest of the worker nodes, run the following command: ```console -$ bash run_cluster.sh \ -$ vllm/vllm-openai \ -$ ip_of_head_node \ -$ --worker \ -$ /path/to/the/huggingface/home/in/this/node +bash run_cluster.sh \ + vllm/vllm-openai \ + ip_of_head_node \ + --worker \ + /path/to/the/huggingface/home/in/this/node ``` Then you get a ray cluster of containers. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. A common misunderstanding is to use the IP address of the worker node, which is not correct. @@ -80,16 +80,16 @@ Then, on any node, use `docker exec -it node /bin/bash` to enter the container, After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 8 \ -$ --pipeline-parallel-size 2 + vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 8 \ + --pipeline-parallel-size 2 ``` You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console -$ vllm serve /path/to/the/model/in/the/container \ -$ --tensor-parallel-size 16 +vllm serve /path/to/the/model/in/the/container \ + --tensor-parallel-size 16 ``` To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. 
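As a rough sketch of the check described above, reusing the placeholder model path and parallel sizes from the earlier examples, the transport line NCCL reports can be surfaced by filtering the server logs:

```console
# Launch with NCCL tracing enabled and grep for the transport NCCL selected.
NCCL_DEBUG=TRACE vllm serve /path/to/the/model/in/the/container \
    --tensor-parallel-size 8 \
    --pipeline-parallel-size 2 2>&1 | grep "via NET"
```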
diff --git a/docs/source/serving/integrations/langchain.md b/docs/source/serving/integrations/langchain.md index 49ff6e0c32a72..03142d23b145a 100644 --- a/docs/source/serving/integrations/langchain.md +++ b/docs/source/serving/integrations/langchain.md @@ -7,7 +7,7 @@ vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain To install LangChain, run ```console -$ pip install langchain langchain_community -q +pip install langchain langchain_community -q ``` To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`. diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/source/serving/integrations/llamaindex.md index 9961c181d7e1c..8c72605202cf5 100644 --- a/docs/source/serving/integrations/llamaindex.md +++ b/docs/source/serving/integrations/llamaindex.md @@ -7,7 +7,7 @@ vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index To install LlamaIndex, run ```console -$ pip install llama-index-llms-vllm -q +pip install llama-index-llms-vllm -q ``` To run inference on a single or multiple GPUs, use `Vllm` class from `llamaindex`. diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index e6ded2e6dd465..6c84f6d1350a6 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -7,7 +7,7 @@ OpenAI compatible API server. You can start the server using Python, or using [Docker](#deployment-docker): ```console -$ vllm serve unsloth/Llama-3.2-1B-Instruct +vllm serve unsloth/Llama-3.2-1B-Instruct ``` Then query the endpoint to get the latest metrics from the server: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 9f5e1b908d786..bc475826bbfde 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -303,6 +303,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ``` Then, you can use the OpenAI client as follows: + ```python from openai import OpenAI diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index ec5a367594743..e49bbb06695f8 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -5,11 +5,13 @@ vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! You can start the server via the [`vllm serve`](#vllm-serve) command, or through [Docker](#deployment-docker): + ```bash vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123 ``` To call the server, you can use the [official OpenAI Python client](https://github.com/openai/openai-python), or any other HTTP client. + ```python from openai import OpenAI client = OpenAI( @@ -50,6 +52,7 @@ In addition, we have the following custom APIs: - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= + ## Chat Template In order for the language model to support chat protocol, vLLM requires the model to include @@ -71,6 +74,7 @@ vLLM community provides a set of chat templates for popular models. You can find With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. 
An example is provided below: + ```python completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", @@ -80,7 +84,7 @@ completion = client.chat.completions.create( ) ``` -Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the request. vLLM provides best-effort support to detect this automatically, which is logged as a string like *"Detected the chat template content format to be..."*, and internally converts incoming requests to match @@ -115,12 +119,12 @@ completion = client.chat.completions.create( ## Extra HTTP Headers Only `X-Request-Id` HTTP request header is supported for now. It can be enabled -with `--enable-request-id-headers`. +with `--enable-request-id-headers`. > Note that enablement of the headers can impact performance significantly at high QPS > rates. We recommend implementing HTTP headers at the router level (e.g. via Istio), > rather than within the vLLM layer for this reason. -> See https://github.com/vllm-project/vllm/pull/11529 for more details. +> See [this PR](https://github.com/vllm-project/vllm/pull/11529) for more details. ```python completion = client.chat.completions.create( @@ -147,6 +151,7 @@ print(completion._request_id) ## CLI Reference (vllm-serve)= + ### `vllm serve` The `vllm serve` command is used to launch the OpenAI-compatible server. @@ -175,7 +180,7 @@ uvicorn-log-level: "info" To use the above config file: ```bash -$ vllm serve SOME_MODEL --config config.yaml +vllm serve SOME_MODEL --config config.yaml ``` ```{note} @@ -186,6 +191,7 @@ The order of priorities is `command line > config file values > defaults`. ## API Reference (completions-api)= + ### Completions API Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); @@ -212,6 +218,7 @@ The following extra parameters are supported: ``` (chat-api)= + ### Chat API Our Chat API is compatible with [OpenAI's Chat Completions API](https://platform.openai.com/docs/api-reference/chat); @@ -243,6 +250,7 @@ The following extra parameters are supported: ``` (embeddings-api)= + ### Embeddings API Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings); @@ -284,6 +292,7 @@ For chat-like input (i.e. if `messages` is passed), these extra parameters are s ``` (tokenizer-api)= + ### Tokenizer API Our Tokenizer API is a simple wrapper over [HuggingFace-style tokenizers](https://huggingface.co/docs/transformers/en/main_classes/tokenizer). @@ -293,6 +302,7 @@ It consists of two endpoints: - `/detokenize` corresponds to calling `tokenizer.decode()`. (pooling-api)= + ### Pooling API Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states. @@ -302,6 +312,7 @@ The input format is the same as [Embeddings API](#embeddings-api), but the outpu Code example: (score-api)= + ### Score API Our Score API applies a cross-encoder model to predict scores for sentence pairs. 
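A hypothetical request sketch for the Score API follows; the endpoint path, the model name, and the field names (`text_1`, `text_2`) are assumptions for illustration only and should be checked against the API reference:

```console
# Hypothetical Score API request; model and field names are assumptions.
curl http://localhost:8000/score \
    -H "Content-Type: application/json" \
    -d '{
        "model": "BAAI/bge-reranker-v2-m3",
        "text_1": "What is the capital of France?",
        "text_2": "The capital of France is Paris."
    }'
```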
diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh index 8926e92c06659..c4bdb888651d4 100755 --- a/tools/doc-lint.sh +++ b/tools/doc-lint.sh @@ -1,3 +1,3 @@ #!/bin/bash -pymarkdownlnt -d line-length scan docs/ -r \ No newline at end of file +pymarkdownlnt -d line-length,first-line-heading,no-inline-html,no-duplicate-header scan docs -r \ No newline at end of file From 005034f507cdf0246817e6d879a1e2dbaf83a931 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 15:07:00 -0500 Subject: [PATCH 12/15] Update job name, tool version check, fix docs Signed-off-by: Rafael Vasquez --- .github/workflows/doc-lint.yml | 2 +- docs/source/api/model/index.md | 1 - docs/source/getting_started/quickstart.md | 12 ++++++------ format.sh | 4 ++-- 4 files changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml index c0709978048ec..2f5ee8bbfd8c5 100644 --- a/.github/workflows/doc-lint.yml +++ b/.github/workflows/doc-lint.yml @@ -13,7 +13,7 @@ on: - "docs/**" jobs: - sphinx-lint: + doc-lint: runs-on: ubuntu-latest strategy: matrix: diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md index b8437e3c3517a..113792147be7c 100644 --- a/docs/source/api/model/index.md +++ b/docs/source/api/model/index.md @@ -9,4 +9,3 @@ interfaces_base interfaces adapters ``` - diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index d64d1bb9d20ef..5190e1e39bc6a 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -19,17 +19,17 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands: ```console -$ uv venv myenv --python 3.12 --seed -$ source myenv/bin/activate -$ uv pip install vllm +uv venv myenv --python 3.12 --seed +source myenv/bin/activate +uv pip install vllm ``` You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. 
```console -$ conda create -n myenv python=3.12 -y -$ conda activate myenv -$ pip install vllm +conda create -n myenv python=3.12 -y +conda activate myenv +pip install vllm ``` ```{note} diff --git a/format.sh b/format.sh index 522963b7d9386..2277eef93c745 100755 --- a/format.sh +++ b/format.sh @@ -41,7 +41,7 @@ MYPY_VERSION=$(mypy --version | awk '{print $2}') CODESPELL_VERSION=$(codespell --version) ISORT_VERSION=$(isort --vn) CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') +PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') # # params: tool name, tool version, required version tool_version_check() { @@ -58,7 +58,7 @@ tool_version_check "mypy" "$MYPY_VERSION" tool_version_check "isort" "$ISORT_VERSION" tool_version_check "codespell" "$CODESPELL_VERSION" tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" +tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" YAPF_FLAGS=( '--recursive' From 7e75436302ac086dfcb8ce54e2da624597ed46c9 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 16:23:39 -0500 Subject: [PATCH 13/15] Fix numbering, add newline Signed-off-by: Rafael Vasquez --- docs/source/deployment/frameworks/skypilot.md | 14 +- docs/source/deployment/k8s.md | 448 +++++++++--------- tools/doc-lint.sh | 2 +- 3 files changed, 232 insertions(+), 232 deletions(-) diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index bc2fbb93d5332..051fc2f2a8d4e 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -334,12 +334,12 @@ run: | 1. Start the chat web UI: -```console -sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) -``` + ```console + sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm) + ``` -1. Then, we can access the GUI at the returned gradio link: +2. Then, we can access the GUI at the returned gradio link: -```console -| INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live -``` + ```console + | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live + ``` diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index e58916d64e835..2bf7032ccc480 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -16,233 +16,233 @@ Before you begin, ensure that you have the following: 1. Create a PVC, Secret and Deployment for vLLM -PVC is used to store the model cache and it is optional, you can use hostPath or other storage options - -```yaml -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: mistral-7b - namespace: default -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 50Gi - storageClassName: default - volumeMode: Filesystem -``` - -Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models - -```yaml -apiVersion: v1 -kind: Secret -metadata: - name: hf-token-secret - namespace: default -type: Opaque -stringData: - token: "REPLACE_WITH_TOKEN" -``` - -Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. - -Here are two examples for using NVIDIA GPU and AMD GPU. 
- -- NVIDIA GPU - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "2Gi" - containers: - - name: mistral-7b - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8000 + PVC is used to store the model cache and it is optional, you can use hostPath or other storage options + + ```yaml + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: mistral-7b + namespace: default + spec: + accessModes: + - ReadWriteOnce resources: - limits: - cpu: "10" - memory: 20G - nvidia.com/gpu: "1" requests: - cpu: "2" - memory: 6G - nvidia.com/gpu: "1" - volumeMounts: - - mountPath: /root/.cache/huggingface - name: cache-volume - - name: shm - mountPath: /dev/shm - livenessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 10 - readinessProbe: - httpGet: - path: /health - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 5 -``` - -- AMD GPU - -You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mistral-7b - namespace: default - labels: - app: mistral-7b -spec: - replicas: 1 - selector: - matchLabels: - app: mistral-7b - template: - metadata: - labels: - app: mistral-7b - spec: - volumes: - # PVC - - name: cache-volume - persistentVolumeClaim: - claimName: mistral-7b - # vLLM needs to access the host's shared memory for tensor parallel inference. - - name: shm - emptyDir: - medium: Memory - sizeLimit: "8Gi" - hostNetwork: true - hostIPC: true - containers: - - name: mistral-7b - image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 - securityContext: - seccompProfile: - type: Unconfined - runAsGroup: 44 - capabilities: - add: - - SYS_PTRACE - command: ["/bin/sh", "-c"] - args: [ - "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token + storage: 50Gi + storageClassName: default + volumeMode: Filesystem + ``` + + Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models + + ```yaml + apiVersion: v1 + kind: Secret + metadata: + name: hf-token-secret + namespace: default + type: Opaque + stringData: + token: "REPLACE_WITH_TOKEN" + ``` + + Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + + Here are two examples for using NVIDIA GPU and AMD GPU. 
+ + - NVIDIA GPU + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "2Gi" + containers: + - name: mistral-7b + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-Instruct-v0.3 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 6G + nvidia.com/gpu: "1" + volumeMounts: + - mountPath: /root/.cache/huggingface + name: cache-volume + - name: shm + mountPath: /dev/shm + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 5 + ``` + + - AMD GPU + + You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + + ```yaml + apiVersion: apps/v1 + kind: Deployment + metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b + spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + ``` + + You can get the full example with steps and sample yaml files from . + +2. Create a Kubernetes Service for vLLM + + Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: + + ```yaml + apiVersion: v1 + kind: Service + metadata: + name: mistral-7b + namespace: default + spec: ports: - - containerPort: 8000 - resources: - limits: - cpu: "10" - memory: 20G - amd.com/gpu: "1" - requests: - cpu: "6" - memory: 6G - amd.com/gpu: "1" - volumeMounts: - - name: cache-volume - mountPath: /root/.cache/huggingface - - name: shm - mountPath: /dev/shm -``` - -You can get the full example with steps and sample yaml files from . - -1. 
Create a Kubernetes Service for vLLM - -Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: - -```yaml -apiVersion: v1 -kind: Service -metadata: - name: mistral-7b - namespace: default -spec: - ports: - - name: http-mistral-7b - port: 80 - protocol: TCP - targetPort: 8000 - # The label selector should match the deployment labels & it is useful for prefix caching feature - selector: - app: mistral-7b - sessionAffinity: None - type: ClusterIP -``` - -1. Deploy and Test - -Apply the deployment and service configurations using `kubectl apply -f `: - -```console -kubectl apply -f deployment.yaml -kubectl apply -f service.yaml -``` - -To test the deployment, run the following `curl` command: - -```console -curl http://mistral-7b.default.svc.cluster.local/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "mistralai/Mistral-7B-Instruct-v0.3", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }' -``` - -If the service is correctly deployed, you should receive a response from the vLLM model. + - name: http-mistral-7b + port: 80 + protocol: TCP + targetPort: 8000 + # The label selector should match the deployment labels & it is useful for prefix caching feature + selector: + app: mistral-7b + sessionAffinity: None + type: ClusterIP + ``` + +3. Deploy and Test + + Apply the deployment and service configurations using `kubectl apply -f `: + + ```console + kubectl apply -f deployment.yaml + kubectl apply -f service.yaml + ``` + + To test the deployment, run the following `curl` command: + + ```console + curl http://mistral-7b.default.svc.cluster.local/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "mistralai/Mistral-7B-Instruct-v0.3", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + ``` + + If the service is correctly deployed, you should receive a response from the vLLM model. ## Conclusion diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh index c4bdb888651d4..c34e401b8e715 100755 --- a/tools/doc-lint.sh +++ b/tools/doc-lint.sh @@ -1,3 +1,3 @@ #!/bin/bash -pymarkdownlnt -d line-length,first-line-heading,no-inline-html,no-duplicate-header scan docs -r \ No newline at end of file +pymarkdownlnt -d line-length,first-line-heading,no-inline-html,no-duplicate-header scan docs -r From 4d7ecacb809328c99e803ee6c35e53b9812b92ba Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 9 Jan 2025 16:26:20 -0500 Subject: [PATCH 14/15] Fix list indentation Signed-off-by: Rafael Vasquez --- docs/source/deployment/k8s.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/deployment/k8s.md b/docs/source/deployment/k8s.md index 2bf7032ccc480..cbc95c20ff4b3 100644 --- a/docs/source/deployment/k8s.md +++ b/docs/source/deployment/k8s.md @@ -51,7 +51,7 @@ Before you begin, ensure that you have the following: Here are two examples for using NVIDIA GPU and AMD GPU. - - NVIDIA GPU + NVIDIA GPU: ```yaml apiVersion: apps/v1 @@ -123,7 +123,7 @@ Before you begin, ensure that you have the following: periodSeconds: 5 ``` - - AMD GPU + AMD GPU: You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. 
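The renumbering and re-indentation in the two patches above lean on the standard Markdown convention for multi-block list items: content that belongs to a numbered step has to be indented to the step's content column, otherwise the list ends and the next `1.` starts a fresh one. A minimal sketch of the pattern (illustrative text only, not taken from the files being patched):

```markdown
1. First step

   This paragraph is indented three spaces, so it renders inside step 1
   and the next item continues the numbering.

2. Second step
```

This is also why the fenced YAML and console blocks in `k8s.md` and `skypilot.md` gain a leading indent instead of staying flush left.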
From 7ff51ed9974bd35ffef065f3ac9a37327da9bbc7 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 10 Jan 2025 12:22:17 -0500 Subject: [PATCH 15/15] Move rule mods to pyproject.toml Signed-off-by: Rafael Vasquez --- pyproject.toml | 6 ++++++ tools/doc-lint.sh | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0ac3f39ef7a5f..82275ccafb572 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,3 +101,9 @@ markers = [ "skip_v1: do not run this test with v1", "optional: optional tests that are automatically skipped, include --optional to run them", ] + +[tool.pymarkdown] +plugins.md013.enabled = false # line-length +plugins.md041.enabled = false # first-line-h1 +plugins.md033.enabled = false # inline-html +plugins.md024.allow_different_nesting = true # no-duplicate-headers diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh index c34e401b8e715..19a55ddfa91c4 100755 --- a/tools/doc-lint.sh +++ b/tools/doc-lint.sh @@ -1,3 +1,3 @@ #!/bin/bash -pymarkdownlnt -d line-length,first-line-heading,no-inline-html,no-duplicate-header scan docs -r +pymarkdownlnt scan docs -r
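With the rule tweaks now carried by `pyproject.toml` instead of command-line flags, a local documentation lint reduces to the two commands this series leaves behind in `format.sh` and `tools/doc-lint.sh`. A minimal sketch, assuming `pymarkdownlnt` is already installed in the active environment and is invoked from the repository root so the `[tool.pymarkdown]` section is picked up:

```console
pymarkdownlnt version      # version check mirrored in format.sh
pymarkdownlnt scan docs -r # same invocation as tools/doc-lint.sh
```

The disabled rules (line-length, first-line-h1, inline-html) and the looser duplicate-heading check then apply wherever the tool is run from the repository root, without repeating the `-d` list in the script.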