From 1047e9eed0224a40660af5836613cc919324f8f1 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 25 Dec 2024 10:24:01 +0000 Subject: [PATCH 1/3] Improve github links Signed-off-by: DarkLight1337 --- docs/source/conf.py | 27 +++++++++++++++ .../contributing/dockerfile/dockerfile.md | 4 +-- docs/source/contributing/overview.md | 14 ++++---- .../contributing/profiling/profiling_index.md | 8 ++--- docs/source/design/arch_overview.md | 17 ++++------ docs/source/design/multiprocessing.md | 27 ++++++++------- docs/source/generate_examples.py | 3 +- .../getting_started/amd-installation.md | 4 +-- .../getting_started/cpu-installation.md | 4 +-- docs/source/getting_started/debugging.md | 7 ++-- .../getting_started/gaudi-installation.md | 6 ++-- docs/source/getting_started/installation.md | 2 +- docs/source/getting_started/quickstart.md | 7 ++-- .../getting_started/tpu-installation.md | 3 +- .../getting_started/xpu-installation.md | 2 +- docs/source/models/adding_model.md | 8 ++--- .../models/enabling_multimodal_inputs.md | 12 +++---- docs/source/models/generative_models.md | 6 ++-- docs/source/models/pooling_models.md | 6 ++-- docs/source/models/supported_models.md | 4 +-- docs/source/performance/benchmarks.md | 4 +-- .../source/quantization/supported_hardware.md | 2 +- docs/source/serving/deploying_with_docker.md | 2 +- docs/source/serving/distributed_serving.md | 4 +-- .../serving/openai_compatible_server.md | 23 ++++--------- docs/source/usage/compatibility_matrix.md | 34 +++++++++---------- docs/source/usage/lora.md | 3 +- docs/source/usage/multimodal_inputs.md | 24 ++++++------- docs/source/usage/spec_decode.md | 8 ++--- docs/source/usage/structured_outputs.md | 4 +-- docs/source/usage/usage_stats.md | 2 +- 31 files changed, 145 insertions(+), 136 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6f1d1842fe686..0f2c067c54ef0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -74,6 +74,33 @@ html_static_path = ["_static"] html_js_files = ["custom.js"] +myst_url_schemes = { + 'http': None, + 'https': None, + 'mailto': None, + 'ftp': None, + "gh-issue": { + "url": "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}", + "title": "Issue #{{path}}", + "classes": ["github"], + }, + "gh-pr": { + "url": "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}", + "title": "Pull Request #{{path}}", + "classes": ["github"], + }, + "gh-dir": { + "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}", + "title": "{{path}}", + "classes": ["github"], + }, + "gh-code": { + "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}", + "title": "{{path}}", + "classes": ["github"], + }, +} + # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE') if READTHEDOCS_VERSION_TYPE == "tag": diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index d72b99fe017b6..119d2aeaaaf6d 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,7 +1,7 @@ # Dockerfile -See [here](https://github.com/vllm-project/vllm/blob/main/Dockerfile) for the main Dockerfile to construct -the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](https://docs.vllm.ai/en/stable/serving/deploying_with_docker.html). 
+We provide a to construct the image for running an OpenAI compatible server with vLLM. +More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md). Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 53e8e78f08e72..6e09f0dabaa26 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -13,11 +13,12 @@ Finally, one of the most impactful ways to support us is by raising awareness ab ## License -See [LICENSE](https://github.com/vllm-project/vllm/tree/main/LICENSE). +See . ## Developing -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. +Check out the [building from source](#build-from-source) documentation for details. ## Testing @@ -43,7 +44,7 @@ Currently, the repository does not pass the `mypy` tests. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. ```{important} -If you discover a security vulnerability, please follow the instructions [here](https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability). +If you discover a security vulnerability, please follow the instructions [here](gh-code:SECURITY.md#reporting-a-vulnerability). ``` ## Pull Requests & Code Reviews @@ -54,9 +55,9 @@ code quality and improve the efficiency of the review process. ### DCO and Signed-off-by -When contributing changes to this project, you must agree to the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). +When contributing changes to this project, you must agree to the . Commits must include a `Signed-off-by:` header which certifies agreement with -the terms of the [DCO](https://github.com/vllm-project/vllm/tree/main/DCO). +the terms of the DCO. Using `-s` with `git commit` will automatically add this header. @@ -89,8 +90,7 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use [format.sh](https://github.com/vllm-project/vllm/blob/main/format.sh) to format your - code. +- Pass all linter checks. Please use to format your code. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. 
This diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 04e01da556231..93162730699c3 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -22,13 +22,13 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve `export VLLM_RPC_TIMEOUT=1800000` ``` -## Example commands and usage: +## Example commands and usage -### Offline Inference: +### Offline Inference -Refer to [examples/offline_inference_with_profiler.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py) for an example. +Refer to for an example. -### OpenAI Server: +### OpenAI Server ```bash VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 511bee20a91f4..6ddcad6bed95b 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -55,7 +55,7 @@ for output in outputs: More API details can be found in the {doc}`Offline Inference ` section of the API docs. -The code for the `LLM` class can be found in [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py). +The code for the `LLM` class can be found in . ### OpenAI-compatible API server @@ -66,7 +66,7 @@ This server can be started using the `vllm serve` command. vllm serve ``` -The code for the `vllm` CLI can be found in [vllm/scripts.py](https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py). +The code for the `vllm` CLI can be found in . Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: @@ -75,7 +75,7 @@ Sometimes you may see the API server entrypoint used directly instead of via the python -m vllm.entrypoints.openai.api_server --model ``` -That code can be found in [vllm/entrypoints/openai/api_server.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py). +That code can be found in . More details on the API server can be found in the {doc}`OpenAI Compatible Server ` document. @@ -105,7 +105,7 @@ processing. - **Output Processing**: Processes the outputs generated by the model, decoding the token IDs from a language model into human-readable text. -The code for `LLMEngine` can be found in [vllm/engine/llm_engine.py]. +The code for `LLMEngine` can be found in . ### AsyncLLMEngine @@ -115,10 +115,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it can handle multiple concurrent requests and stream outputs to clients. The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo -API server that serves as a simpler example in -[vllm/entrypoints/api_server.py]. +API server that serves as a simpler example in . -The code for `AsyncLLMEngine` can be found in [vllm/engine/async_llm_engine.py]. +The code for `AsyncLLMEngine` can be found in . ## Worker @@ -252,7 +251,3 @@ big problem. In summary, the complete config object `VllmConfig` can be treated as an engine-level global state that is shared among all vLLM classes. 
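To make the idea of an engine-level global state a bit more concrete, here is a minimal sketch of a component that pulls its settings from a shared `VllmConfig`. The keyword-only `(vllm_config, prefix)` constructor shown here is an assumption for illustration and may differ between vLLM versions.

```python
# Illustrative only: the constructor signature is an assumption about the
# convention described above, not a public API guarantee.
from vllm.config import VllmConfig


class MyCustomComponent:

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        # Every component reads from the same shared config object instead of
        # having many settings threaded through individual arguments.
        self.model_config = vllm_config.model_config
        self.cache_config = vllm_config.cache_config
```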
- -[vllm/engine/async_llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py -[vllm/engine/llm_engine.py]: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py -[vllm/entrypoints/api_server.py]: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index b58456ecc6da8..34564413b34f6 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -2,13 +2,14 @@ ## Debugging -Please see the [Debugging -Tips](https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing) +Please see the [Debugging Tips](#debugging-python-multiprocessing) page for information on known issues and how to solve them. ## Introduction -*Note that source code references are to the state of the code at the time of writing in December, 2024.* +```{important} +The source code references are to the state of the code at the time of writing in December, 2024. +``` The use of Python multiprocessing in vLLM is complicated by: @@ -20,7 +21,7 @@ This document describes how vLLM deals with these challenges. ## Multiprocessing Methods -[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include: +[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html.md#contexts-and-start-methods) include: - `spawn` - spawn a new Python process. This will be the default as of Python 3.14. @@ -82,7 +83,7 @@ There are other miscellaneous places hard-coding the use of `spawn`: Related PRs: -- +- ## Prior State in v1 @@ -96,7 +97,7 @@ engine core. - - -- https://github.com/vllm-project/vllm/blob/d05f88679bedd73939251a17c3d785a354b2946c/vllm/v1/engine/core_client.py#L44-L45 +- It was off by default for all the reasons mentioned above - compatibility with dependencies and code using vLLM as a library. @@ -119,17 +120,17 @@ instruct users to either add a `__main__` guard or to disable multiprocessing. If that known-failure case occurs, the user will see two messages that explain what is happening. First, a log message from vLLM: -``` - WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously - initialized. We must use the `spawn` multiprocessing start method. Setting - VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing - for more information. +```console +WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously + initialized. We must use the `spawn` multiprocessing start method. Setting + VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See + https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + for more information. ``` Second, Python itself will raise an exception with a nice explanation: -``` +```console RuntimeError: An attempt has been made to start a new process before the current process has finished its bootstrapping phase. 
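To make the failure mode above more concrete, the sketch below shows the kind of check the warning describes: once CUDA has been initialized in the parent process, `fork` is no longer safe and `spawn` has to be used. This is a simplified illustration, not vLLM's actual implementation; only `VLLM_WORKER_MULTIPROC_METHOD` is taken from the text above.

```python
# Simplified sketch of the check described in the warning above; the real
# vLLM logic differs and this is not a drop-in replacement for it.
import multiprocessing
import os

import torch


def choose_start_method() -> str:
    if torch.cuda.is_initialized():
        # Forking after CUDA has been initialized breaks the child process,
        # so `spawn` is the only safe option left at this point.
        return "spawn"
    return os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", "fork")


if __name__ == "__main__":
    ctx = multiprocessing.get_context(choose_start_method())
    # ... create worker processes from `ctx` ...
```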
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index 4c5a9d9c1da38..eb9201410a7e1 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -36,11 +36,10 @@ def generate_examples(): # Generate the example docs for each example script for script_path, doc_path in zip(script_paths, doc_paths): - script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}" # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: <{script_url}>.\n\n" + f"Source: .\n\n" f"```{{literalinclude}} {include_path}\n" ":language: python\n" ":linenos:\n```") diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md index b9ccbd7d6c7fc..5cf6d9a1d9f77 100644 --- a/docs/source/getting_started/amd-installation.md +++ b/docs/source/getting_started/amd-installation.md @@ -22,7 +22,7 @@ Installation options: You can build and install vLLM from source. -First, build a docker image from [Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) and launch a docker container from the image. +First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```console @@ -33,7 +33,7 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` -[Dockerfile.rocm](https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm) uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. + uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: - `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md index 4ab5437f091d5..b6f181ace6274 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/cpu-installation.md @@ -145,10 +145,10 @@ $ python examples/offline_inference.py - On CPU based setup with NUMA enabled, the memory access performance may be largely impacted by the [topology](https://github.com/intel/intel-extension-for-pytorch/blob/main/docs/tutorials/performance_tuning/tuning_guide.md#non-uniform-memory-access-numa). For NUMA architecture, two optimizations are to recommended: Tensor Parallel or Data Parallel. - - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](https://github.com/vllm-project/vllm/pull/6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. 
Below is the example script to enable Tensor Parallel = 2 for serving: + - Using Tensor Parallel for a latency constraints deployment: following GPU backend design, a Megatron-LM's parallel algorithm will be used to shard the model, based on the number of NUMA nodes (e.g. TP = 2 for a two NUMA node system). With [TP feature on CPU](gh-pr:6125) merged, Tensor Parallel is supported for serving and offline inferencing. In general each NUMA node is treated as one GPU card. Below is the example script to enable Tensor Parallel = 2 for serving: ```console $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md index 2f11c95ce0e77..b7e0184f62cae 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/debugging.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using [this example](https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html) . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
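As a minimal illustration of the tensor-parallel suggestion above for offline inference (the model name and GPU count are placeholders; pick values that match your hardware):

```python
# Placeholder model and GPU count; choose values that match your setup.
from vllm import LLM

llm = LLM(
    model="meta-llama/Meta-Llama-3-70B",
    tensor_parallel_size=4,  # shard the weights across 4 GPUs
)
```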
## Enable more logging @@ -139,6 +139,7 @@ A multi-node environment is more complicated than a single-node one. If you see Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. ``` +(debugging-python-multiprocessing)= ## Python multiprocessing ### `RuntimeError` Exception @@ -195,5 +196,5 @@ if __name__ == '__main__': ## Known Issues -- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](https://github.com/vllm-project/vllm/pull/6759). -- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](https://github.com/vllm-project/vllm/issues/5723#issuecomment-2554389656) . +- In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). +- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . 
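As a small sketch of that last point, an external process that wants to join a NCCL connection with vLLM would set the same variable before initializing its process group. The process-group bootstrap details below are placeholders.

```python
import os

# Match vLLM's setting so both sides see a consistent NCCL environment.
os.environ["NCCL_CUMEM_ENABLE"] = "0"

import torch.distributed as dist

# Assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are already set.
dist.init_process_group(backend="nccl")
```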
diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md index 170d7e49ba806..acf42f210dffb 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/gaudi-installation.md @@ -80,10 +80,8 @@ $ python setup.py develop ## Supported Features -- [Offline batched - inference](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#offline-batched-inference) -- Online inference via [OpenAI-Compatible - Server](https://docs.vllm.ai/en/latest/getting_started/quickstart.html#openai-compatible-server) +- [Offline batched inference](#offline-batched-inference) +- Online inference via [OpenAI-Compatible Server](#openai-compatible-server) - HPU autodetection - no need to manually select device within vLLM - Paged KV cache with algorithms enabled for Intel Gaudi accelerators - Custom Intel Gaudi implementations of Paged Attention, KV cache ops, diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md index 8ca634f966a06..996fb346f43d4 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation.md @@ -24,7 +24,7 @@ $ pip install vllm ``` ```{note} -Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See [this issue](https://github.com/vllm-project/vllm/issues/8420) for more details. +Although we recommend using `conda` to create and manage Python environments, it is highly recommended to use `pip` to install vLLM. This is because `pip` can install `torch` with separate library packages like `NCCL`, while `conda` installs `torch` with statically linked `NCCL`. This can cause issues when vLLM tries to use `NCCL`. See for more details. ``` ````{note} diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index e3508bce68c2d..0384e65693bc6 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -29,7 +29,7 @@ Please refer to the {ref}`installation documentation ` for more de ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference.py). +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -87,7 +87,8 @@ $ vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` ```{note} -By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](https://github.com/vllm-project/vllm/blob/main/docs/source/serving/openai_compatible_server.md#chat-template). +By default, the server uses a predefined chat template stored in the tokenizer. +You can learn about overriding it [here](#chat-template). ``` This server can be queried in the same format as OpenAI API. 
For example, to list the models: @@ -130,7 +131,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md index f4916460026d1..8b75a3ea0e2a8 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/tpu-installation.md @@ -154,8 +154,7 @@ For more information about using TPUs with GKE, see ## Build a docker image with {code}`Dockerfile.tpu` -You can use [Dockerfile.tpu](https://github.com/vllm-project/vllm/blob/main/Dockerfile.tpu) -to build a Docker image with TPU support. +You can use to build a Docker image with TPU support. ```console $ docker build -f Dockerfile.tpu -t vllm-tpu . diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md index 5c57509aef2db..3bfe5e6b4cf2d 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/xpu-installation.md @@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ $ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh). +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md index 3739873bb547b..858fcdf68436f 100644 --- a/docs/source/models/adding_model.md +++ b/docs/source/models/adding_model.md @@ -31,8 +31,8 @@ If you don't want to fork the repository and modify vLLM's codebase, please refe ## 1. Bring your model code -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the [vllm/model_executor/models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory. -For instance, vLLM's [OPT model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. +Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. +For instance, vLLM's [OPT model](gh-code:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. ```{warning} When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. @@ -99,7 +99,7 @@ Currently, vLLM supports the basic multi-head attention mechanism and its varian If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. ``` -For reference, check out the [LLAMA model](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llama.py). 
vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out the [vLLM models](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) directory for more examples. +For reference, check out our [Llama implementation](gh-code:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. ## 3. (Optional) Implement tensor parallelism and quantization support @@ -123,7 +123,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model -Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in [vllm/model_executor/models/registry.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/registry.py). +Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . ## 6. Out-of-Tree Model Integration diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md index 2f93eb826fb1e..fea297b296ed0 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -78,8 +78,8 @@ and register it via {meth}`INPUT_REGISTRY.register_dummy_data ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found in [examples/offline_inference_chat.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_chat.py). +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: @@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ## Online Inference -Our [OpenAI Compatible Server](../serving/openai_compatible_server) provides endpoints that correspond to the offline APIs: +Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. - [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 6d034f652d2ab..1105b4b9cfc00 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -65,7 +65,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found in [examples/offline_inference_embedding.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_embedding.py). +A code example can be found here: ### `LLM.classify` @@ -80,7 +80,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found in [examples/offline_inference_classification.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_classification.py). 
+A code example can be found here: ### `LLM.score` @@ -102,7 +102,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found in [examples/offline_inference_scoring.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_scoring.py). +A code example can be found here: ## Online Inference diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 545a2ccaa5634..099e6c8f02815 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -756,7 +756,7 @@ and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGenerati ```{note} The official {code}`openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork ({code}`HwwwH/MiniCPM-V-2`) for now. -For more details, please see: +For more details, please see: ``` ### Pooling Models @@ -834,5 +834,5 @@ We have the following levels of testing for models: 1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test. 2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test. -3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](https://github.com/vllm-project/vllm/tree/main/tests) and [examples](https://github.com/vllm-project/vllm/tree/main/examples) for the models that have passed this test. +3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:main/examples) for the models that have passed this test. 4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category. diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md index 50ef4a1f3b54d..4443430d4184d 100644 --- a/docs/source/performance/benchmarks.md +++ b/docs/source/performance/benchmarks.md @@ -15,7 +15,7 @@ The performance benchmarks are used for development to confirm whether new chang The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). -More information on the performance benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found [here](gh-code:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). (nightly-benchmarks)= @@ -25,4 +25,4 @@ These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lm The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). 
-More information on the nightly benchmarks and their parameters can be found [here](https://github.com/vllm-project/vllm/blob/main/.buildkite/nightly-benchmarks/nightly-descriptions.md). +More information on the nightly benchmarks and their parameters can be found [here](gh-code:.buildkite/nightly-benchmarks/nightly-descriptions.md). diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/quantization/supported_hardware.md index d2160772a24cb..843ee21627d78 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/quantization/supported_hardware.md @@ -129,4 +129,4 @@ The table below shows the compatibility of various quantization implementations Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. -For the most up-to-date information on hardware support and quantization methods, please check the [quantization directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization) or consult with the vLLM development team. +For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md index 2d8ceed8cecfd..d47066292cf64 100644 --- a/docs/source/serving/deploying_with_docker.md +++ b/docs/source/serving/deploying_with_docker.md @@ -25,7 +25,7 @@ memory to share data between processes under the hood, particularly for tensor p ## Building vLLM's Docker Image from Source -You can build and run vLLM from source via the provided [Dockerfile](https://github.com/vllm-project/vllm/blob/main/Dockerfile). To build vLLM: +You can build and run vLLM from source via the provided . To build vLLM: ```console $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index e0485d66c0a26..c321c99323f67 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided a helper [script](https://github.com/vllm-project/vllm/tree/main/examples/run_cluster.sh) to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. 
For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: @@ -95,7 +95,7 @@ $ --tensor-parallel-size 16 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. ```{warning} -After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](https://docs.vllm.ai/en/latest/getting_started/debugging.html) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See the [discussion](https://github.com/vllm-project/vllm/issues/6803) for more information. +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](../getting_started/debugging.md) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. ``` ```{warning} diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 597618cc5a215..2d4514e17e9fa 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -65,8 +65,7 @@ and all chat requests will error. vllm serve --chat-template ./path-to-chat-template.jinja ``` -vLLM community provides a set of chat templates for popular models. You can find them in the examples -directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) +vLLM community provides a set of chat templates for popular models. You can find them under the directory. With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. 
An example is provided below: @@ -184,9 +183,7 @@ The order of priorities is `command line > config file values > defaults`. Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -#### Code example - -See [examples/openai_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py). +Code example: #### Extra parameters @@ -217,9 +214,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -#### Code example - -See [examples/openai_chat_completion_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client.py). +Code example: #### Extra parameters @@ -252,9 +247,7 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -#### Code example - -See [examples/openai_embedding_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_embedding_client.py). +Code example: #### Extra parameters @@ -298,9 +291,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -#### Code example - -See [examples/openai_pooling_client.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_pooling_client.py). +Code example: (score-api)= ### Score API @@ -310,9 +301,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). -#### Code example - -See [examples/openai_cross_encoder_score.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_cross_encoder_score.py). 
+Code example: #### Single inference diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/usage/compatibility_matrix.md index 763b49dac4f8a..3cefa12ea8a1d 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/usage/compatibility_matrix.md @@ -82,7 +82,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - * - [LoRA](#lora-adapter) - - [✗](https://github.com/vllm-project/vllm/pull/9057) + - [✗](gh-pr:9057) - ✅ - - @@ -168,10 +168,10 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - * - enc-dec - ✗ - - [✗](https://github.com/vllm-project/vllm/issues/7366) + - [✗](gh-issue:7366) - ✗ - ✗ - - [✗](https://github.com/vllm-project/vllm/issues/7366) + - [✗](gh-issue:7366) - ✅ - ✅ - @@ -205,7 +205,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/pull/8199) + - [✗](gh-pr:8199) - ✅ - ✗ - ✅ @@ -244,7 +244,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✗ - ✗ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/8198) + - [✗](gh-issue:8198) - ✅ - - @@ -253,8 +253,8 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - * - mm - ✅ - - [✗](https://github.com/vllm-project/vllm/pull/8348) - - [✗](https://github.com/vllm-project/vllm/pull/7199) + - [✗](gh-pr:8348) + - [✗](gh-pr:7199) - ? - ? - ✅ @@ -273,14 +273,14 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/6137) + - [✗](gh-issue:6137) - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - [✗](https://github.com/vllm-project/vllm/issues/7968) + - [✗](gh-issue:7968) - ✅ - - @@ -290,14 +290,14 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/6137) + - [✗](gh-issue:6137) - ✅ - ✗ - ✅ - ✅ - ✅ - ? - - [✗](https://github.com/vllm-project/vllm/issues/7968>) + - [✗](gh-issue:7968>) - ? - ✅ - @@ -314,7 +314,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/9893) + - [✗](gh-issue:9893) - ? 
- ✅ - ✅ @@ -338,7 +338,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - CPU - AMD * - [CP](#chunked-prefill) - - [✗](https://github.com/vllm-project/vllm/issues/2729) + - [✗](gh-issue:2729) - ✅ - ✅ - ✅ @@ -346,7 +346,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ * - [APC](#apc) - - [✗](https://github.com/vllm-project/vllm/issues/3687) + - [✗](gh-issue:3687) - ✅ - ✅ - ✅ @@ -359,7 +359,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/pull/4830) + - [✗](gh-pr:4830) - ✅ * - prmpt adptr - ✅ @@ -367,7 +367,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/8475) + - [✗](gh-issue:8475) - ✅ * - [SD](#spec_decode) - ✅ @@ -439,7 +439,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - - [✗](https://github.com/vllm-project/vllm/issues/8477) + - [✗](gh-issue:8477) - ✅ * - best-of - ✅ diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md index e2ddde74aaa45..4ac3f788b91ee 100644 --- a/docs/source/usage/lora.md +++ b/docs/source/usage/lora.md @@ -47,8 +47,7 @@ outputs = llm.generate( ) ``` -Check out [examples/multilora_inference.py](https://github.com/vllm-project/vllm/blob/main/examples/multilora_inference.py) -for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md index b0c887398b1b7..ed789f0af8256 100644 --- a/docs/source/usage/multimodal_inputs.md +++ b/docs/source/usage/multimodal_inputs.md @@ -5,7 +5,7 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. ```{note} -We are actively iterating on multi-modal support. See [this RFC](https://github.com/vllm-project/vllm/issues/4194) for upcoming changes, +We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. ``` @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -A code example can be found in [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py). +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -A code example can be found in [examples/offline_inference_vision_language_multi_image.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py). +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary instead of using multi-image input. 
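As a rough sketch of that note, using the Qwen2-VL checkpoint mentioned above; the prompt string and frame shapes are placeholders, since the exact format is model-dependent.

```python
# Dummy frames and a placeholder prompt; consult the model's documentation for
# the real prompt format and expected frame shape.
import numpy as np
from vllm import LLM

llm = LLM(model="Qwen/Qwen2-VL-2B-Instruct")

frames = [np.zeros((224, 224, 3), dtype=np.uint8) for _ in range(16)]

outputs = llm.generate({
    "prompt": "...",  # insert your model's video placeholder/prompt here
    "multi_modal_data": {"video": frames},
})
```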
-Please refer to [examples/offline_inference_vision_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language.py) for more details. +Full example: ### Audio You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. -Please refer to [examples/offline_inference_audio_language.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_audio_language.py) for more details. +Full example: ### Embedding @@ -208,7 +208,7 @@ A chat template is **required** to use Chat Completions API. Although most models come with a chat template, for others you have to define one yourself. The chat template can be inferred based on the documentation on the model's HuggingFace repo. -For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja). +For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: ``` ### Image @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -A full code example can be found in [examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -296,7 +296,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT= Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. -You can use [these tests](https://github.com/vllm-project/vllm/blob/main/tests/entrypoints/openai/test_video.py) as reference. +You can use [these tests](gh-code:entrypoints/openai/test_video.py) as reference. ````{note} By default, the timeout for fetching videos through HTTP URL url is `30` seconds. @@ -399,7 +399,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -A full code example can be found in [examples/openai_chat_completion_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_client_for_multimodal.py). +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -435,7 +435,7 @@ Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to expl to run this model in embedding mode instead of text generation mode. The custom chat template is completely different from the original one for this model, -and can be found [here](https://github.com/vllm-project/vllm/blob/main/examples/template_vlm2vec.jinja). +and can be found here: ``` Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: @@ -475,7 +475,7 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ Like with VLM2Vec, we have to explicitly pass `--task embed`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled -by [this custom chat template](https://github.com/vllm-project/vllm/blob/main/examples/template_dse_qwen2_vl.jinja). 
+by a custom chat template: ``` ```{important} @@ -483,4 +483,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th example below for details. ``` -A full code example can be found in [examples/openai_chat_embedding_client_for_multimodal.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_embedding_client_for_multimodal.py). +Full example: diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md index 77e35c437de30..8302da81b6173 100644 --- a/docs/source/usage/spec_decode.md +++ b/docs/source/usage/spec_decode.md @@ -4,8 +4,8 @@ ```{warning} Please note that speculative decoding in vLLM is not yet optimized and does -not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work -to optimize it is ongoing and can be followed in [this issue.](https://github.com/vllm-project/vllm/issues/4630) +not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. +The work to optimize it is ongoing and can be followed here: ``` ```{warning} @@ -176,7 +176,7 @@ speculative decoding, breaking down the guarantees into three key areas: > distribution. [View Test Code](https://github.com/vllm-project/vllm/blob/47b65a550866c7ffbd076ecb74106714838ce7da/tests/samplers/test_rejection_sampler.py#L252) > - **Greedy Sampling Equality**: Confirms that greedy sampling with speculative decoding matches greedy sampling > without it. This verifies that vLLM's speculative decoding framework, when integrated with the vLLM forward pass and the vLLM rejection sampler, - > provides a lossless guarantee. Almost all of the tests in [this directory](https://github.com/vllm-project/vllm/tree/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e) + > provides a lossless guarantee. Almost all of the tests in . > verify this property using [this assertion implementation](https://github.com/vllm-project/vllm/blob/b67ae00cdbbe1a58ffc8ff170f0c8d79044a684a/tests/spec_decode/e2e/conftest.py#L291) 3. **vLLM Logprob Stability** @@ -202,4 +202,4 @@ For mitigation strategies, please refer to the FAQ entry *Can the output of a pr - [A Hacker's Guide to Speculative Decoding in vLLM](https://www.youtube.com/watch?v=9wNAgpX6z_4) - [What is Lookahead Scheduling in vLLM?](https://docs.google.com/document/d/1Z9TvqzzBPnh5WHcRwjvK2UEeFeq5zMZb5mFE8jR0HCs/edit#heading=h.1fjfb0donq5a) - [Information on batch expansion](https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit#heading=h.kk7dq05lc6q8) -- [Dynamic speculative decoding](https://github.com/vllm-project/vllm/issues/4565) +- [Dynamic speculative decoding](gh-issue:4565) diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 14dd387743aac..fd7d1529e0221 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -131,7 +131,7 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -The complete code of the examples can be found on [examples/openai_chat_completion_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/openai_chat_completion_structured_outputs.py). 
## Experimental Automatic Parsing (OpenAI API)

@@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

-A complete example with all options can be found in [examples/offline_inference_structured_outputs.py](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_structured_outputs.py).
+Full example: <gh-code:examples/offline_inference_structured_outputs.py>
diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md
index a7eb6144571a4..389ba69a60212 100644
--- a/docs/source/usage/usage_stats.md
+++ b/docs/source/usage/usage_stats.md
@@ -4,7 +4,7 @@ vLLM collects anonymous usage data by default to help the engineering team bette

## What data is collected?

-You can see the up to date list of data collected by vLLM in the [usage_lib.py](https://github.com/vllm-project/vllm/blob/main/vllm/usage/usage_lib.py).
+The list of data collected by the latest version of vLLM can be found here: <gh-code:vllm/usage/usage_lib.py>

Here is an example as of v0.4.0:

From 277fdbff6c53ad0b748562f656b916cc91066ffc Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 25 Dec 2024 10:35:20 +0000
Subject: [PATCH 2/3] format

Signed-off-by: DarkLight1337
---
 docs/source/conf.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 0f2c067c54ef0..fa50a3ec863b8 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -80,12 +80,14 @@
    'mailto': None,
    'ftp': None,
    "gh-issue": {
-        "url": "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
+        "url":
+        "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
        "title": "Issue #{{path}}",
        "classes": ["github"],
    },
    "gh-pr": {
-        "url": "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
+        "url":
+        "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
        "title": "Pull Request #{{path}}",
        "classes": ["github"],
    },

From c3cc97056bddca59ade7d5052a1eea1b0bb51804 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Wed, 25 Dec 2024 15:12:40 +0000
Subject: [PATCH 3/3] Rename

Signed-off-by: DarkLight1337
---
 docs/source/conf.py | 2 +-
 .../contributing/dockerfile/dockerfile.md | 2 +-
 docs/source/contributing/overview.md | 8 +++----
 .../contributing/profiling/profiling_index.md | 2 +-
 docs/source/design/arch_overview.md | 12 +++++-----
 docs/source/generate_examples.py | 2 +-
 .../getting_started/amd-installation.md | 4 ++--
 docs/source/getting_started/debugging.md | 2 +-
 docs/source/getting_started/quickstart.md | 4 ++--
 .../getting_started/tpu-installation.md | 2 +-
 .../getting_started/xpu-installation.md | 2 +-
 docs/source/models/adding_model.md | 6 ++---
 .../models/enabling_multimodal_inputs.md | 12 +++++-----
 docs/source/models/generative_models.md | 4 ++--
 docs/source/models/pooling_models.md | 6 ++---
 docs/source/performance/benchmarks.md | 4 ++--
 docs/source/serving/deploying_with_docker.md | 2 +-
 docs/source/serving/distributed_serving.md | 2 +-
 .../serving/openai_compatible_server.md | 10 ++++-----
 docs/source/usage/lora.md | 2 +-
 docs/source/usage/multimodal_inputs.md | 22 +++++++++----------
 docs/source/usage/structured_outputs.md | 4 ++--
 docs/source/usage/usage_stats.md | 2 +-
 23 files changed, 59 insertions(+), 59 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index fa50a3ec863b8..1fe0474631140 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -96,7 +96,7 @@
        "title": "{{path}}",
        "classes": ["github"],
    },
-    "gh-code": {
+    "gh-file": {
        "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
        "title": "{{path}}",
        "classes":
["github"], diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 119d2aeaaaf6d..6535414a7dca4 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -1,6 +1,6 @@ # Dockerfile -We provide a to construct the image for running an OpenAI compatible server with vLLM. +We provide a to construct the image for running an OpenAI compatible server with vLLM. More information about deploying with Docker can be found [here](../../serving/deploying_with_docker.md). Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes: diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 6e09f0dabaa26..9dac41cff0bcb 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -13,7 +13,7 @@ Finally, one of the most impactful ways to support us is by raising awareness ab ## License -See . +See . ## Developing @@ -44,7 +44,7 @@ Currently, the repository does not pass the `mypy` tests. If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. ```{important} -If you discover a security vulnerability, please follow the instructions [here](gh-code:SECURITY.md#reporting-a-vulnerability). +If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). ``` ## Pull Requests & Code Reviews @@ -55,7 +55,7 @@ code quality and improve the efficiency of the review process. ### DCO and Signed-off-by -When contributing changes to this project, you must agree to the . +When contributing changes to this project, you must agree to the . Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO. @@ -90,7 +90,7 @@ If the PR spans more than one category, please include all relevant prefixes. The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use to format your code. +- Pass all linter checks. Please use to format your code. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 93162730699c3..46210957c19ec 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -26,7 +26,7 @@ Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the serve ### Offline Inference -Refer to for an example. +Refer to for an example. 
### OpenAI Server diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 6ddcad6bed95b..475a3e5fa9ddc 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -55,7 +55,7 @@ for output in outputs: More API details can be found in the {doc}`Offline Inference ` section of the API docs. -The code for the `LLM` class can be found in . +The code for the `LLM` class can be found in . ### OpenAI-compatible API server @@ -66,7 +66,7 @@ This server can be started using the `vllm serve` command. vllm serve ``` -The code for the `vllm` CLI can be found in . +The code for the `vllm` CLI can be found in . Sometimes you may see the API server entrypoint used directly instead of via the `vllm` CLI command. For example: @@ -75,7 +75,7 @@ Sometimes you may see the API server entrypoint used directly instead of via the python -m vllm.entrypoints.openai.api_server --model ``` -That code can be found in . +That code can be found in . More details on the API server can be found in the {doc}`OpenAI Compatible Server ` document. @@ -105,7 +105,7 @@ processing. - **Output Processing**: Processes the outputs generated by the model, decoding the token IDs from a language model into human-readable text. -The code for `LLMEngine` can be found in . +The code for `LLMEngine` can be found in . ### AsyncLLMEngine @@ -115,9 +115,9 @@ incoming requests. The `AsyncLLMEngine` is designed for online serving, where it can handle multiple concurrent requests and stream outputs to clients. The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo -API server that serves as a simpler example in . +API server that serves as a simpler example in . -The code for `AsyncLLMEngine` can be found in . +The code for `AsyncLLMEngine` can be found in . ## Worker diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index eb9201410a7e1..aef32f7559f74 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -39,7 +39,7 @@ def generate_examples(): # Make script_path relative to doc_path and call it include_path include_path = '../../../..' / script_path.relative_to(root_dir) content = (f"{generate_title(doc_path.stem)}\n\n" - f"Source: .\n\n" + f"Source: .\n\n" f"```{{literalinclude}} {include_path}\n" ":language: python\n" ":linenos:\n```") diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/amd-installation.md index 5cf6d9a1d9f77..6d01efbbf8828 100644 --- a/docs/source/getting_started/amd-installation.md +++ b/docs/source/getting_started/amd-installation.md @@ -22,7 +22,7 @@ Installation options: You can build and install vLLM from source. -First, build a docker image from and launch a docker container from the image. +First, build a docker image from and launch a docker container from the image. It is important that the user kicks off the docker build using buildkit. Either the user put DOCKER_BUILDKIT=1 as environment variable when calling docker build command, or the user needs to setup buildkit in the docker daemon configuration /etc/docker/daemon.json as follows and restart the daemon: ```console @@ -33,7 +33,7 @@ It is important that the user kicks off the docker build using buildkit. Either } ``` - uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. + uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. 
It provides flexibility to customize the build of docker image using the following arguments: - `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md index b7e0184f62cae..3b0029f2e88ce 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/debugging.md @@ -24,7 +24,7 @@ To isolate the model downloading and loading issue, you can use the `--load-form ## Model is too large -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 0384e65693bc6..165e5df146dcd 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -29,7 +29,7 @@ Please refer to the {ref}`installation documentation ` for more de ## Offline Batched Inference -With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`: @@ -131,7 +131,7 @@ completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", print("Completion result:", completion) ``` -A more detailed client example can be found here: +A more detailed client example can be found here: ### OpenAI Chat Completions API with vLLM diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md index 8b75a3ea0e2a8..f2a949e7247d8 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/tpu-installation.md @@ -154,7 +154,7 @@ For more information about using TPUs with GKE, see ## Build a docker image with {code}`Dockerfile.tpu` -You can use to build a Docker image with TPU support. +You can use to build a Docker image with TPU support. ```console $ docker build -f Dockerfile.tpu -t vllm-tpu . 
diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/xpu-installation.md index 3bfe5e6b4cf2d..9554ae4b7fb44 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/xpu-installation.md @@ -71,4 +71,4 @@ $ --pipeline-parallel-size=2 \ $ -tp=8 ``` -By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. +By default, a ray instance will be launched automatically if no existing one is detected in system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the helper script. diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md index 858fcdf68436f..02537fba020c4 100644 --- a/docs/source/models/adding_model.md +++ b/docs/source/models/adding_model.md @@ -32,7 +32,7 @@ If you don't want to fork the repository and modify vLLM's codebase, please refe ## 1. Bring your model code Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. -For instance, vLLM's [OPT model](gh-code:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. +For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. ```{warning} When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. @@ -99,7 +99,7 @@ Currently, vLLM supports the basic multi-head attention mechanism and its varian If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. ``` -For reference, check out our [Llama implementation](gh-code:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. +For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. ## 3. (Optional) Implement tensor parallelism and quantization support @@ -123,7 +123,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model -Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . +Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . ## 6. 
Out-of-Tree Model Integration diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/models/enabling_multimodal_inputs.md index fea297b296ed0..fdd770887900e 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/models/enabling_multimodal_inputs.md @@ -78,8 +78,8 @@ and register it via {meth}`INPUT_REGISTRY.register_dummy_data +A code example can be found here: ### `LLM.beam_search` @@ -103,7 +103,7 @@ for output in outputs: print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -A code example can be found here: +A code example can be found here: If the model doesn't have a chat template or you want to specify another one, you can explicitly pass a chat template: diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 1105b4b9cfc00..76c96c9edcc5d 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -65,7 +65,7 @@ embeds = output.outputs.embedding print(f"Embeddings: {embeds!r} (size={len(embeds)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.classify` @@ -80,7 +80,7 @@ probs = output.outputs.probs print(f"Class Probabilities: {probs!r} (size={len(probs)})") ``` -A code example can be found here: +A code example can be found here: ### `LLM.score` @@ -102,7 +102,7 @@ score = output.outputs.score print(f"Score: {score}") ``` -A code example can be found here: +A code example can be found here: ## Online Inference diff --git a/docs/source/performance/benchmarks.md b/docs/source/performance/benchmarks.md index 4443430d4184d..39dc470a1c708 100644 --- a/docs/source/performance/benchmarks.md +++ b/docs/source/performance/benchmarks.md @@ -15,7 +15,7 @@ The performance benchmarks are used for development to confirm whether new chang The latest performance results are hosted on the public [vLLM Performance Dashboard](https://perf.vllm.ai). -More information on the performance benchmarks and their parameters can be found [here](gh-code:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). +More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md). (nightly-benchmarks)= @@ -25,4 +25,4 @@ These compare vLLM's performance against alternatives (`tgi`, `trt-llm`, and `lm The latest nightly benchmark results are shared in major release blog posts such as [vLLM v0.6.0](https://blog.vllm.ai/2024/09/05/perf-update.html). -More information on the nightly benchmarks and their parameters can be found [here](gh-code:.buildkite/nightly-benchmarks/nightly-descriptions.md). +More information on the nightly benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/nightly-descriptions.md). diff --git a/docs/source/serving/deploying_with_docker.md b/docs/source/serving/deploying_with_docker.md index d47066292cf64..844bd27800c7a 100644 --- a/docs/source/serving/deploying_with_docker.md +++ b/docs/source/serving/deploying_with_docker.md @@ -25,7 +25,7 @@ memory to share data between processes under the hood, particularly for tensor p ## Building vLLM's Docker Image from Source -You can build and run vLLM from source via the provided . To build vLLM: +You can build and run vLLM from source via the provided . 
To build vLLM: ```console $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2 diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index c321c99323f67..c0a4b23f6dc70 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -51,7 +51,7 @@ $ --pipeline-parallel-size 2 If a single node does not have enough GPUs to hold the model, you can run the model using multiple nodes. It is important to make sure the execution environment is the same on all nodes, including the model path, the Python environment. The recommended way is to use docker images to ensure the same environment, and hide the heterogeneity of the host machines via mapping them into the same docker configuration. -The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. +The first step, is to start containers and organize them into a cluster. We have provided the helper script to start the cluster. Please note, this script launches docker without administrative privileges that would be required to access GPU performance counters when running profiling and tracing tools. For that purpose, the script can have `CAP_SYS_ADMIN` to the docker container by using the `--cap-add` option in the docker run command. Pick a node as the head node, and run the following command: diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 2d4514e17e9fa..23c66f72162d2 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -183,7 +183,7 @@ The order of priorities is `command line > config file values > defaults`. Our Completions API is compatible with [OpenAI's Completions API](https://platform.openai.com/docs/api-reference/completions); you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it. -Code example: +Code example: #### Extra parameters @@ -214,7 +214,7 @@ We support both [Vision](https://platform.openai.com/docs/guides/vision)- and see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. - *Note: `image_url.detail` parameter is not supported.* -Code example: +Code example: #### Extra parameters @@ -247,7 +247,7 @@ which will be treated as a single prompt to the model. This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. ``` -Code example: +Code example: #### Extra parameters @@ -291,7 +291,7 @@ Our Pooling API encodes input prompts using a [pooling model](../models/pooling_ The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats. -Code example: +Code example: (score-api)= ### Score API @@ -301,7 +301,7 @@ Usually, the score for a sentence pair refers to the similarity between two sent You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). 
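For a concrete picture of the Score API described above, here is a minimal, hedged sketch using `requests`. The endpoint path, the `text_1`/`text_2` field names, and the cross-encoder model are assumptions based on the surrounding docs rather than values taken from this patch, and they may differ across vLLM versions.

```python
# Hedged sketch: query the Score API of a running vLLM server.
# Assumes the server was started with a cross-encoder model, e.g.
#   vllm serve BAAI/bge-reranker-v2-m3   (placeholder model)
import requests

response = requests.post(
    "http://localhost:8000/score",
    json={
        "model": "BAAI/bge-reranker-v2-m3",
        "text_1": "What is the capital of France?",
        "text_2": "The capital of France is Paris.",
    },
)
response.raise_for_status()
print(response.json())  # contains a relevance score for the sentence pair
```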
-Code example: +Code example: #### Single inference diff --git a/docs/source/usage/lora.md b/docs/source/usage/lora.md index 4ac3f788b91ee..cf06916d70f44 100644 --- a/docs/source/usage/lora.md +++ b/docs/source/usage/lora.md @@ -47,7 +47,7 @@ outputs = llm.generate( ) ``` -Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. +Check out for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. ## Serving LoRA Adapters diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/usage/multimodal_inputs.md index ed789f0af8256..82a3f3b8909a1 100644 --- a/docs/source/usage/multimodal_inputs.md +++ b/docs/source/usage/multimodal_inputs.md @@ -60,7 +60,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: To substitute multiple images inside the same text prompt, you can pass in a list of images instead: @@ -91,7 +91,7 @@ for o in outputs: print(generated_text) ``` -Full example: +Full example: Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -125,13 +125,13 @@ for o in outputs: You can pass a list of NumPy arrays directly to the {code}`'video'` field of the multi-modal dictionary instead of using multi-image input. -Full example: +Full example: ### Audio You can pass a tuple {code}`(array, sampling_rate)` to the {code}`'audio'` field of the multi-modal dictionary. -Full example: +Full example: ### Embedding @@ -208,7 +208,7 @@ A chat template is **required** to use Chat Completions API. Although most models come with a chat template, for others you have to define one yourself. The chat template can be inferred based on the documentation on the model's HuggingFace repo. -For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: +For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: ``` ### Image @@ -271,7 +271,7 @@ chat_response = client.chat.completions.create( print("Chat completion output:", chat_response.choices[0].message.content) ``` -Full example: +Full example: ```{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, @@ -296,7 +296,7 @@ $ export VLLM_IMAGE_FETCH_TIMEOUT= Instead of {code}`image_url`, you can pass a video file via {code}`video_url`. -You can use [these tests](gh-code:entrypoints/openai/test_video.py) as reference. +You can use [these tests](gh-file:entrypoints/openai/test_video.py) as reference. ````{note} By default, the timeout for fetching videos through HTTP URL url is `30` seconds. @@ -399,7 +399,7 @@ result = chat_completion_from_url.choices[0].message.content print("Chat completion output from audio url:", result) ``` -Full example: +Full example: ````{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. @@ -435,7 +435,7 @@ Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to expl to run this model in embedding mode instead of text generation mode. 
The custom chat template is completely different from the original one for this model,
-and can be found here: <gh-code:examples/template_vlm2vec.jinja>
+and can be found here: <gh-file:examples/template_vlm2vec.jinja>
```

Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
@@ -475,7 +475,7 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \

Like with VLM2Vec, we have to explicitly pass `--task embed`.

Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
-by a custom chat template: <gh-code:examples/template_dse_qwen2_vl.jinja>
+by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
```

```{important}
@@ -483,4 +483,4 @@ Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of th
example below for details.
```

-Full example: <gh-code:examples/openai_chat_embedding_client_for_multimodal.py>
+Full example: <gh-file:examples/openai_chat_embedding_client_for_multimodal.py>
diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md
index fd7d1529e0221..3f5d9ffc26278 100644
--- a/docs/source/usage/structured_outputs.md
+++ b/docs/source/usage/structured_outputs.md
@@ -131,7 +131,7 @@ completion = client.chat.completions.create(
print(completion.choices[0].message.content)
```

-Full example: <gh-code:examples/openai_chat_completion_structured_outputs.py>
+Full example: <gh-file:examples/openai_chat_completion_structured_outputs.py>

## Experimental Automatic Parsing (OpenAI API)

@@ -257,4 +257,4 @@ outputs = llm.generate(
print(outputs[0].outputs[0].text)
```

-Full example: <gh-code:examples/offline_inference_structured_outputs.py>
+Full example: <gh-file:examples/offline_inference_structured_outputs.py>
diff --git a/docs/source/usage/usage_stats.md b/docs/source/usage/usage_stats.md
index 389ba69a60212..3d02fbab9216e 100644
--- a/docs/source/usage/usage_stats.md
+++ b/docs/source/usage/usage_stats.md
@@ -4,7 +4,7 @@ vLLM collects anonymous usage data by default to help the engineering team bette

## What data is collected?

-The list of data collected by the latest version of vLLM can be found here: <gh-code:vllm/usage/usage_lib.py>
+The list of data collected by the latest version of vLLM can be found here: <gh-file:vllm/usage/usage_lib.py>

Here is an example as of v0.4.0: