From d2cbe4267f30c64bc728a249ce354af03b380e51 Mon Sep 17 00:00:00 2001
From: drikster80 <ed.sealing@gmail.com>
Date: Wed, 20 Nov 2024 14:00:01 +0000
Subject: [PATCH 01/25] Update Docker for aarch64 builds

Signed-off-by: drikster80 <ed.sealing@gmail.com>
---
 Dockerfile                  | 82 ++++++++++++++++++++++++++++++++++---
 requirements-build.txt      |  2 +-
 requirements-cuda-arm64.txt |  3 ++
 requirements-cuda.txt       |  4 +-
 4 files changed, 83 insertions(+), 8 deletions(-)
 create mode 100644 requirements-cuda-arm64.txt

diff --git a/Dockerfile b/Dockerfile
index 220dbe26712ec..b82e61c4f1cbb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,13 +11,14 @@ ARG CUDA_VERSION=12.4.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
+ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo \
+    && apt-get install -y ccache software-properties-common git curl sudo kmod \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@@ -46,9 +47,14 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
+ARG TARGETPLATFORM
 
 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
@@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
+
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@@ -113,6 +125,53 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        apt-get update && apt-get install zlib1g-dev && \
+        python3 -m pip install packaging pybind11 && \
+        git clone https://github.com/openai/triton && \
+        cd triton/python && \
+        git submodule update --init --recursive && \
+        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation git+https://github.com/vllm-project/flash-attention.git ; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+    CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=FALSE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/Dao-AILab/causal-conv1d.git ; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+    MAMBA_FORCE_BUILD=TRUE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/state-spaces/mamba.git ; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        apt-get update && apt-get install -y cuda-toolkit-12-4 && \
+        git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
+        cd flashinfer/python && \
+        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,source=.git,target=.git \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        git clone -b 0.44.1 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
+        cd bitsandbytes && \
+        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
+    fi
+
+
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # Default max size of the wheel is 250MB
@@ -124,6 +183,7 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
     else \
         echo "Skipping wheel size check."; \
     fi
+
 #################### EXTENSION Build IMAGE ####################
 
 #################### DEV IMAGE ####################
@@ -143,6 +203,9 @@ ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -168,14 +231,23 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
-# install vllm wheel first, so that torch etc will be installed
+# Install vllm wheel first, so that torch etc will be installed.
+# On Arm64 platforms, all newly compiled wheels will also be installed (flashinfer, triton, mamba, etc.)
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    . /etc/environment && \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip uninstall -y torch && \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+. /etc/environment && \
+if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+fi
 COPY examples examples
 #################### vLLM installation IMAGE ####################
 
@@ -214,7 +286,7 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' timm==0.9.10
+    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'
 
 ENV VLLM_USAGE_SOURCE production-docker-image
 
diff --git a/requirements-build.txt b/requirements-build.txt
index fec01caaf25ef..388b193403e88 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1
+torch==2.5.1; platform_machine != 'aarch64'
 wheel
 jinja2
diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt
new file mode 100644
index 0000000000000..a8baf1dedb5a8
--- /dev/null
+++ b/requirements-cuda-arm64.txt
@@ -0,0 +1,3 @@
+--index-url https://download.pytorch.org/whl/nightly/cu124
+torchvision; platform_machine == 'aarch64'
+torch; platform_machine == 'aarch64'
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 058ab7c1ee9df..5d4dee8c7129a 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.1
+torch == 2.5.1; platform_machine != 'aarch64'
 # These must be updated alongside torch
-torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1

From 29ed3585d96e3501bebfc693247728bc792e1147 Mon Sep 17 00:00:00 2001
From: drikster80 <ed.sealing@gmail.com>
Date: Wed, 20 Nov 2024 18:55:24 +0000
Subject: [PATCH 02/25] Update docs for arm64 docker builds & GH200 example

Signed-off-by: drikster80 <ed.sealing@gmail.com>
---
 docs/source/serving/deploying_with_docker.rst | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 14d94b09e9b9c..3118e19daf118 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -37,6 +37,32 @@ You can build and run vLLM from source via the provided `Dockerfile <https://git
         current GPU type the machine is running on, you can add the argument ``--build-arg torch_cuda_arch_list=""``
         for vLLM to find the current GPU type and build for that.
 
+Building for Arm64/aarch64
+--------------------------
+
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
+of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+
+.. note::
+
+        Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
+        flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
+        Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+
+.. code-block:: console
+
+    # Example of building on Nvidia GH200 server. (Memory usage: ~180GB, Build time: ~2387s / ~40 min)
+    $ DOCKER_BUILDKIT=1 sudo docker build . \
+      --target vllm-openai \
+      -platform "linux/arm64" \
+      -t drikster80/vllm-gh200-openai:v0.6.4.post1 \
+      --build-arg max_jobs=66 \
+      --build-arg nvcc_threads=2 \
+      --build-arg torch_cuda_arch_list="9.0+PTX" \
+      --build-arg vllm_fa_cmake_gpu_arches="90-real"
+
+
+
 
 To run vLLM:
 

From f2635a41d12d0274ec78b9fa6dcedd4424806baa Mon Sep 17 00:00:00 2001
From: drikster80 <ed.sealing@gmail.com>
Date: Wed, 20 Nov 2024 21:48:37 +0000
Subject: [PATCH 03/25] Fix FLASHINFER not installing by default on x86_64

Signed-off-by: drikster80 <ed.sealing@gmail.com>
---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index b82e61c4f1cbb..c87288b0605c2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -245,7 +245,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
-if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
     python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
 fi
 COPY examples examples

From ccef455e5e57379ac09be1b106ef3a462ee9fb2e Mon Sep 17 00:00:00 2001
From: drikster80 <ed.sealing@gmail.com>
Date: Wed, 4 Dec 2024 16:31:06 +0000
Subject: [PATCH 04/25] Removed build of flash-attn. Added libnccl2

---
 Dockerfile | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c87288b0605c2..764c6c0312979 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -136,12 +136,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation git+https://github.com/vllm-project/flash-attention.git ; \
-    fi
-
 RUN --mount=type=cache,target=/root/.cache/pip \
     --mount=type=bind,source=.git,target=.git \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
@@ -214,7 +208,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip libnccl2\
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \

From 8b3c0b34608d00b80686388d010584b4c82308d8 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Sun, 15 Dec 2024 21:24:52 +0800
Subject: [PATCH 05/25] simplified Dockerfile build process for ARM64 systems

---
 Dockerfile | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c1b6e1bbfe354..3291a023222b5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,7 @@ ARG CUDA_VERSION=12.4.1
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
+ARG TARGETPLATFORM
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
@@ -46,9 +47,14 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
@@ -63,6 +69,7 @@ ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}
 
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
+ARG TARGETPLATFORM
 
 # install build dependencies
 COPY requirements-build.txt requirements-build.txt
@@ -70,6 +77,11 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
+
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@@ -134,8 +146,8 @@ COPY requirements-test.txt requirements-test.txt
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-dev.txt
-
 #################### DEV IMAGE ####################
+
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
@@ -143,6 +155,9 @@ ARG CUDA_VERSION=12.4.1
 ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
+ARG TARGETPLATFORM
+
+COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -168,18 +183,25 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
-# install vllm wheel first, so that torch etc will be installed
+# Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
 RUN --mount=type=cache,target=/root/.cache/pip \
-    . /etc/environment && \
-    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip uninstall -y torch && \
+        python3 -m pip install -r requirements-cuda-arm64.txt; \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+. /etc/environment && \
+if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
+    python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.6/flashinfer-0.1.6+cu121torch2.4-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
+fi
 COPY examples examples
 #################### vLLM installation IMAGE ####################
 
-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -209,7 +231,6 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 RUN mkdir test_docs
 RUN mv docs test_docs/
 RUN mv vllm test_docs/
-
 #################### TEST IMAGE ####################
 
 #################### OPENAI API SERVER ####################
@@ -218,8 +239,11 @@ FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' timm==0.9.10
-
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
+    else \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'; \
+    fi
 ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

From 69bb56707cadde4b78cf396e3c2779e5375ed2f9 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Sun, 15 Dec 2024 21:27:05 +0800
Subject: [PATCH 06/25] bitsandbytes>=0.45.0

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 3291a023222b5..123703848749c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -242,7 +242,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10'; \
     else \
-        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.44.0' 'timm==0.9.10'; \
+        pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
     fi
 ENV VLLM_USAGE_SOURCE production-docker-image
 

From d5d2911a529db9475af10fd85ba50a90d1d5d878 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Mon, 16 Dec 2024 11:45:04 +0800
Subject: [PATCH 07/25] rm needless kmod/libnccl2 && rm needless build from
 source for triton/causal-conv1d/mamba/flashinfer/bitsandbytes

---
 Dockerfile | 46 ++--------------------------------------------
 1 file changed, 2 insertions(+), 44 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 489382b3d1eca..123703848749c 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo kmod \
+    && apt-get install -y ccache software-properties-common git curl sudo \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@@ -125,47 +125,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        apt-get update && apt-get install zlib1g-dev && \
-        python3 -m pip install packaging pybind11 && \
-        git clone https://github.com/openai/triton && \
-        cd triton/python && \
-        git submodule update --init --recursive && \
-        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
-    fi
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-    CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=FALSE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/Dao-AILab/causal-conv1d.git ; \
-    fi
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-    MAMBA_FORCE_BUILD=TRUE pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir git+https://github.com/state-spaces/mamba.git ; \
-    fi
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        apt-get update && apt-get install -y cuda-toolkit-12-4 && \
-        git clone -b v0.1.6 https://github.com/flashinfer-ai/flashinfer.git --recursive && \
-        cd flashinfer/python && \
-        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
-    fi
-
-RUN --mount=type=cache,target=/root/.cache/pip \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        git clone -b 0.44.1 https://github.com/bitsandbytes-foundation/bitsandbytes.git && \
-        cd bitsandbytes && \
-        pip --verbose wheel --use-pep517 --no-deps -w /workspace/dist --no-build-isolation --no-cache-dir . ; \
-    fi
-
-
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # Default max size of the wheel is 250MB
@@ -177,7 +136,6 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
     else \
         echo "Skipping wheel size check."; \
     fi
-
 #################### EXTENSION Build IMAGE ####################
 
 #################### DEV IMAGE ####################
@@ -208,7 +166,7 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
 RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
-    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip libnccl2\
+    && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
     && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \

From 075ab9db4e0ae8db4ea53c18a7fa729650b5b139 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Mon, 16 Dec 2024 15:27:47 +0800
Subject: [PATCH 08/25] update Memory usage && rename image && pinned version
 for torch torchvision

---
 docs/source/serving/deploying_with_docker.rst | 4 ++--
 requirements-cuda-arm64.txt                   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 3118e19daf118..11a9f12fd17cd 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -51,11 +51,11 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 .. code-block:: console
 
-    # Example of building on Nvidia GH200 server. (Memory usage: ~180GB, Build time: ~2387s / ~40 min)
+    # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \
       -platform "linux/arm64" \
-      -t drikster80/vllm-gh200-openai:v0.6.4.post1 \
+      -t vllm/vllm-gh200-openai:latest \
       --build-arg max_jobs=66 \
       --build-arg nvcc_threads=2 \
       --build-arg torch_cuda_arch_list="9.0+PTX" \
diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt
index a8baf1dedb5a8..bbcb5cb7012ce 100644
--- a/requirements-cuda-arm64.txt
+++ b/requirements-cuda-arm64.txt
@@ -1,3 +1,3 @@
 --index-url https://download.pytorch.org/whl/nightly/cu124
-torchvision; platform_machine == 'aarch64'
-torch; platform_machine == 'aarch64'
+torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
+torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'

From 34e714087d684fefb24e00ed385e78ddbe0437a6 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 19:44:37 +0800
Subject: [PATCH 09/25] rm requirements-cuda-arm64.txt && rm redundant torch 
 install/uninstall && rm arm64 platform condition in requirements && add
 python3 use_existing_torch.py before build in run-gh200-test.sh

---
 .buildkite/run-gh200-test.sh                  | 13 +++++++++++++
 Dockerfile                                    | 15 ++-------------
 docs/source/serving/deploying_with_docker.rst |  4 ++++
 requirements-build.txt                        |  2 +-
 requirements-cuda-arm64.txt                   |  3 ---
 requirements-cuda.txt                         |  4 ++--
 6 files changed, 22 insertions(+), 19 deletions(-)
 delete mode 100644 requirements-cuda-arm64.txt

diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index d06604f96f2b8..6cf071b02b23a 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -4,6 +4,19 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+# Download the python
+PYTHON_VERSION=3.12
+apt-get update -y \
+  && apt-get update -y \
+  && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
+  && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
+  && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
+  && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
+  && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
+  && python3 --version && python3 -m pip --version
+
+python3 use_existing_torch.py
+
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
   --target vllm-openai \
diff --git a/Dockerfile b/Dockerfile
index 123703848749c..2a8f714f41515 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,13 +47,13 @@ WORKDIR /workspace
 # install build and runtime dependencies
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \
+        && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215";  \
     fi
 
 # cuda arch list used by torch
@@ -77,11 +77,6 @@ COPY requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 COPY . .
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
@@ -188,12 +183,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        pip uninstall -y torch && \
-        python3 -m pip install -r requirements-cuda-arm64.txt; \
-    fi
-
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 11a9f12fd17cd..ccc3b9c92ca78 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -52,6 +52,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 .. code-block:: console
 
     # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
+    # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build.
+    $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124"
+    $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"
+    $ python3 use_existing_torch.py
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \
       -platform "linux/arm64" \
diff --git a/requirements-build.txt b/requirements-build.txt
index 388b193403e88..fec01caaf25ef 100644
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -4,6 +4,6 @@ ninja
 packaging
 setuptools>=61
 setuptools-scm>=8
-torch==2.5.1; platform_machine != 'aarch64'
+torch==2.5.1
 wheel
 jinja2
diff --git a/requirements-cuda-arm64.txt b/requirements-cuda-arm64.txt
deleted file mode 100644
index bbcb5cb7012ce..0000000000000
--- a/requirements-cuda-arm64.txt
+++ /dev/null
@@ -1,3 +0,0 @@
---index-url https://download.pytorch.org/whl/nightly/cu124
-torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
-torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 5d4dee8c7129a..058ab7c1ee9df 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,7 +4,7 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.1; platform_machine != 'aarch64'
+torch == 2.5.1
 # These must be updated alongside torch
-torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1

From c65bf72e6e10d2b968ad9ce332256bfe4f073495 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 19:51:24 +0800
Subject: [PATCH 10/25] fix docs --platform "linux/arm64"

---
 docs/source/serving/deploying_with_docker.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index ccc3b9c92ca78..ecbb0775f598e 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -58,7 +58,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
     $ python3 use_existing_torch.py
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \
-      -platform "linux/arm64" \
+      --platform "linux/arm64" \
       -t vllm/vllm-gh200-openai:latest \
       --build-arg max_jobs=66 \
       --build-arg nvcc_threads=2 \

From 6bddcd4e58aa34afc2d61f5c2e7bb622e93fa5e6 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 19:52:49 +0800
Subject: [PATCH 11/25] rm docs blanks

---
 docs/source/serving/deploying_with_docker.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index ecbb0775f598e..729cc07560049 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -65,9 +65,6 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
       --build-arg torch_cuda_arch_list="9.0+PTX" \
       --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
-
-
-
 To run vLLM:
 
 .. code-block:: console

From 6b80175de547f40a201a583c1bdf62b3aa7e06ea Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 19:58:31 +0800
Subject: [PATCH 12/25] add torch nightly build in run-gh200-test.sh

---
 .buildkite/run-gh200-test.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 6cf071b02b23a..ff8ae37742dfb 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -15,6 +15,11 @@ apt-get update -y \
   && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
   && python3 --version && python3 -m pip --version
 
+# Install the nightly version of torch and torchvision
+python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124"
+python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"
+
+# Skip the new torch installation during build since we are using the specified version
 python3 use_existing_torch.py
 
 # Try building the docker image

From 93b82d739f8ebb3abb1f3252bcd89dc305185b31 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 20:07:09 +0800
Subject: [PATCH 13/25] edit doc note && rm build-arg max_jobs nvcc_threads

---
 docs/source/serving/deploying_with_docker.rst | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 729cc07560049..03c79a18d7765 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -45,13 +45,11 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 .. note::
 
-        Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
-        flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
-        Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+        Multiple modules must be compiled, so this process can take a while.
 
 .. code-block:: console
 
-    # Example of building on Nvidia GH200 server. (Memory usage: ~12GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
+    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
     # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build.
     $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124"
     $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"
@@ -60,8 +58,6 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
       --target vllm-openai \
       --platform "linux/arm64" \
       -t vllm/vllm-gh200-openai:latest \
-      --build-arg max_jobs=66 \
-      --build-arg nvcc_threads=2 \
       --build-arg torch_cuda_arch_list="9.0+PTX" \
       --build-arg vllm_fa_cmake_gpu_arches="90-real"
 

From d0264892c3f3d4997827546a1cb8a5f8c3811a8a Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 21:30:00 +0800
Subject: [PATCH 14/25] add args max_jobs nvcc_threads to speed up && install
 pytorch first && outlines == 0.1.11 # Requires pytorch && rm torch install
 before docker build

---
 .buildkite/run-gh200-test.sh                  |  6 +-----
 Dockerfile                                    | 10 +++++-----
 docs/source/serving/deploying_with_docker.rst |  9 +++++----
 requirements-common.txt                       |  2 +-
 4 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index ff8ae37742dfb..2525656ec9839 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -15,11 +15,7 @@ apt-get update -y \
   && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
   && python3 --version && python3 -m pip --version
 
-# Install the nightly version of torch and torchvision
-python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124"
-python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"
-
-# Skip the new torch installation during build since we are using the specified version
+# Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
 python3 use_existing_torch.py
 
 # Try building the docker image
diff --git a/Dockerfile b/Dockerfile
index 2a8f714f41515..ccd6e5064f8cb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,16 +45,16 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -r requirements-cuda.txt
-
+# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \
         && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215";  \
     fi
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install -r requirements-cuda.txt
 
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 03c79a18d7765..574081b112a59 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -45,19 +45,20 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 .. note::
 
-        Multiple modules must be compiled, so this process can take a while.
+        Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
+        flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
+        Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
 
 .. code-block:: console
 
     # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
-    # Note: You should download the torch and torchvision wheels from the PyTorch nightly site, and run the use_existing_torch.py script to skip the download of other torch wheels during the build.
-    $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124"
-    $ python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215"
     $ python3 use_existing_torch.py
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \
       --platform "linux/arm64" \
       -t vllm/vllm-gh200-openai:latest \
+      --build-arg max_jobs=66 \
+      --build-arg nvcc_threads=2 \
       --build-arg torch_cuda_arch_list="9.0+PTX" \
       --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
diff --git a/requirements-common.txt b/requirements-common.txt
index 1c935303c8d79..c2dea20005a5d 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -19,7 +19,7 @@ pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11
+outlines == 0.1.11 # Requires pytorch
 lark == 1.2.2 
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10

From fb1759e3a3e1f77a41a0702602b70ff82ff282fd Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Wed, 18 Dec 2024 22:24:41 +0800
Subject: [PATCH 15/25] update vllm-gh200-openai Image size: 3.23GB

---
 docs/source/serving/deploying_with_docker.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 574081b112a59..7aceb49463449 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 .. code-block:: console
 
-    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 7.26GB)
+    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 3.23GB)
     $ python3 use_existing_torch.py
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \

From 62c8e29905ab4de046451514064beda6ad08197a Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Thu, 19 Dec 2024 13:41:19 +0800
Subject: [PATCH 16/25] add debug "python3 -m pip list | grep torch || true" &&
 compressed-tensors requires pytorch

---
 Dockerfile              | 39 ++++++++++++++++++++++++++++++++++-----
 requirements-common.txt |  2 +-
 2 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ccd6e5064f8cb..f348c19a1807e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,17 +45,26 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements
+# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
-        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" \
-        && python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torchvision==0.22.0.dev20241215";  \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
     fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
@@ -120,6 +129,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # Default max size of the wheel is 250MB
@@ -152,8 +164,6 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
-COPY requirements-cuda-arm64.txt requirements-cuda-arm64.txt
-
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
@@ -178,11 +188,23 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
+# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten
+RUN --mount=type=cache,target=/root/.cache/pip \
+    if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+        python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
+    fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
@@ -226,6 +248,9 @@ RUN mv vllm test_docs/
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
@@ -233,6 +258,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     else \
         pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
     fi
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip list | grep torch || true
+
 ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/requirements-common.txt b/requirements-common.txt
index c2dea20005a5d..e5f6c7e5dbd9d 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -34,5 +34,5 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
+compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
 depyf==0.18.0 # required for profiling and debugging torch.compile

From dd483d974c750e48d3407f4bc42b96ca016a797e Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Thu, 19 Dec 2024 14:36:38 +0800
Subject: [PATCH 17/25] rm "python3 -m pip list | grep torch || true" && add
 cloudpickle && update image size

---
 Dockerfile                                    | 28 ++-----------------
 docs/source/serving/deploying_with_docker.rst |  2 +-
 requirements-common.txt                       |  1 +
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f348c19a1807e..ed2d51d90e215 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,26 +45,17 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
+# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 COPY requirements-common.txt requirements-common.txt
 COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-cuda.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 # cuda arch list used by torch
 # can be useful for both `dev` and `test`
 # explicitly set the list to avoid issues with torch 2.2
@@ -129,9 +120,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
         python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 # Check the size of the wheel if RUN_WHEEL_CHECK is true
 COPY .buildkite/check-wheel-size.py check-wheel-size.py
 # Default max size of the wheel is 250MB
@@ -188,23 +176,17 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
-# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten
+# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten during the wheel installation
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 # Install vllm wheel first, so that torch etc will be installed.
 RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install dist/*.whl --verbose
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 RUN --mount=type=cache,target=/root/.cache/pip \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
@@ -248,9 +230,6 @@ RUN mv vllm test_docs/
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
@@ -259,9 +238,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
         pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10'; \
     fi
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip list | grep torch || true
-
 ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 7aceb49463449..93a7df39547e7 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 .. code-block:: console
 
-    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 3.23GB)
+    # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
     $ python3 use_existing_torch.py
     $ DOCKER_BUILDKIT=1 sudo docker build . \
       --target vllm-openai \
diff --git a/requirements-common.txt b/requirements-common.txt
index e5f6c7e5dbd9d..6ec7bed463caf 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -36,3 +36,4 @@ setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we n
 einops # Required for Qwen2-VL.
 compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
 depyf==0.18.0 # required for profiling and debugging torch.compile
+cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
\ No newline at end of file

From 022eaf990797db0f2c76605bd03083ad9ff31ae5 Mon Sep 17 00:00:00 2001
From: cenzhiyao <2523403608@qq.com>
Date: Thu, 19 Dec 2024 15:24:07 +0800
Subject: [PATCH 18/25] rm sudo in doc

---
 docs/source/serving/deploying_with_docker.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 93a7df39547e7..93cf2eb51ee85 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -53,7 +53,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
     # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
     $ python3 use_existing_torch.py
-    $ DOCKER_BUILDKIT=1 sudo docker build . \
+    $ DOCKER_BUILDKIT=1 docker build . \
       --target vllm-openai \
       --platform "linux/arm64" \
       -t vllm/vllm-gh200-openai:latest \

From c61c57967bfa04c0cf34b811ba3faf56f8a15de4 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 16:30:25 -0800
Subject: [PATCH 19/25] avoid installing python

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 .buildkite/run-gh200-test.sh | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/.buildkite/run-gh200-test.sh b/.buildkite/run-gh200-test.sh
index 2525656ec9839..4fc6d089cc666 100644
--- a/.buildkite/run-gh200-test.sh
+++ b/.buildkite/run-gh200-test.sh
@@ -4,17 +4,6 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
-# Download the python
-PYTHON_VERSION=3.12
-apt-get update -y \
-  && apt-get update -y \
-  && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
-  && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
-  && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
-  && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
-  && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
-  && python3 --version && python3 -m pip --version
-
 # Skip the new torch installation during build since we are using the specified version for arm64 in the Dockerfile
 python3 use_existing_torch.py
 

From 5c056e6f8492484b250becaf822e2a816ba24ed4 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 16:34:54 -0800
Subject: [PATCH 20/25] add comments

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 Dockerfile | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index ed2d51d90e215..0944050f7dfca 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,7 +45,11 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 WORKDIR /workspace
 
 # install build and runtime dependencies
-# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten by the requirements
+
+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \
@@ -176,7 +180,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
 
-# for arm64, we need to install torch and torchvision from the nightly builds first, so that it won't be overwritten during the wheel installation
+# arm64 (GH200) build follows the practice of "use existing pytorch" build,
+# we need to install torch and torchvision from the nightly builds first,
+# pytorch will not appear as a vLLM dependency in all of the following steps
+# after this step
 RUN --mount=type=cache,target=/root/.cache/pip \
     if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
         python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215";  \

From 383f799c695a201dfe17cbd73b51fc89dfc79f87 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 16:37:19 -0800
Subject: [PATCH 21/25] fix comments

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 requirements-common.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 6ec7bed463caf..3ac301f0a4a60 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -35,5 +35,5 @@ six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that need
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
 compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch
-depyf==0.18.0 # required for profiling and debugging torch.compile
-cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
\ No newline at end of file
+depyf==0.18.0 # required for profiling and debugging with compilation config
+cloudpickle # allows pickling lambda functions in model_executor/models/registry.py

From fccb98521ab58939e9398f56361396bf6451e782 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 16:59:55 -0800
Subject: [PATCH 22/25] doc change

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/serving/deploying_with_docker.rst | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 93cf2eb51ee85..b62b066bd6108 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -3,6 +3,9 @@
 Deploying with Docker
 ============================
 
+Use vLLM's Official Docker Image
+--------------------------------
+
 vLLM offers an official Docker image for deployment.
 The image can be used to run OpenAI compatible server and is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
 
@@ -24,12 +27,15 @@ The image can be used to run OpenAI compatible server and is available on Docker
         memory to share data between processes under the hood, particularly for tensor parallel inference.
 
 
+Building vLLM's Docker Image from Source
+----------------------------------------
+
 You can build and run vLLM from source via the provided `Dockerfile <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_. To build vLLM:
 
 .. code-block:: console
 
-    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-
+    $ # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai
 
 .. note::
 
@@ -62,7 +68,10 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
       --build-arg torch_cuda_arch_list="9.0+PTX" \
       --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
-To run vLLM:
+Use the custom-built vLLM Docker image
+--------------------------------------
+
+To run vLLM with the custom-built Docker image:
 
 .. code-block:: console
 
@@ -72,6 +81,8 @@ To run vLLM:
         --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
         vllm/vllm-openai <args...>
 
+The argument ``vllm/vllm-openai`` specifies the image to run, and should be replaced with the name of the custom-built image (the value passed to ``--tag`` in the build command).
+
 .. note::
 
-        **For `v0.4.1` and `v0.4.2` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .
+        **For ``v0.4.1``` and ``v0.4.2``` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .

From 67129a77c9de3536223660dc9569e5fa052a103f Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 17:05:13 -0800
Subject: [PATCH 23/25] fix docs

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/serving/deploying_with_docker.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index b62b066bd6108..ad89fdb5a5139 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -47,7 +47,7 @@ Building for Arm64/aarch64
 --------------------------
 
 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use
-of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+of PyTorch Nightly and should be considered **experimental**. Using the flag ``--platform "linux/arm64"`` will attempt to build for arm64.
 
 .. note::
 
@@ -85,4 +85,4 @@ The argument ``vllm/vllm-openai`` specifies the image to run, and should be repl
 
 .. note::
 
-        **For ``v0.4.1``` and ``v0.4.2``` only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`` .
+        **For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. ``/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1``, is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with the environment variable ``VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1``.

From f7f54f6347326ab3f861d347025d55d8ec83aaf3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 17:10:08 -0800
Subject: [PATCH 24/25] fix docs

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/serving/deploying_with_docker.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index ad89fdb5a5139..5d5ee4e3e02ce 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -51,7 +51,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag ``-
 
 .. note::
 
-        Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
+        Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=``
         flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
         Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
 

From 7dbe69e1e0f12df34d855c76ff6c170f8f29d5aa Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 19 Dec 2024 17:10:41 -0800
Subject: [PATCH 25/25] fix docs

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 docs/source/serving/deploying_with_docker.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 5d5ee4e3e02ce..b64eef819cd2e 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -52,7 +52,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag ``-
 .. note::
 
         Multiple modules must be compiled, so this process can take a while. Recommend using ``--build-arg max_jobs=`` & ``--build-arg nvcc_threads=``
-        flags to speed up build process. However, ensure your 'max_jobs' is substantially larger than 'nvcc_threads' to get the most benefits.
+        flags to speed up the build process. However, ensure that ``max_jobs`` is substantially larger than ``nvcc_threads`` to get the most benefit.
         Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
 
 .. code-block:: console